--- /dev/null 2018-09-25 19:25:10.000000000 +0300
+++ new/src/hotspot/cpu/aarch32/macroAssembler_aarch32.cpp 2018-09-25 19:25:10.000000000 +0300
@@ -0,0 +1,4941 @@
+/*
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2015, Linaro Ltd. All rights reserved.
+ * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "precompiled.hpp"
+#include "jvm.h"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "gc/shared/barrierSet.hpp"
+#include "gc/shared/cardTable.hpp"
+#include "gc/shared/barrierSetAssembler.hpp"
+#include "gc/shared/cardTableBarrierSet.hpp"
+#include "interpreter/interpreter.hpp"
+#include "compiler/disassembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_aarch32.hpp"
+#include "oops/accessDecorators.hpp"
+// This ifdef was introduced so that a core build can be built
+#ifdef COMPILER2
+#include "opto/compile.hpp"
+#include "opto/node.hpp"
+#endif
+
+#include "runtime/biasedLocking.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/interfaceSupport.inline.hpp"
+#include "runtime/jniHandles.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#define STOP(error) stop(error)
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#define STOP(error) block_comment(error); stop(error)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// FIXME This is not a nice fix, this constant was in a compiler2 header
+#define MAX_stubs_size_div2 (128 / 2)
+// FIXME END
+
+// Note the PC corrections in the following three routines.
+// All literal modes that use the PC need to have the offset adjusted.
+// Patch any kind of instruction; there may be several instructions.
+// Return the total length (in bytes) of the instructions.
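+// A worked example (illustrative only): patching a B/BL at 0x1000 to
+// reach a target at 0x2000. Reading the PC yields the address of the
+// current instruction plus 8, so:
+//   long offset = 0x2000 - (0x1000 + 8);                   // = 0xff8 bytes
+//   Instruction_aarch32::spatch(branch, 23, 0, 0xff8 / 4); // imm24 = 0x3fe
+// The same +8 correction applies to every PC-relative form handled below.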
+
+int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
+  int instructions = 1;
+  long offset = target - (branch + 8); // correct for the PC: it reads as the address of the current instruction plus 8 (two instructions ahead)
+  bool add = offset >= 0;
+  unsigned insn = *(unsigned*)branch;
+  int opc = Instruction_aarch32::extract(insn, 27, 24);
+
+  if(0b1010 == opc || 0b1011 == opc) {
+    // Branch or branch with link
+    assert(0 == (offset & 3), "not aligned correctly");
+    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
+  } else if (0b0011 == opc) {
+    // Movw, Movt or mov, orr, orr, orr
+    // patch up address load to registers (absolute address).
+    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
+  } else if (0b010 == (opc >> 1)) {
+    // LDR, LDRB, STR, STRB
+    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b000 == (opc >> 1)) {
+    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
+    offset = uabs(offset);
+    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
+    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b1101 == opc) {
+    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
+    offset = uabs(offset);
+    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
+    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b0010 == opc) {
+    // ADR
+    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
+    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01 );
+  } else {
+    ShouldNotReachHere();
+  }
+  // aarch64 had something for polling page load?
+  return instructions * NativeInstruction::arm_insn_sz;
+}
+
+int MacroAssembler::patch_oop(address insn_addr, address o) {
+  unsigned insn = *(unsigned*)insn_addr;
+  int opc = Instruction_aarch32::extract(insn, 27, 21);
+  if(0b0011000 == opc) {
+    // 32-bit pointer, formed of a movw and a movt
+    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");
+
+    uint32_t btm = (uint32_t)o & 0xffff;
+    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
+    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
+    uint32_t top = (uint32_t)o >> 16;
+    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
+    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
+    return 2 * NativeInstruction::arm_insn_sz;
+  } else if(0b0011101 == opc) {
+    // Alternative 32-bit load sequence using mov, orr, orr, orr
+    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
+    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
+    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
+    // FIXME this could carry us outside valid memory
+
+    uint32_t addr = (uint32_t)o;
+    Instruction_aarch32::patch(insn_addr + 0, 11, 0, (0b0000 << 8) | ((addr >> 0) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 4, 11, 0, (0b1100 << 8) | ((addr >> 8) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 8, 11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
+    return 4 * NativeInstruction::arm_insn_sz;
+  } else {
+    ShouldNotReachHere();
+  }
+  return 0; // not reached
+}
+
+address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
+  long offset = 0;
+  int opc = Instruction_aarch32::extract(insn, 27, 24);
+
+  if(0b1010 == opc || 0b1011 ==
opc) { + // Branch or branch with link + offset = Instruction_aarch32::sextract(insn, 23, 0) * 4; + } else if (0b0011 == opc) { + unsigned *insn_buf = (unsigned*)insn_addr; + int opc2 = Instruction_aarch32::extract(insn, 23, 21); + if(0b000 == opc2) { + // movw, movt (only on newer ARMs) + assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch"); + uint32_t addr; + addr = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28; + addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16; + addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12; + addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0); + return address(addr); + } else if(0b101 == opc2) { + // mov, orr, orr, orr + assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch"); + uint32_t addr; + // TODO Check that the rotations are in the expected order. + addr = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0)); + return address(addr); + } else { + ShouldNotReachHere(); + } + } else if (0b010 == (opc >> 1)) { + // LDR, LDRB, STR, STRB + offset = Instruction_aarch32::extract(insn, 11, 0); + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b000 == (opc >> 1)) { + // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD + offset = Instruction_aarch32::extract(insn, 3, 0); + offset |= Instruction_aarch32::extract(insn, 11, 8) << 4; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b1101 == opc) { + // VLDR, VSTR - NOTE VSTR(lit) is deprecated + offset = Instruction_aarch32::extract(insn, 7, 0) << 2; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b0010 == opc) { + // ADR + offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0)); + int code = Instruction_aarch32::extract(insn, 23, 22); + switch(code) { + case 0b01: offset = -offset; break; + case 0b10: break; + default: ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + //Correct offset for PC + offset += 8; + return address(((uint32_t)insn_addr + offset)); +} + + +void MacroAssembler::serialize_memory(Register thread, Register tmp) { + dmb(Assembler::ISH); +} + +void MacroAssembler::safepoint_poll(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + mov(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state())); + ldr(rscratch1, Address(rscratch1)); + cmp(rscratch1, SafepointSynchronize::_not_synchronized); + b(slow_path, Assembler::NE); + } +} + +// Just like safepoint_poll, but use an acquiring load for thread- +// local polling. +// +// We need an acquire here to ensure that any subsequent load of the +// global SafepointSynchronize::_state flag is ordered after this load +// of the local Thread::_polling page. We don't want this poll to +// return false (i.e. 
not safepointing) and a later poll of the global +// SafepointSynchronize::_state spuriously to return true. +// +// This is to avoid a race when we're in a native->Java transition +// racing the code which wakes up from a safepoint. +// +void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + lea(rscratch1, Address(rthread, Thread::polling_page_offset())); + ldr(rscratch1, rscratch1); + dmb(Assembler::ISH); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + safepoint_poll(slow_path); + } +} + +void MacroAssembler::reset_last_Java_frame(bool clear_fp) { + mov(rscratch1, 0); + // we must set sp to zero to clear frame + str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset())); + // must clear fp, so that compiled frames are not confused; it is + // possible that we need it only for debugging + if (clear_fp) { + str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset())); + } + + // Always clear the pc because it could have been set by make_walkable() + str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset())); +} + +// Calls to C land +// +// When entering C land, the rfp & sp of the last Java frame have to be recorded +// in the (thread-local) JavaThread object. When leaving C land, the last Java fp +// has to be reset to 0. This is required to allow proper stack traversal. +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Register last_java_pc, + Register scratch) { + + if (last_java_pc->is_valid()) { + str(last_java_pc, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + } + + // determine last_java_sp register + if (last_java_sp == sp) { + mov(scratch, sp); + last_java_sp = scratch; + } else if (!last_java_sp->is_valid()) { + last_java_sp = sp; + } + + str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); + + // last_java_fp is optional + if (last_java_fp->is_valid()) { + str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); + } +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + address last_java_pc, + Register scratch) { + if (last_java_pc != NULL) { + adr(scratch, last_java_pc); + } else { + // FIXME: This is almost never correct. We should delete all + // cases of set_last_Java_frame with last_java_pc=NULL and use the + // correct return address instead. 
+ adr(scratch, pc()); + } + + str(scratch, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + + set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Label &L, + Register scratch) { + if (L.is_bound()) { + set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); + } else { + InstructionMark im(this); + L.add_patch_at(code(), locator()); + set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); + } +} + +void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(lr, entry); + if (cbuf) cbuf->set_insts_mark(); + bl(lr); + } else { + if (cbuf) cbuf->set_insts_mark(); + bl(entry); + } +} + +void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(tmp, entry); + if (cbuf) cbuf->set_insts_mark(); + b(tmp); + } else { + if (cbuf) cbuf->set_insts_mark(); + b(entry); + } +} + +void MacroAssembler::reserved_stack_check() { + // testing if reserved zone needs to be enabled + Label no_reserved_zone_enabling; + + ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); + cmp(sp, rscratch1); + b(no_reserved_zone_enabling, Assembler::LO); + + enter(); // LR and FP are live. + lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); + mov(c_rarg0, rthread); + bl(rscratch1); + leave(); + + // We have already removed our own frame. + // throw_delayed_StackOverflowError will think that it's been + // called by our caller. + lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); + b(rscratch1); + should_not_reach_here(); + + bind(no_reserved_zone_enabling); +} + +int MacroAssembler::biased_locking_enter(Register obj_reg, + Register swap_reg, + Register tmp_reg, + Register tmp_reg2, + bool swap_reg_contains_mark, + Label& done, + Label* slow_case, + BiasedLockingCounters* counters) { + assert(UseBiasedLocking, "why call this otherwise?"); + + if (PrintBiasedLockingStatistics && counters == NULL) + counters = BiasedLocking::counters(); + + assert(tmp_reg != noreg, "must be real register"); + assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp_reg2); + assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); + Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); + Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); + + // Biased locking + // See whether the lock is currently biased toward our thread and + // whether the epoch is still valid + // Note that the runtime guarantees sufficient alignment of JavaThread + // pointers to allow age to be placed into low bits + // First check to see whether biasing is even enabled for this object + Label cas_label; + int null_check_offset = -1; + if (!swap_reg_contains_mark) { + null_check_offset = offset(); + ldr(swap_reg, mark_addr); + } + andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); + cmp(tmp_reg, markOopDesc::biased_lock_pattern); + b(cas_label, Assembler::NE); + // The bias pattern is present in the object's header. 
Need to check + // whether the bias owner and the epoch are both still current. + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, tmp_reg, rthread); + eor(tmp_reg, swap_reg, tmp_reg); +// andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); + bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place); + if (counters != NULL) { + Label around; + cbnz(tmp_reg, around); + atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, tmp_reg2); + b(done); + bind(around); + } else { + cbz(tmp_reg, done); + } + + Label try_revoke_bias; + Label try_rebias; + + // At this point we know that the header has the bias pattern and + // that we are not the bias owner in the current epoch. We need to + // figure out more details about the state of the header in order to + // know what operations can be legally performed on the object's + // header. + + // If the low three bits in the xor result aren't clear, that means + // the prototype header is no longer biased and we have to revoke + // the bias on this object. + andr(tmp_reg2, tmp_reg, markOopDesc::biased_lock_mask_in_place); + cbnz(tmp_reg2, try_revoke_bias); + + // Biasing is still enabled for this data type. See whether the + // epoch of the current bias is still valid, meaning that the epoch + // bits of the mark word are equal to the epoch bits of the + // prototype header. (Note that the prototype header's epoch bits + // only change at a safepoint.) If not, attempt to rebias the object + // toward the current thread. Note that we must be absolutely sure + // that the current epoch is invalid in order to do this because + // otherwise the manipulations it performs on the mark word are + // illegal. + andr(tmp_reg2, tmp_reg, markOopDesc::epoch_mask_in_place); + cbnz(tmp_reg2, try_rebias); + + // The epoch of the current bias is still valid but we know nothing + // about the owner; it might be set or it might be clear. Try to + // acquire the bias of the object using an atomic operation. If this + // fails we will go in to the runtime to revoke the object's bias. + // Note that we first construct the presumed unbiased header so we + // don't accidentally blow away another thread's valid bias. + { + Label here; + mov(tmp_reg2, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); + andr(swap_reg, swap_reg, tmp_reg2); + orr(tmp_reg, swap_reg, rthread); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, this means that + // another thread succeeded in biasing it toward itself and we + // need to revoke that bias. The revocation will occur in the + // interpreter runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_rebias); + // At this point we know the epoch has expired, meaning that the + // current "bias owner", if any, is actually invalid. Under these + // circumstances _only_, we are allowed to use the current header's + // value as the comparison value when doing the cas to acquire the + // bias in the current epoch. In other words, we allow transfer of + // the bias from one thread to another directly in this situation. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. 
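+  // Conceptually (a sketch): CAS the mark word from its current value to
+  //   prototype_header | (uint32_t)rthread
+  // i.e. bias the object to this thread under the new epoch.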
+ { + Label here; + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, rthread, tmp_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, then another thread + // succeeded in biasing it toward itself and we need to revoke that + // bias. The revocation will occur in the runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_revoke_bias); + // The prototype mark in the klass doesn't have the bias bit set any + // more, indicating that objects of this data type are not supposed + // to be biased any more. We are going to try to reset the mark of + // this object to the prototype value and fall through to the + // CAS-based locking scheme. Note that if our CAS fails, it means + // that another thread raced us for the privilege of revoking the + // bias of this particular object, so it's okay to continue in the + // normal locking code. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. + { + Label here, nope; + load_prototype_header(tmp_reg, obj_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, &nope); + bind(here); + + // Fall through to the normal CAS-based lock, because no matter what + // the result of the above CAS, some thread must have succeeded in + // removing the bias bit from the object's header. + if (counters != NULL) { + atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, + tmp_reg2); + } + bind(nope); + } + + bind(cas_label); + + return null_check_offset; +} + +void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { + assert(UseBiasedLocking, "why call this otherwise?"); + + // Check for biased locking unlock case, which is a no-op + // Note: we do not have to check the thread ID for two reasons. + // First, the interpreter checks for IllegalMonitorStateException at + // a higher level. Second, if the bias was revoked while we held the + // lock, the object could not be rebiased toward another thread, so + // the bias bit would be clear. 
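+  // For reference, the low bits of the 32-bit mark word tested here are
+  // laid out as [ thread/hash | epoch:2 | age:4 | biased_lock:1 | lock:2 ],
+  // so (mark & biased_lock_mask_in_place) == biased_lock_pattern checks the
+  // low three bits for 0b101 (biased/biasable); 0b001 would mean unlocked.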
+  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
+  cmp(temp_reg, markOopDesc::biased_lock_pattern);
+  b(done, Assembler::EQ);
+}
+
+
+static void pass_arg0(MacroAssembler* masm, Register arg) {
+  if (c_rarg0 != arg) {
+    masm->mov(c_rarg0, arg);
+  }
+}
+
+static void pass_arg1(MacroAssembler* masm, Register arg) {
+  if (c_rarg1 != arg) {
+    masm->mov(c_rarg1, arg);
+  }
+}
+
+static void pass_arg2(MacroAssembler* masm, Register arg) {
+  if (c_rarg2 != arg) {
+    masm->mov(c_rarg2, arg);
+  }
+}
+
+static void pass_arg3(MacroAssembler* masm, Register arg) {
+  if (c_rarg3 != arg) {
+    masm->mov(c_rarg3, arg);
+  }
+}
+
+void MacroAssembler::call_VM_base(Register oop_result,
+                                  Register java_thread,
+                                  Register last_java_sp,
+                                  address entry_point,
+                                  int number_of_arguments,
+                                  bool check_exceptions) {
+  // determine java_thread register
+  if (!java_thread->is_valid()) {
+    java_thread = rthread;
+  }
+
+  // determine last_java_sp register
+  if (!last_java_sp->is_valid()) {
+    last_java_sp = sp;
+  }
+
+  // debugging support
+  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
+  assert(java_thread == rthread, "unexpected register");
+
+  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
+  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
+
+  // push java thread (becomes first argument of C function)
+
+  mov(c_rarg0, java_thread);
+
+  // set last Java frame before call
+  assert(last_java_sp != rfp, "can't use rfp");
+
+  Label l;
+  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);
+
+
+  // FIXME: can we save LR in a more elegant way?
+  //str(lr, pre(sp, -wordSize));
+
+  // do the call, remove parameters
+  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
+
+  //ldr(lr, post(sp, wordSize));
+
+  // reset last Java frame
+  // Only interpreter should have to clear fp
+  reset_last_Java_frame(true);
+
+  // C++ interp handles this in the interpreter
+  check_and_handle_popframe(java_thread);
+  check_and_handle_earlyret(java_thread);
+
+  if (check_exceptions) {
+    // check for pending exceptions (java_thread is set upon return)
+    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
+    Label ok;
+    cbz(rscratch2, ok);
+
+    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
+    // forward_exception uses LR to choose the exception handler, but LR has been trashed by the code above.
+    // Since we used to get here from interpreted code, BL is an acceptable way to acquire the correct LR
+    // (see StubGenerator::generate_forward_exception).
+    bl(rscratch2);
+    bind(ok);
+  }
+
+  // get oop result if there is one and reset the value in the thread
+  if (oop_result->is_valid()) {
+    get_vm_result(oop_result, java_thread);
+  }
+}
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
+}
+
+// Maybe emit a call via a trampoline. If the code cache is small,
+// trampolines won't be emitted.
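+// The far-branch trampoline emitted below has, conceptually, this shape
+// (a sketch; the exact PC-relative encodings are handled by the assembler,
+// and NativeCall::instruction_size is assumed to cover three words):
+//   add  lr, pc, #<delta>   ; lr = first byte after the trampoline
+//   ldr  pc, <literal>      ; jump via the 4-byte literal that follows
+//   .word <destination>     ; raw target address, patchable atomically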
+
+void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
+  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
+  assert(entry.rspec().type() == relocInfo::runtime_call_type
+         || entry.rspec().type() == relocInfo::opt_virtual_call_type
+         || entry.rspec().type() == relocInfo::static_call_type
+         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
+
+  if (cbuf) {
+    cbuf->set_insts_mark();
+  }
+
+  if (far_branches()) {
+    // Build the trampoline so that the destination address is a raw
+    // 4-byte value, which can then be patched atomically.
+    relocate(entry.rspec());
+    address start = pc();
+    add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
+    ldr(r15_pc, Address(r15_pc, 4));
+    emit_int32((uintptr_t) entry.target());
+    // possibly pad the call to the NativeCall size to make patching happy
+    while (pc() - start < NativeCall::instruction_size) {
+      nop();
+    }
+    assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!");
+  } else {
+    bl(entry);
+  }
+}
+
+void MacroAssembler::c2bool(Register x) {
+  ands(r0, r0, 0xff);
+  mov(r0, 1, Assembler::NE);
+}
+
+void MacroAssembler::ic_call(address entry, jint method_index) {
+  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
+  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
+  // unsigned long offset;
+  // ldr_constant(rscratch2, const_ptr);
+  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
+  trampoline_call(Address(entry, rh));
+}
+
+// Implementation of call_VM versions
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             bool check_exceptions) {
+  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             int number_of_arguments,
+                             bool check_exceptions) {
+  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
+}
+
+
+void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
+  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
+  assert(oop_result != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
+  verify_oop(oop_result, "broken oop in call_VM_base");
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
+  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
+  assert(metadata_result != rscratch2 &&
+         java_thread != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
+}
+
+void MacroAssembler::align(int modulus) {
+  while (offset() % modulus != 0) nop();
+}
+
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
+
+
+RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
+                                                      Register tmp,
+                                                      int offset) {
+  intptr_t value = *delayed_value_addr;
+  if (value != 0)
+    return RegisterOrConstant(value + offset);
+
+  // load indirectly to solve generation ordering problem
+  ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+  if (offset != 0)
+    add(tmp, tmp, offset);
+
+  return RegisterOrConstant(tmp);
+}
+
+
+// Look up the method for a megamorphic invokeinterface call.
+// The target method is determined by <intf_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
+void MacroAssembler::lookup_interface_method(Register recv_klass,
+                                             Register intf_klass,
+                                             RegisterOrConstant itable_index,
+                                             Register method_result,
+                                             Register scan_temp,
+                                             Label& L_no_such_interface,
+                                             bool return_method) {
+  assert_different_registers(recv_klass, intf_klass, scan_temp);
+  assert_different_registers(method_result, intf_klass, scan_temp);
+  assert(recv_klass != method_result || !return_method,
+         "recv_klass can be destroyed when method isn't needed");
+
+  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+  int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
+  int itentry_off = itableMethodEntry::method_offset_in_bytes();
+  int scan_step = itableOffsetEntry::size() * wordSize;
+  int vte_size = vtableEntry::size_in_bytes();
+  assert(vte_size == wordSize, "else adjust times_vte_scale");
+
+  ldr(scan_temp, Address(recv_klass, in_bytes(InstanceKlass::vtable_length_offset())));
+
+  // %%% Could store the aligned, prescaled offset in the klassoop.
+  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
+  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
+  add(scan_temp, scan_temp, vtable_base);
+
+  if (return_method) {
+    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
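+    // e.g. with itable_index = 3 this computes (illustrative)
+    //   recv_klass += 3 * wordSize + itentry_off
+    // so that later [recv_klass + scan->offset()] addresses the method entry.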
+ assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); + // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); + lea(recv_klass, itable_index.is_register() ? + Address(recv_klass, itable_index, lsl(2)) : + Address(recv_klass, itable_index.as_constant() << 2)); + if (itentry_off) + add(recv_klass, recv_klass, itentry_off); + } + + // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { + // if (scan->interface() == intf) { + // result = (klass + scan->offset() + itable_index); + // } + // } + Label search, found_method; + + for (int peel = 1; peel >= 0; peel--) { + ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); + cmp(intf_klass, method_result); + + if (peel) { + b(found_method, Assembler::EQ); + } else { + b(search, Assembler::NE); + // (invert the test to fall through to found_method...) + } + + if (!peel) break; + + bind(search); + + // Check that the previous entry is non-null. A null entry means that + // the receiver class doesn't implement the interface, and wasn't the + // same as when the caller was compiled. + cbz(method_result, L_no_such_interface); + add(scan_temp, scan_temp, scan_step); + } + + bind(found_method); + + if (return_method) { + // Got a hit. + ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); + ldr(method_result, Address(recv_klass, scan_temp)); + } +} + +// virtual method calling +void MacroAssembler::lookup_virtual_method(Register recv_klass, + RegisterOrConstant vtable_index, + Register method_result) { + const int base = in_bytes(InstanceKlass::vtable_start_offset()); + int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); + if (vtable_index.is_register()) { + lea(method_result, Address(recv_klass, + vtable_index.as_register(), + lsl(LogBytesPerWord))); + ldr(method_result, Address(method_result, vtable_offset_in_bytes)); + } else { + vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; + if(is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) { + ldr(method_result, Address(recv_klass, vtable_offset_in_bytes)); + } else { + mov(method_result, vtable_offset_in_bytes); + ldr(method_result, Address(recv_klass, method_result)); + } + } +} + +void MacroAssembler::check_klass_subtype(Register sub_klass, + Register super_klass, + Register temp_reg, + Label& L_success) { + Label L_failure; + check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); + check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); + bind(L_failure); +} + + +void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Label* L_success, + Label* L_failure, + Label* L_slow_path, + RegisterOrConstant super_check_offset) { + assert_different_registers(sub_klass, super_klass, temp_reg); + bool must_load_sco = (super_check_offset.constant_or_zero() == -1); + if (super_check_offset.is_register()) { + assert_different_registers(sub_klass, super_klass, + super_check_offset.as_register()); + } else if (must_load_sco) { + assert(temp_reg != noreg, "supply either a temp or a register offset"); + } + + Label L_fallthrough; + int label_nulls = 0; + if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } + if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } + if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } + 
assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
+  Address super_check_offset_addr(super_klass, sco_offset);
+
+  // Hacked jmp, which may only be used just before L_fallthrough.
+#define final_jmp(label)                                  \
+  if (&(label) == &L_fallthrough) { /*do nothing*/ }      \
+  else b(label)                /*omit semi*/
+
+  // If the pointers are equal, we are done (e.g., String[] elements).
+  // This self-check enables sharing of secondary supertype arrays among
+  // non-primary types such as array-of-interface. Otherwise, each such
+  // type would need its own customized SSA.
+  // We move this check to the front of the fast path because many
+  // type checks are in fact trivially successful in this manner,
+  // so we get a nicely predicted branch right at the start of the check.
+  cmp(sub_klass, super_klass);
+  b(*L_success, Assembler::EQ);
+
+  // Check the supertype display:
+  if (must_load_sco) {
+    ldr(temp_reg, super_check_offset_addr);
+    super_check_offset = RegisterOrConstant(temp_reg);
+  }
+  Address super_check_addr(sub_klass, super_check_offset);
+  ldr(rscratch1, super_check_addr);
+  cmp(super_klass, rscratch1); // load displayed supertype
+
+  // This check has worked decisively for primary supers.
+  // Secondary supers are sought in the super_cache ('super_cache_addr').
+  // (Secondary supers are interfaces and very deeply nested subtypes.)
+  // This works in the same check above because of a tricky aliasing
+  // between the super_cache and the primary super display elements.
+  // (The 'super_check_addr' can address either, as the case requires.)
+  // Note that the cache is updated below if it does not help us find
+  // what we need immediately.
+  // So if it was a primary super, we can just fail immediately.
+  // Otherwise, it's the slow path for us (no success at this point).
+
+  if (super_check_offset.is_register()) {
+    b(*L_success, Assembler::EQ);
+    cmp(super_check_offset.as_register(), sc_offset);
+    if (L_failure == &L_fallthrough) {
+      b(*L_slow_path, Assembler::EQ);
+    } else {
+      b(*L_failure, Assembler::NE);
+      final_jmp(*L_slow_path);
+    }
+  } else if (super_check_offset.as_constant() == sc_offset) {
+    // Need a slow path; fast failure is impossible.
+    if (L_slow_path == &L_fallthrough) {
+      b(*L_success, Assembler::EQ);
+    } else {
+      b(*L_slow_path, Assembler::NE);
+      final_jmp(*L_success);
+    }
+  } else {
+    // No slow path; it's a fast decision.
+    if (L_failure == &L_fallthrough) {
+      b(*L_success, Assembler::EQ);
+    } else {
+      b(*L_failure, Assembler::NE);
+      final_jmp(*L_success);
+    }
+  }
+
+  bind(L_fallthrough);
+
+#undef final_jmp
+}
+
+// These two are taken from x86, but they look generally useful
+
+// scans count pointer-sized words at [addr] for an occurrence of value,
+// generic
+void MacroAssembler::repne_scan(Register addr, Register value, Register count,
+                                Register scratch) {
+  Label loop, fail, found;
+  cmp(count, 0);
+  b(fail, EQ);
+
+  bind(loop);
+  ldr(scratch, post(addr, wordSize));
+  cmp(value, scratch);
+  b(found, EQ);
+  subs(count, count, 1);
+  b(loop, NE);
+
+  bind(fail);
+  cmp(sp, 0); // sp never zero
+  bind(found);
+}
+
+// Form an address from base + offset in Rd. Rd may or may
+// not actually be used: you must use the Address that is returned.
+// It is up to you to ensure that the shift provided matches the size
+// of your data.
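+// For example (illustrative): form_address(r3, rbase, 0x12345, 0) cannot
+// encode 0x12345 directly, so it may emit add(r3, rbase, 0x12000) and
+// return Address(r3, 0x345), both pieces then fitting their immediate
+// fields.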
+Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
+  // The form_address result should only be used together with ldr/str instructions;
+  // otherwise please provide the exact type instead of IDT_INT or apply safe_for()
+  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
+    // It fits; no need for any heroics
+    return Address(base, byte_offset);
+
+  // See if we can do this with two 12-bit offsets
+  {
+    unsigned long masked_offset = byte_offset & ~0xfff;
+    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
+        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
+      add(Rd, base, masked_offset);
+      byte_offset -= masked_offset;
+      return Address(Rd, byte_offset);
+    }
+  }
+
+  // Do it the hard way
+  mov(Rd, byte_offset);
+  add(Rd, base, Rd);
+  return Address(Rd);
+}
+
+// scans count 4 byte words at [addr] for an occurrence of value,
+// generic
+/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
+                                 Register scratch) {
+  Label Lloop, Lexit;
+  cbz(count, Lexit);
+  bind(Lloop);
+  ldr(scratch, post(addr, wordSize));
+  cmp(value, scratch);
+  b(Lexit, EQ);
+  sub(count, count, 1);
+  cbnz(count, Lloop);
+  bind(Lexit);
+}*/
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   bool set_cond_codes) {
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  if (temp2_reg != noreg)
+    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
+#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  Address secondary_supers_addr(sub_klass, ss_offset);
+  Address super_cache_addr(     sub_klass, sc_offset);
+
+  BLOCK_COMMENT("check_klass_subtype_slow_path");
+
+  // Do a linear scan of the secondary super-klass chain.
+  // This code is rarely used, so simplicity is a virtue here.
+  // The repne_scan routine uses fixed registers, which we must spill.
+  // Don't worry too much about pre-existing connections with the input regs.
+
+  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
+  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
+
+  RegSet pushed_registers;
+  if (!IS_A_TEMP(r2)) pushed_registers += r2;
+  if (!IS_A_TEMP(r14)) pushed_registers += r14;
+
+  if (super_klass != r0) {
+    if (!IS_A_TEMP(r0)) pushed_registers += r0;
+  }
+
+  push(pushed_registers, sp);
+
+  // Get super_klass value into r0 (even if it was in r5 or r2).
+  if (super_klass != r0) {
+    mov(r0, super_klass);
+  }
+
+#ifndef PRODUCT
+  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
+  Address pst_counter_addr(rscratch2);
+  ldr(rscratch1, pst_counter_addr);
+  add(rscratch1, rscratch1, 1);
+  str(rscratch1, pst_counter_addr);
+#endif //PRODUCT
+
+  // We will consult the secondary-super array.
+  ldr(r14, secondary_supers_addr);
+  // Load the array length.
+  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
+  // Skip to start of data.
+  add(r14, r14, Array<Klass*>::base_offset_in_bytes());
+
+  cmp(sp, 0); // Clear Z flag; SP is never zero
+  // Scan R2 words at [R14] for an occurrence of R0.
+  // Set NZ/Z based on last compare.
+  repne_scan(r14, r0, r2, rscratch1);
+
+  // Unspill the temp. registers:
+  pop(pushed_registers, sp);
+
+  b(*L_failure, Assembler::NE);
+
+  // Success. Cache the super we found and proceed in triumph.
+  str(super_klass, super_cache_addr);
+
+  if (L_success != &L_fallthrough) {
+    b(*L_success);
+  }
+
+#undef IS_A_TEMP
+
+  bind(L_fallthrough);
+}
+
+
+void MacroAssembler::verify_oop(Register reg, const char* s) {
+  if (!VerifyOops) return;
+
+  // Pass register number to verify_oop_subroutine
+  const char* b = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    ss.print("verify_oop: %s: %s", reg->name(), s);
+    b = code_string(ss.as_string());
+  }
+  BLOCK_COMMENT("verify_oop {");
+
+  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  mov(r0, reg);
+  mov(rscratch1, (address)b);
+  mrs(r1);
+
+  // call indirectly to solve generation ordering problem
+  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
+  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+  ldr(rscratch2, Address(rscratch2));
+  bl(rscratch2);
+  reg_printf("Verify oop exit, sp = %p, rfp = %p\n", sp, rfp);
+
+  msr(r1);
+  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  BLOCK_COMMENT("} verify_oop");
+}
+
+void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
+  if (!VerifyOops) return;
+
+  const char* b = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    ss.print("verify_oop_addr: %s", s);
+    b = code_string(ss.as_string());
+  }
+  BLOCK_COMMENT("verify_oop_addr {");
+
+  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+  mrs(r1);
+
+  // addr may contain sp so we will have to adjust it based on the
+  // pushes that we just did.
+  if (addr.uses(sp)) {
+    lea(r0, addr);
+    ldr(r0, Address(r0, 5 * wordSize));
+  } else {
+    ldr(r0, addr);
+  }
+  mov(rscratch1, (address)b);
+
+  // call indirectly to solve generation ordering problem
+  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+  ldr(rscratch2, Address(rscratch2));
+  bl(rscratch2);
+
+  msr(r1);
+  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  BLOCK_COMMENT("} verify_oop_addr");
+}
+
+Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
+                                         int extra_slot_offset) {
+  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
+  int stackElementSize = Interpreter::stackElementSize;
+  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
+#ifdef ASSERT
+  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
+  assert(offset1 - offset == stackElementSize, "correct arithmetic");
+#endif
+  if (arg_slot.is_constant()) {
+    return Address(sp, arg_slot.as_constant() * stackElementSize
+                   + offset);
+  } else {
+    add(rscratch1, sp, arg_slot.as_register(),
+        lsl(exact_log2(stackElementSize)));
+    return Address(rscratch1, offset);
+  }
+}
+
+void MacroAssembler::call_VM_leaf_base(address entry_point,
+                                       int number_of_arguments,
+                                       Label *retaddr) {
+  Label E, L;
+
+  // FIXME: do this alignment in a more elegant way
+  mov(rscratch2, sp);
+  sub(sp, sp, wordSize);
+  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
+  str(rscratch2, Address(sp));
+
+  // FIXME: do we need to preserve rscratch2?
+  //str(rscratch2, Address(pre(sp, -wordSize)));
+
+  mov(rscratch2, entry_point);
+  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
+  bl(rscratch2);
+  if (retaddr)
+    bind(*retaddr);
+  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);
+
+  //ldr(rscratch2, Address(post(sp, wordSize)));
+
+  // Undo alignment
+  ldr(sp, Address(sp));
+
+  maybe_isb();
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
+  call_VM_leaf_base(entry_point, number_of_arguments);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
+  pass_arg0(this, arg_0);
+  call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+  pass_arg0(this, arg_0);
+  pass_arg1(this, arg_1);
+  call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
+                                  Register arg_1, Register arg_2) {
+  pass_arg0(this, arg_0);
+  pass_arg1(this, arg_1);
+  pass_arg2(this, arg_2);
+  call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
+  assert(arg_0 != c_rarg2, "smashed arg");
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
+  assert(arg_0 != c_rarg3, "smashed arg");
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+  assert(arg_0 != c_rarg2, "smashed arg");
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 4);
+}
+
+// Clobbers rscratch1
+void MacroAssembler::null_check(Register reg, int offset) {
+  if (needs_explicit_null_check(offset)) {
+    // provoke OS NULL exception if reg = NULL by
+    // accessing M[reg] w/o changing any registers
+    // NOTE: this is plenty to provoke a segv
+    reg_printf("Generating OS check null with ptr = %p\n", reg);
+    assert(reg != rscratch1, "can't be");
+    ldr(rscratch1, Address(reg));
+  } else {
+    // nothing to do, (later) access of M[reg + offset]
+    // will provoke OS NULL exception if reg = NULL
+  }
+}
+
+// MacroAssembler protected routines needed to implement
+// public methods
+
+void MacroAssembler::mov(Register r, Address dest, Condition cond) {
+  code_section()->relocate(pc(), dest.rspec());
+  uint32_t imm32 = (uint32_t)dest.target();
+  movptr(r, imm32, cond);
+}
+
+// Move a constant pointer into r. In aarch32 the address space
+// is 32 bits in size and so a pointer can be encoded in two mov
+// instructions.
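+// For example (illustrative, assuming mov_immediate32 selects the
+// movw/movt pair rather than the mov/orr fallback):
+//   movptr(r0, 0xcafebabe) emits roughly
+//     movw r0, #0xbabe    ; low halfword
+//     movt r0, #0xcafe    ; high halfword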
+void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
+#ifndef PRODUCT
+  {
+    char buffer[64];
+    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
+    block_comment(buffer);
+  }
+#endif
+  Assembler::mov_immediate32(r, imm32, cond, false);
+}
+
+void MacroAssembler::ret(Register reg) {
+  assert(reg == lr, "Can do return only to LR");
+  b(lr);
+}
+
+void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
+  Label retry_load;
+  bind(retry_load);
+  // flush and load exclusive from the memory location
+  ldrex(tmp, counter_addr);
+  add(tmp, tmp, 1);
+  // if we store+flush with no intervening write tmp will be zero
+  strex(tmp, tmp, counter_addr);
+  cmp(tmp, 0);
+  b(retry_load, Assembler::NE);
+}
+
+
+// MacroAssembler routines actually found to be needed
+
+void MacroAssembler::push(Register src)
+{
+  str(src, Address(pre(sp, -1 * wordSize)));
+}
+
+void MacroAssembler::pop(Register dst)
+{
+  ldr(dst, Address(post(sp, 1 * wordSize)));
+}
+
+// Note: load_unsigned_short used to be called load_unsigned_word.
+int MacroAssembler::load_unsigned_short(Register dst, Address src) {
+  int off = offset();
+  ldrh(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
+  int off = offset();
+  ldrb(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_signed_short(Register dst, Address src) {
+  int off = offset();
+  ldrsh(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_signed_byte(Register dst, Address src) {
+  int off = offset();
+  ldrsb(dst, src);
+  return off;
+}
+
+void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
+  switch (size_in_bytes) {
+  //case 8: ldr(dst, src); break;
+  case 4: ldr(dst, src); break;
+  case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
+  case 1: is_signed ?
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { + switch (size_in_bytes) { + //case 8: str(src, dst); break; + case 4: str(src, dst); break; + case 2: strh(src, dst); break; + case 1: strb(src, dst); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::decrement(Register reg, int value) { + if (value < 0) { + increment(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + sub(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for decrement"); + mov(rscratch2, (unsigned int) value); + sub(reg, reg, rscratch2); +} + +void MacroAssembler::decrement(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for decrement"); + ldr(rscratch1, dst); + decrement(rscratch1, value); + str(rscratch1, dst); +} + +void MacroAssembler::increment(Register reg, int value) { + if (value < 0) { + decrement(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + add(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for increment"); + mov(rscratch2, (unsigned int) value); + add(reg, reg, rscratch2); +} + +void MacroAssembler::increment(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for increment"); + ldr(rscratch1, dst); + increment(rscratch1, value); + str(rscratch1, dst); +} + +// Loads and stores everything except the pc and sp +void MacroAssembler::pusha() { + unsigned regset = 0b0101111111111111; + stmdb(sp, regset); +} +void MacroAssembler::popa() { + unsigned regset = 0b0101111111111111; + ldmia(sp, regset); +} + +static void multiple_reg_check(unsigned int bitset, Register stack) { + const unsigned int pcbit = 1 << r15_pc->encoding(); + const unsigned int lrbit = 1 << lr->encoding(); + const unsigned int spbit = 1 << sp->encoding(); + const unsigned int stackbit = 1 << stack->encoding(); + assert(!(bitset & spbit), "The SP can be in the list. However, " + "ARM deprecates using these instructions with SP in the list."); + assert(!(bitset & pcbit) || !(bitset & lrbit), + "ARM deprecates using these instructions with both " + "the LR and the PC in the list."); + assert(!(bitset & stackbit), "Instructions with the base register " + "in the list and ! specified are only available before ARMv7, " + "and ARM deprecates the use of such instructions. " + "The value of the base register after such an instruction is UNKNOWN"); +} + +// Push lots of registers in the bit set supplied. Don't push sp. +// Return the number of words pushed +int MacroAssembler::push(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + stmdb(stack, bitset); + return count; +} + +int MacroAssembler::pop(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + ldmia(stack, bitset); + return count; +} + +void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { + Label done, not_weak; + cbz(value, done); // Use NULL as-is. 
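+  // Weak handles are distinguished by a tag in bit 0 of the handle
+  // (JNIHandles::weak_tag_mask == 1). For example (illustrative),
+  // handle 0x1001 is a jweak referring to slot 0x1000, while an
+  // untagged 0x1000 is an ordinary, strong jobject.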
+ + STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); + tbz(value, 0, not_weak); // Test for jweak tag. + + // Resolve jweak. + + access_load_word_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, + value, Address(value, -JNIHandles::weak_tag_value), tmp, noreg); + verify_oop(value); + b(done); + + + bind(not_weak); + // Resolve (untagged) jobject. + access_load_word_at(T_OBJECT, IN_NATIVE, value, Address(value), tmp, noreg); + verify_oop(value); + bind(done); +} + +void MacroAssembler::stop(const char* msg) { + pusha(); + // Save old sp value + add(rscratch2, sp, 14 * wordSize); + str(rscratch2, Address(pre(sp, -4))); + mov(c_rarg0, (address)msg); + mov(c_rarg1, r15_pc); + sub(c_rarg1, c_rarg1, 8); // Restore to actual value + mov(c_rarg2, sp); + mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32)); + bl(c_rarg3); + hlt(0); +} + +void MacroAssembler::unimplemented(const char* what) { + const char* buf = NULL; + { + ResourceMark rm; + stringStream ss; + ss.print("unimplemented: %s", what); + buf = code_string(ss.as_string()); + } + stop(buf); +} + +// this simulates the behaviour of the x86 cmpxchg instruction using a +// load linked/store conditional pair. we use the acquire/release +// versions of these instructions so that we flush pending writes as +// per Java semantics. + +// n.b the x86 version assumes the old value to be compared against is +// in rax and updates rax with the value located in memory if the +// cmpxchg fails. we supply a register for the old value explicitly + +// the aarch32 load linked/store conditional instructions do not +// accept an offset. so, unlike x86, we must provide a plain register +// to identify the memory word to be compared/exchanged rather than a +// register+offset Address. + +void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp wil be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare + // ensures we don't return a stale value after a failed write. 
+ b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, + Label &succeed, Label *fail) { + assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); + cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); +} + +void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp will be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare; this + // ensures we don't return a stale value after a failed write. + b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +#ifndef PRODUCT +extern "C" void findpc(intptr_t x); +#endif + +void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[]) +{ + print_unseen_bytecodes(); + // In order to get locks to work, we need to fake an in_VM state + if (ShowMessageBoxOnError) { + JavaThread* thread = JavaThread::current(); + JavaThreadState saved_state = thread->thread_state(); + thread->set_thread_state(_thread_in_vm); +#ifndef PRODUCT + if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { + ttyLocker ttyl; + BytecodeCounter::print(); + } +#endif + if (os::message_box(msg, "Execution stopped, print registers?")) { + ttyLocker ttyl; + tty->print_cr(" pc = 0x%016x", pc); +#ifndef PRODUCT + tty->cr(); + findpc(pc); + tty->cr(); +#endif + tty->print_cr("THIS IS WRONG!"); + tty->print_cr(" r0 = 0x%016x", regs[0]); + tty->print_cr(" r1 = 0x%016x", regs[1]); + tty->print_cr(" r2 = 0x%016x", regs[2]); + tty->print_cr(" r3 = 0x%016x", regs[3]); + tty->print_cr(" r4 = 0x%016x", regs[4]); + tty->print_cr(" r5 = 0x%016x", regs[5]); + tty->print_cr(" r6 = 0x%016x", regs[6]); + tty->print_cr(" r7 = 0x%016x", regs[7]); + tty->print_cr(" r8 = 0x%016x", regs[8]); + tty->print_cr(" r9 = 0x%016x", regs[9]); + tty->print_cr("r10 = 0x%016x", regs[10]); + tty->print_cr("r11 = 0x%016x", regs[11]); + tty->print_cr("r12 = 0x%016x", regs[12]); + tty->print_cr("r13 = 0x%016x", regs[13]); + tty->print_cr("r14 = 0x%016x", regs[14]); + tty->print_cr("r15 = 0x%016x", regs[15]); + BREAKPOINT; + } + ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); + } else { + { + ttyLocker ttyl; + ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg); + ::tty->print_cr(" r0 [ arg0 ] = 0x%08x", regs[1]); + ::tty->print_cr(" r1 [ arg1 ] = 0x%08x", regs[2]); + ::tty->print_cr(" r2 [ arg2 ] = 0x%08x", regs[3]); + ::tty->print_cr(" r3 [ arg3 ] = 0x%08x", regs[4]); + ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]); + ::tty->print_cr(" r5 [ rbcp ] = 0x%08x", regs[6]); + ::tty->print_cr(" r6 [ rlocals ] = 0x%08x", regs[7]); + ::tty->print_cr(" r7 [ rcpool ] = 0x%08x", regs[8]); + ::tty->print_cr(" r8 [ rmethod ] = 0x%08x", regs[9]); + ::tty->print_cr(" r9 [ rscratch1 ] =
0x%08x", regs[10]); + ::tty->print_cr("r10 [ rthread ] = 0x%08x", regs[11]); + ::tty->print_cr("r11 [ rfp ] = 0x%08x", regs[12]); + ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]); + ::tty->print_cr("r13 [ sp ] = 0x%08x", regs[0]); + ::tty->print_cr("r14 [ lr ] = 0x%08x", regs[14]); + ::tty->print_cr("r15 [ pc ] = 0x%08x", pc); + } + assert(false, "DEBUG MESSAGE: %s", msg); + } +} + +void MacroAssembler::push_call_clobbered_registers() { + push(RegSet::range(r0, r3), sp); + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vstmdb_f64(sp, (1 << nfloat/2) - 1); + } +} + +void MacroAssembler::pop_call_clobbered_registers() { + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vldmia_f64(sp, (1 << nfloat/2) - 1); + } + pop(RegSet::range(r0, r3), sp); +} + +void MacroAssembler::push_CPU_state() { + // if fix this, update also RegisterSaved::save_live_registers and it's map + push(0x5fff, sp); // integer registers except sp & (aarch32 pc) + + if(hasFPU()) { + const int nfloat = FPUStateSizeInWords / 2; // saved by pairs + vstmdb_f64(sp, (1 << nfloat) - 1); + } else { + sub(sp, sp, FPUStateSizeInWords * wordSize); + } +} + +void MacroAssembler::pop_CPU_state() { + if(hasFPU()) { + const int nfloat = FloatRegisterImpl::number_of_registers / 2; + vldmia_f64(sp, (1 << nfloat) - 1); + } else { + add(sp, sp, FPUStateSizeInWords * wordSize); + } + + pop(0x5fff, sp); // integer registers except sp & (aarch32 pc) +} + +// appears this needs to round up! +void MacroAssembler::round_to(Register reg, int modulus) { + // from x86 + add(reg, reg, modulus - 1); + bic(reg, reg, modulus - 1); // and( reg, -modulus) +} + +SkipIfEqual::SkipIfEqual( + MacroAssembler* masm, const bool* flag_addr, bool value) { + _masm = masm; + _masm->mov(rscratch1, ExternalAddress((address)flag_addr)); + _masm->ldrb(rscratch1, rscratch1); + _masm->cmp(rscratch1, 0); + _masm->b(_label, value ? Assembler::NE : Assembler::EQ); +} + +SkipIfEqual::~SkipIfEqual() { + _masm->bind(_label); +} + +void MacroAssembler::cmpptr(Register src1, Address src2) { + mov(rscratch1, src2); + ldr(rscratch1, Address(rscratch1)); + cmp(src1, rscratch1); +} + +void MacroAssembler::cmpoop(Register obj1, Register obj2) { + BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->obj_equals(this, obj1, obj2); +} + +void MacroAssembler::load_klass(Register dst, Register src) { + ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); +} + +// ((OopHandle)result).resolve(); +void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { + // OopHandle::resolve is an indirection. 
+ access_load_word_at(T_OBJECT, IN_NATIVE, result, Address(result), tmp, noreg); +} + +void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { + const int mirror_offset = in_bytes(Klass::java_mirror_offset()); + ldr(dst, Address(method, Method::const_offset())); + ldr(dst, Address(dst, ConstMethod::constants_offset())); + ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); + ldr(dst, Address(dst, mirror_offset)); + resolve_oop_handle(dst, tmp); +} + +void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { + ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); + cmp(trial_klass, tmp); +} + +void MacroAssembler::load_prototype_header(Register dst, Register src) { + load_klass(dst, src); + ldr(dst, Address(dst, Klass::prototype_header_offset())); +} + +void MacroAssembler::store_klass(Register dst, Register src) { + str(src, Address(dst, oopDesc::klass_offset_in_bytes())); +} + +void MacroAssembler::store_klass_gap(Register dst, Register src) { } + +void MacroAssembler::access_load_word_at(BasicType type, DecoratorSet decorators, + Register dst, Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_word_at(BasicType type, DecoratorSet decorators, + Address dst, Register src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_load_tos_at(BasicType type, DecoratorSet decorators, + Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } else { + bs->load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_tos_at(BasicType type, DecoratorSet decorators, + Address dst, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } else { + bs->store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } +} + +void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators,
dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_store_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +// Used for storing NULLs. +void MacroAssembler::store_heap_oop_null(Address dst, Register tmp) { + access_store_word_at(T_OBJECT, IN_HEAP, dst, noreg, tmp, noreg); +} + +Address MacroAssembler::allocate_metadata_address(Metadata* obj) { + assert(oop_recorder() != NULL, "this assembler needs a Recorder"); + int index = oop_recorder()->allocate_metadata_index(obj); + RelocationHolder rspec = metadata_Relocation::spec(index); + return Address((address)obj, rspec); +} + +// Move an oop into a register. immediate is true if we want +// immediate instructions, i.e. we are not going to patch this +// instruction while the code is being executed by another thread. In +// that case we can use move immediates rather than the constant pool. +void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_oop_index(obj); + } else { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); + } +#endif + oop_index = oop_recorder()->find_index(obj); + } + if (! immediate) { + far_load_oop(dst, oop_index); + } else { + RelocationHolder rspec = oop_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); + } +} + +// Move a metadata address into a register. +void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_metadata_index(obj); + } else { + oop_index = oop_recorder()->find_index(obj); + } + RelocationHolder rspec = metadata_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); +} + +void MacroAssembler::far_load(Register dst, address addr) { + address far_load_addr = pc(); + add(dst, r15_pc, 0); + ldr(dst, Address(dst)); + + NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr; + far_load->set_data_addr((intptr_t*) addr); +} + +void MacroAssembler::far_load_oop(Register dst, int oop_index) { + relocate(oop_Relocation::spec(oop_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_metadata(Register dst, int metadata_index) { + relocate(metadata_Relocation::spec(metadata_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_const(Register dst, address const_addr) { + relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); + far_load(dst, const_addr); +} + +Address MacroAssembler::constant_oop_address(jobject obj) { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); + } +#endif + int oop_index = oop_recorder()->find_index(obj); + return Address((address)obj, oop_Relocation::spec(oop_index)); +} + +// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
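+ // A typical fast path looks like (a sketch; the label and size are illustrative): + // Label slow_case; + // tlab_allocate(obj, noreg, instance_size, t1, t2, slow_case); + // // falls through with obj set; branches to slow_case when the TLAB is exhausted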
+void MacroAssembler::tlab_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Register t2, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); +} + +// Defines obj, preserves var_size_in_bytes +void MacroAssembler::eden_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); +} + +// Zero words; len is in bytes +// Destroys all registers except addr +// len must be a nonzero multiple of wordSize +void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { + assert_different_registers(addr, len, t1, rscratch1, rscratch2); + +#ifdef ASSERT + { Label L; + tst(len, BytesPerWord - 1); + b(L, Assembler::EQ); + stop("len is not a multiple of BytesPerWord"); + bind(L); + } +#endif + +#ifndef PRODUCT + block_comment("zero memory"); +#endif + + Label loop; + Label entry; + +// Algorithm: +// +// scratch1 = cnt & 7; +// cnt -= scratch1; +// p += scratch1; +// switch (scratch1) { +// do { +// cnt -= 8; +// p[-8] = 0; +// case 7: +// p[-7] = 0; +// case 6: +// p[-6] = 0; +// // ... +// case 1: +// p[-1] = 0; +// case 0: +// p += 8; +// } while (cnt); +// } + + const int unroll = 8; // Number of str instructions we'll unroll + + lsr(len, len, LogBytesPerWord); + andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll + sub(len, len, rscratch1); // cnt -= cnt % unroll + // t1 always points to the end of the region we're about to zero + add(t1, addr, rscratch1, lsl(LogBytesPerWord)); + adr(rscratch2, entry); + sub(rscratch2, rscratch2, rscratch1, lsl(2)); + mov(rscratch1, 0); + b(rscratch2); + bind(loop); + sub(len, len, unroll); + for (int i = -unroll; i < 0; i++) + str(rscratch1, Address(t1, i * wordSize)); + bind(entry); + add(t1, t1, unroll * wordSize); + cbnz(len, loop); +} + +void MacroAssembler::verify_tlab() { +#ifdef ASSERT + if (UseTLAB && VerifyOops) { + Label next, ok; + + strd(rscratch2, rscratch1, Address(pre(sp, -16))); + + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); + cmp(rscratch2, rscratch1); + b(next, Assembler::HS); + STOP("assert(top >= start)"); + should_not_reach_here(); + + bind(next); + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + cmp(rscratch2, rscratch1); + b(ok, Assembler::HS); + STOP("assert(top <= end)"); + should_not_reach_here(); + + bind(ok); + ldrd(rscratch2, rscratch1, Address(post(sp, 16))); + } +#endif +} + +// Writes to stack successive pages until offset reached to check for +// stack overflow + shadow pages. This clobbers tmp. +void MacroAssembler::bang_stack_size(Register size, Register tmp) { + assert_different_registers(tmp, size, rscratch1); + mov(tmp, sp); + // Bang stack for total size given plus shadow page size. + // Bang one page at a time because large size can bang beyond yellow and + // red zones. + Label loop; + mov(rscratch1, os::vm_page_size()); + bind(loop); + lea(tmp, Address(tmp, -os::vm_page_size())); + subs(size, size, rscratch1); + str(size, Address(tmp)); + b(loop, Assembler::GT); + + // Bang down shadow pages too.
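+ // Touching every page of the shadow zone now forces a stack overflow to + // surface here, at a well-defined point, rather than at some arbitrary + // instruction after a frame has been set up below the banged region.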
+ // At this point, (tmp-0) is the last address touched, so don't + // touch it again. (It was touched as (tmp-pagesize) but then tmp + // was post-decremented.) Skip this address by pre-decrementing tmp + // before each store, and touch a few more pages below. N.B. It is + // important to touch all the way down to the end of the shadow zone. + for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { + // this could be any sized move but it can serve as a debugging crumb, + // so the bigger the better. + lea(tmp, Address(tmp, -os::vm_page_size())); + str(size, Address(tmp)); + } +} + + +// Move the address of the polling page into dest. +void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(dest, Address(rthread, Thread::polling_page_offset())); + } else { + mov(dest, Address(page, rtype)); + } +} + +// Move the address of the polling page into r, then read the polling +// page. +address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { + get_polling_page(r, page, rtype); + return read_polling_page(r, rtype); +} + +address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { + InstructionMark im(this); + code_section()->relocate(inst_mark(), rtype); + // It's ok to load to reg from reg + off (without write-back) + ldr(r, Address(r, 0)); + return inst_mark(); +} + +// Helper functions for 64-bit multiplication, division and remainder +// does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm> +void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) { + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh); +} + +// does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm> +void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) { + assert_different_registers(Rn, Rnh); + assert_different_registers(Rm, Rmh); + assert_different_registers(Rd, Rdh); // umull restriction + const Register t = rscratch1; + + mul(t, Rm, Rnh); + mla(t, Rn, Rmh, t); + umull(Rd, Rdh, Rm, Rn); + add(Rdh, t, Rdh); +} + + +int64_t internal_ldiv(int64_t a, int64_t b) { + return a / b; +} + +int64_t internal_lmod(int64_t a, int64_t b) { + return a % b; +} + +void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) { + Register cnt = rscratch1; + Register mod = rscratch2; + Register sign = r14; + assert_different_registers(num, den, rscratch1, rscratch2, r14); + + // FIXME This works by first converting any negative values to positive ones, however + // it is not possible to express |INT_MIN|. Need to fix this + + // Convert to positive values + mov(sign, 0); + + cmp(num, 0); + mov(sign, 1, MI); + rsb(num, num, 0, MI); + + cmp(den, 0); + if(!want_mod) eor(sign, sign, 1, MI); + rsb(den, den, 0, MI); + + // Algorithm from + // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt + // Graeme Williams + mov(cnt, 28); + mov(mod, num, lsr(4)); + cmp(den, mod, lsr(12)); + sub(cnt, cnt, 16, Assembler::LE); + mov(mod, mod, lsr(16), Assembler::LE); + cmp(den, mod, lsr(4)); + sub(cnt, cnt, 8, Assembler::LE); + mov(mod, mod, lsr(8), Assembler::LE); + cmp(den, mod); + sub(cnt, cnt, 4, Assembler::LE); + mov(mod, mod, lsr(4), Assembler::LE); + mov(num, num, lsl(cnt)); + rsb(den, den, 0); + + adds(num, num, num); + // Now skip over cnt copies of the 3-instruction loop.
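+ // The two adds below form a computed branch: cnt is first scaled to the + // number of instructions to skip (3 per iteration, 4 bytes each), and since + // reading r15 yields the address of the current instruction plus 8, the + // mov(r0, r0) is the padding swallowed by that +8 before execution lands + // cnt iterations into the unrolled loop below.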
+ add(cnt, cnt, cnt, lsl(1)); + add(r15_pc, r15_pc, cnt, lsl(2)); + mov(r0, r0); + + for(int i = 0; i < 32; i++) { + adcs(mod, den, mod, lsl(1)); + sub(mod, mod, den, Assembler::LO); + adcs(num, num, num); + } + + cmp(sign, 0); + rsb(res, want_mod? mod : num, 0, NE); + mov(res, want_mod? mod : num, EQ); +} + + +// <Rd> = <Rn> / <Rm> (width == 32) +// <Rd> = <Rn> % <Rm> (width == 32) +// <Rdh:Rd> = <Rnh:Rn> / <Rmh:Rm> (width == 64) +// <Rdh:Rd> = <Rnh:Rn> % <Rmh:Rm> (width == 64) +void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) { + // Dispatch to the best possible implementation + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + assert(32 == width || 64 == width, "Invalid width"); + bool is64b = 64 == width; + + if(is64b) { + assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2); + } + + if(!is64b && VM_Version::features() & FT_HW_DIVIDE) { + // Emit a hw instruction sequence. + if(want_remainder) { + sdiv(rscratch1, Rn, Rm); + mls(Rd, rscratch1, Rm, Rn); + } else { + sdiv(Rd, Rn, Rm); + } + } else if(!is64b) { + // Fall back to assembly software routine + divide32(Rd, Rn, Rm, want_remainder); + } else { + // Fall back to C software routine for + // 64 bit divide/mod + if(Rn != r0) { + mov(rscratch1, Rm); + mov(rscratch2, Rmh); + + mov(r0, Rn); + mov(r1, Rnh); + + mov(r2, rscratch1); + mov(r3, rscratch2); + } else if(Rm != r2) { + mov(r2, Rm); + mov(r3, Rmh); + } + address function; + if(want_remainder) function = (address)internal_lmod; + else function = (address)internal_ldiv; + + mov(rscratch1, function); + bl(rscratch1); + if(Rd != r0) { + mov(Rd, r0); + if(is64b) mov(Rdh, r1); + } + } +} + +void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) { + assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width"); + // Dispatch to the best sequence + if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) { + // Can use extend X + switch(width){ + case 8: uxtb(dest, source, ror(lsb)); break; + case 16: uxth(dest, source, ror(lsb)); break; + case 32: if(dest != source) mov(dest, source); break; // full word, lsb == 0 + default: break; + } + } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + ubfx(dest, source, lsb, width); + } else { + // Do two shifts + lsl(dest, source, 32 - (width + lsb)); + lsr(dest, dest, 32 - width); + } +} + + +void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + if(VM_Version::features() & FT_SINGLE_CORE) { + ldrd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { +#ifdef ASSERT + Label lbl; + tst(Rbase, 7); + b(lbl, EQ); + stop("atomic_ldrd is not doubleword aligned!"); + bind(lbl); +#endif // ASSERT + + ldrexd(Rt, Rbase); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword load.\n"); + warning_printed = true; + } + ldrd(Rt, Rbase); + } +} + +void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase, + Register temp, Register temp2) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + assert((Register) (temp + 1) == temp2, "Must be contiguous"); + assert_different_registers(temp, Rt, Rbase, temp2); + if(VM_Version::features() & FT_SINGLE_CORE) { + strd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { + // First need to gain exclusive
access + Label retry; + +#ifdef ASSERT + tst(Rbase, 7); + b(retry, EQ); + stop("atomic_strd is not doubleword aligned!"); +#endif // ASSERT + + bind(retry); + ldrexd(temp, Rbase); + strexd(temp, Rt, Rbase); + cmp(temp, 0); + b(retry, NE); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword store.\n"); + warning_printed = true; + } + strd(Rt, Rbase); + } +} + + +#define ENABLE_DEBUGGING 0 +// Helloworld is 2,482,397 +uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L; + +uint32_t MacroAssembler::bytecodes_executed = 0; + +int MacroAssembler::enable_debug = 0; +int MacroAssembler::enable_method_debug = 0; +int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING; + +#define N_J_BYTECODES 238 +const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0", +"lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w", +"iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2", +"lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2", +"aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore", +"dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0", +"fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3", +"iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1", +"dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul", +"lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg", +"ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f", +"i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg", +"dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge", +"ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn", +"lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield", +"invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray", +"anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide", +"multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield", +"fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield", +"fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield", +"fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0", +"fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch", +"fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "nofast_getfield", "nofast_putfield", +"nofast_aload_0", "nofast_iload", 
"INVALID"}; + +int bytecodes_seen[256]; + +void MacroAssembler::init_unseen_bytecodes() { + for(int i = 0; i < 256; i++ ) { + bytecodes_seen[i] = 0; + } +} + +void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) { + if(ENABLE_DEBUGGING) { + mov(scratch, (address)bytecodes_seen); + add(scratch, scratch, bc_reg, lsl(2)); + add(bc_reg, bc_reg, 1); + str(bc_reg, Address(scratch)); + sub(bc_reg, bc_reg, 1); + } +} + +void MacroAssembler::print_unseen_bytecodes() { + if(ENABLE_DEBUGGING) { + printf("=== Unseen bytecodes ===\n"); + for(int i = 0; i < N_J_BYTECODES; i++) { + if(0 == bytecodes_seen[i]) { + printf("\t%s\n", j_bytecodes[i]); + } + } + printf("=== End unseen ===\n"); + } else { + printf("Not kept track, enable debugging to view info\n"); + } + fflush(stdout); +} + +int machine_state_regset = 0b0101111111111111; +int machine_state_float_regset = 0b11; + +void MacroAssembler::save_machine_state() { + stmdb(sp, machine_state_regset); + if(hasFPU()) { + vstmdb_f64(sp, machine_state_float_regset); + } + enter(); +} + +void MacroAssembler::restore_machine_state() { + leave(); + if(hasFPU()) { + vldmia_f64(sp, machine_state_float_regset); + } + ldmia(sp, machine_state_regset); +} + +void internal_internal_printf(const char *fmt, ...) { + va_list args; + va_start (args, fmt); + vprintf (fmt, args); + fflush(stdout); + va_end(args); +} + +void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) { + char buf[2048]; + char fmt[2048]; + buf[0] = '\0'; + const char *thread_str = "THREAD 0x%08x : "; + int id = pthread_self(); + strcpy(fmt, format); + + char *str = strtok(fmt, "\n"); + int nreplace = 0; + while(str) { + strcpy(buf, thread_str); + strcat(buf, str); + strcat(buf, "\n"); + internal_internal_printf((const char*)buf, id, a, b, c); + str = strtok(NULL, "\n"); + } +} + +void MacroAssembler::get_bytecode(Register dst, Register bc) { + if(ENABLE_DEBUGGING) { + int nbytecodes = N_J_BYTECODES; + mov(dst, (address)j_bytecodes); + cmp(bc, nbytecodes); + + ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT); + ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE); + } +} + +int invocation_depth_count = -1; //TODO remove this with debugging info + +#define MAX_FCALL_DEPTH 4096 +struct thread_method_record{ + int thread_id; + char names[MAX_FCALL_DEPTH][512]; + int invocation_depth_count; +}; +int ntmrs = 0; +#define MAX_TMRS 10 +thread_method_record tmr_list[MAX_TMRS]; + +void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + // Add a new frame + if(tmr->invocation_depth_count >= -1 && + tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) { + *invocation_depth_count = ++(tmr->invocation_depth_count); + *name = tmr->names[tmr->invocation_depth_count]; + meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512); + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + // Add a new thread + if(ntmrs >= MAX_TMRS) { + fprintf(stderr, "Too many tmrs\n"); + exit(1); + } + //Create a new tmr + tmr_list[ntmrs].thread_id = id; + tmr_list[ntmrs].invocation_depth_count = 0; + meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512); + *invocation_depth_count = 0; + *name = tmr_list[ntmrs].names[0]; + ntmrs++; +} + +void pop_tmr(int *thread_id, int 
*invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + if(tmr->invocation_depth_count >= 0 && + tmr->invocation_depth_count < MAX_FCALL_DEPTH) { + // Pop frame + *name = tmr->names[tmr->invocation_depth_count]; + *invocation_depth_count = (tmr->invocation_depth_count)--; + return; + } else if ( -1 == tmr->invocation_depth_count) { + *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)"; + *invocation_depth_count = 0; + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + fprintf(stderr, "Unable to find suitable tmr\n"); + exit(1); +} + +void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) { + sprintf(buf, "THREAD 0x%08x : ", id); + for(int i = 0; i < invocation_depth_count; i++) { + strcat(buf, " "); + } +} + + +void print_entry(Method *meth, int native) { + char *name; + int invocation_depth_count, id; + push_tmr(meth, &id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + if(native) { + sprintf(buf_b, "CALL NATIVE : %s\n", name); + } else { + sprintf(buf_b, "CALL JAVA : %s\n", name); + } + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void print_exit(bool normal) { + char *name; + int invocation_depth_count, id; + pop_tmr(&id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + sprintf(buf_b, normal ? "EXIT : %s\n" : "EXCPN EXIT : %s\n", name); + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void MacroAssembler::print_method_entry(Register rmethod, bool native) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_entry); + mov(r0, rmethod); + mov(r1, native); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::print_method_exit(bool normal) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_exit); + mov(r0, normal); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) { + if(ENABLE_DEBUGGING) { + Label skip; + save_machine_state(); + + mov(rscratch1, ra); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rb); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rc); + str(rscratch1, Address(pre(sp, -wordSize))); + + if(!important) { + mov(r0, (address)&enable_debug); + ldr(r0, Address(r0)); + cmp(r0, 0); + b(skip, Assembler::EQ); + } + + int sp_difference = wordSize * (count_bits(machine_state_regset) + + 2 * count_bits(machine_state_float_regset) + + 2 + 3); //Frame entry and saved + + mov(r0, (address)fmt); + if(ra != sp) ldr(r1, Address(sp, 2 * wordSize)); + else add(r1, sp, sp_difference); + + if(rb != sp) ldr(r2, Address(sp, wordSize)); + else add(r2, sp, sp_difference); + + if(rc != sp) ldr(r3, Address(sp)); + else add(r3, sp, sp_difference); + + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address)internal_printf); + bl(rscratch2); + + bind(skip); + restore_machine_state(); + } +} + +void 
MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(false, fmt, ra, rb, rc); +} + +void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(true, fmt, ra, rb, rc); +} + +// When debugging, set the break on bkpnt +void bkpnt() { return; } +void MacroAssembler::create_breakpoint() { + if(ENABLE_DEBUGGING) { + save_machine_state(); + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address) bkpnt); + bl(rscratch2); + + restore_machine_state(); + } +} + + +void MacroAssembler::print_cpool(InstanceKlass *klass) { + ttyLocker ttyl; + klass->constants()->print_on(tty); +} + +int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a ldrd */ + ldrd(Rt, adr, cond); + return 0x0; + } else { + return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm, + &Assembler::ldr, Rtmp, cond); + } +} + +int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a strd */ + strd(Rt, adr, cond); + } else { + double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond); + } + return 0x0; +} + +int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Register Rtmp, Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a load or store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else if (!adr.uses(Rt)) { + double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond); + } else { + // need to reshuffle operation, otherwise write to Rt destroys adr + if (adr.get_mode() != Address::reg) { + // offset-based addressing. hence Rt2 could not be used by adr + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond); + (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing. both Rt and Rt2 could be used by adr, + // hence a temp register is necessary + adr.lea(this, Rtmp); + double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond); + // adr.lea does only address manipulation and cannot cause a trap.
+ // the first instruction that can cause an NPE is in double_ldst_failed_dispatch, + // so shift the reported offset appropriately + return 0x4; + } + } + return 0x0; +} + +void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else { + if (adr.get_mode() != Address::reg) { + // offset-based addressing + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing + if (adr.get_wb_mode() == Address::pre) { + // current implementation does not use Address::pre for indexed access + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::post) { + // current implementation does not use Address::post for indexed access + // enable the code below and implement proper post() method if it is required +#if 0 + (this->*sgl)(Rt, Address(post(adr.base(), wordSize)), cond); + (this->*sgl)(Rt2, Address(post(adr.base(), adr.index(), adr.shift())), cond); + sub(adr.base(), wordSize, cond); +#endif + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + compensate_addr_offset(adr, cond); + } else { + ShouldNotReachHere(); + } + } + } +} + +#ifdef ASSERT +void MacroAssembler::verify_stack_alignment() { + if (StackAlignmentInBytes > 4) { + Label x; + tst(sp, StackAlignmentInBytes-1); + b(x, EQ); + stop("stack unaligned"); + bind(x); + } +} +#endif + +/** + * Code for BigInteger::multiplyToLen() intrinsic. + * + * r0: x + * r1: xlen + * r2: y + * r3: ylen + * r4: z + * r5: zlen + * + */ +void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, + Register z, Register zlen, + Register tmp1, Register tmp2, Register tmp3, Register tmp4, + Register tmp5, Register tmp6) { + + assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); + + const Register xc = xlen; + const Register yc = tmp1; + const Register zc = tmp2; + + const Register vz = tmp3; + const Register carry = tmp4; + const Register vx = tmp5; + const Register vy = tmp6; + + // ensure y (inner cycle) is shorter than x (outer cycle); in theory this uses the CPU caches more effectively + Label L_x_longer; + cmp(xlen, ylen); + b(L_x_longer, Assembler::GE); +#define SWP(X, Y) \ + mov(tmp1, Y); \ + mov(Y, X); \ + mov(X, tmp1) + SWP(x, y); + SWP(xlen, ylen); + bind(L_x_longer); + + lea(xc, Address(x, xlen, lsl(LogBytesPerInt))); // x[xstart] + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[idx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[kdx] + + // First Loop.
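+ // (Both loops lean on umaal: umaal(lo, hi, a, b) computes the 64-bit value + // a*b + hi + lo and writes the low word to lo and the high word to hi - + // one full column of the schoolbook multiply, carries included.)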
+ // + // final static long LONG_MASK = 0xffffffffL; + // int xstart = xlen - 1; + // int ystart = ylen - 1; + // long carry = 0; + // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { + // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; + // z[kdx] = (int)product; + // carry = product >>> 32; + // } + // z[xstart] = (int)carry; + // + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + Label L_loop_1; + bind(L_loop_1); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + mov(vz, 0); + umaal(vz, carry, vx, vy); + str(vz, Assembler::pre(zc, -BytesPerInt)); + cmp(yc, y); + b(L_loop_1, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + + // Second and third (nested) loops. + // + // for (int i = xstart-1; i >= 0; i--) { // Second loop + // carry = 0; + // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop + // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + + // (z[k] & LONG_MASK) + carry; + // z[k] = (int)product; + // carry = product >>> 32; + // } + // z[i] = (int)carry; + // } + // + Label L_loop_2, L_loop_3; + bind(L_loop_2); + + sub(zlen, zlen, 1); + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[jdx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[k] + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + bind(L_loop_3); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + ldr(vz, Assembler::pre(zc, -BytesPerInt)); // r1 is vz, r2 is carry + umaal(vz, carry, vx, vy); + str(vz, Address(zc)); + cmp(yc, y); + b(L_loop_3, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + cmp(xc, x); + b(L_loop_2, Assembler::GT); +} + +/** + * Code for BigInteger::mulAdd() intrinsic. + * + * r0: out + * r1: in + * r2: offset + * r3: len + * r4: k + */ +void MacroAssembler::mul_add(Register out, Register in, Register offset, Register len, Register k, + Register tmp1, Register tmp2, Register tmp3) { + + assert_different_registers(out, in, offset, len, k, tmp1, tmp2, tmp3); + + Register vin = tmp1; + Register vout = tmp2; + Register carry = tmp3; + Register result = r0; + +// long kLong = k & LONG_MASK; +// long carry = 0; +// +// offset = out.length-offset - 1; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offset] & LONG_MASK) + carry; +// out[offset--] = (int)product; +// carry = product >>> 32; +// } +// return (int)carry; + + lea(in, Address(in, len, lsl(LogBytesPerInt))); + lea(out, Address(out, offset, lsl(LogBytesPerInt))); + mov(carry, 0); + + Label L_loop; + bind(L_loop); + ldr(vin, Assembler::pre(in, -BytesPerInt)); + ldr(vout, Assembler::pre(out, -BytesPerInt)); + umaal(vout, carry, vin, k); + str(vout, Address(out)); + subs(len, len, 1); + b(L_loop, Assembler::GT); + + mov(result, carry); +} + +/** + * Emits code to update CRC-32 with a byte value according to constants in table + * + * @param [in,out]crc Register containing the crc. + * @param [in]val Register containing the byte to fold into the CRC. + * @param [in]table Register containing the table of crc constants. + * + * uint32_t crc; + * val = crc_table[(val ^ crc) & 0xFF]; + * crc = val ^ (crc >> 8); + * + */ +void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { + eor(val, val, crc); + andr(val, val, 0xff); + ldr(val, Address(table, val, lsl(2))); + eor(crc, val, crc, Assembler::lsr(8)); +} + +/** + * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 + * + * @param [in,out]crc Register containing the crc.
+ * @param [in]v Register containing the 32-bit to fold into the CRC. + * @param [in]table0 Register containing table 0 of crc constants. + * @param [in]table1 Register containing table 1 of crc constants. + * @param [in]table2 Register containing table 2 of crc constants. + * @param [in]table3 Register containing table 3 of crc constants. + * + * uint32_t crc; + * v = crc ^ v + * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] + * + */ +void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, + Register tmp2, Register table0, Register table1, Register table2, Register table3) { + eor(v, crc, v); + uxtb(tmp, v); + uxtb(tmp2, v, ror(8)); + ldr(crc, Address(table3, tmp, lsl(2))); + ldr(tmp2, Address(table2, tmp2, lsl(2))); + uxtb(tmp, v, ror(16)); + eor(crc, crc, tmp2); + uxtb(tmp2, v, ror(24)); + ldr(tmp, Address(table1, tmp, lsl(2))); + ldr(tmp2, Address(table0, tmp2, lsl(2))); + eor(crc, crc, tmp); + eor(crc, crc, tmp2); +} + +/** + * @param crc register containing existing CRC (32-bit) + * @param buf register pointing to input byte buffer (byte*) + * @param len register containing number of bytes + * @param table register that will contain address of CRC table + * @param tmp scratch register + */ +void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, + Register table0, Register table1, Register table2, Register table3, + Register tmp, Register tmp2, Register tmp3, int is_crc32c) { + Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit; + + if (!is_crc32c) + inv(crc, crc); + if (UseCRC32) { + Label CRC_by4_loop, CRC_by1_loop; + + subs(len, len, 4); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(CRC_by1_loop, Assembler::GT); + b(L_exit); + + BIND(CRC_by4_loop); + ldr(tmp, Address(post(buf, 4))); + subs(len, len, 4); + if (!is_crc32c) + crc32w(crc, crc, tmp); + else // is_crc32c + crc32cw(crc, crc, tmp); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(L_exit, Assembler::LE); + BIND(CRC_by1_loop); + ldrb(tmp, Address(post(buf, 1))); + subs(len, len, 1); + if (!is_crc32c) + crc32b(crc, crc, tmp); + else // is_crc32c + crc32cb(crc, crc, tmp); + b(CRC_by1_loop, Assembler::GT); + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); + return; + } + lea(table0, ExternalAddress( + !is_crc32c ? 
+ StubRoutines::crc_table_addr() : + StubRoutines::crc32c_table_addr() )); + add(table1, table0, 1*256*sizeof(juint)); + add(table2, table0, 2*256*sizeof(juint)); + add(table3, table0, 3*256*sizeof(juint)); + + BIND(L_align_by1_loop); + tst(buf, 3); + b(L_align_exit, Assembler::EQ); + cmp(len, 0); + b(L_exit, Assembler::EQ); + sub(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_align_by1_loop); + + BIND(L_align_exit); + + if(VM_Version::features() & FT_AdvSIMD) { + if (UseNeon) { + cmp(len, 32+12); // account for possible need for alignment + b(L_cpu, Assembler::LT); + + Label L_fold, L_align_by4_loop, L_align_by4_exit; + + BIND(L_align_by4_loop); + tst(buf, 0xf); + b(L_align_by4_exit, Assembler::EQ); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + sub(len, len, 4); + b(L_align_by4_loop); + + BIND(L_align_by4_exit); + + add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants + + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64); + veor_64(d16, d16, d16); + vmov_32(d16, 0, crc); + + veor_64(d0, d0, d16); + sub(len, len, 32); + + BIND(L_fold); + vmullp_8(q8, d0, d5); + vmullp_8(q9, d0, d7); + vmullp_8(q10, d0, d4); + vmullp_8(q11, d0, d6); + + vmullp_8(q12, d1, d5); + vmullp_8(q13, d1, d7); + vmullp_8(q14, d1, d4); + vmullp_8(q15, d1, d6); + + vuzp_128_16(q9, q8); + veor_128(q8, q8, q9); + + vuzp_128_16(q13, q12); + veor_128(q12, q12, q13); + + vshll_16u(q9, d16, 8); + vshll_16u(q8, d17, 8); + + vshll_16u(q13, d24, 8); + vshll_16u(q12, d25, 8); + + veor_128(q8, q8, q10); + veor_128(q12, q12, q14); + veor_128(q9, q9, q11); + veor_128(q13, q13, q15); + + veor_64(d19, d19, d18); + veor_64(d18, d27, d26); + + vshll_32u(q13, d18, 16); + vshll_32u(q9, d19, 16); + + veor_128(q9, q8, q9); + veor_128(q13, q12, q13); + + veor_64(d31, d26, d27); + veor_64(d30, d18, d19); + + vshl_128_64(q15, q15, 1); + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + veor_128(q0, q0, q15); + + subs(len, len, 16); + b(L_fold, Assembler::GE); + + vmov_32(tmp, d0, 0); + mov(crc, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d0, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + + add(len, len, 16); + } + } // if FT_AdvSIMD + + BIND(L_cpu); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_by1_loop, Assembler::GT); + b(L_exit); + + BIND(L_by8_loop); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_exit, Assembler::LE); + BIND(L_by1_loop); + subs(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_by1_loop, Assembler::GT); + + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); +} + +/** + * First round Key (cpu implementation) + * @param in register containing address of input data (plain or cipher text) + * 
@param key register containing address of the key data + * @param t0 output register t0 + * @param t1 output register t1 + * @param t2 output register t2 + * @param t3 output register t3 + * @param t4 temporary register + * @param t5 temporary register + * @param t6 temporary register + * @param t7 temporary register + */ +void MacroAssembler::kernel_aescrypt_firstRound(Register in, Register key, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t4, Address(post(key, 4))); + ldr(t5, Address(post(key, 4))); + ldr(t6, Address(post(key, 4))); + ldr(t7, Address(post(key, 4))); + ldr(t0, Address(post(in, 4))); + ldr(t1, Address(post(in, 4))); + ldr(t2, Address(post(in, 4))); + ldr(t3, Address(post(in, 4))); + rev(t0, t0); + rev(t1, t1); + rev(t2, t2); + rev(t3, t3); + eor(t0, t0, t4); + eor(t1, t1, t5); + eor(t2, t2, t6); + eor(t3, t3, t7); +} + +/** + * AES ECB Round + * @param table_te Register contains address of AES replacement table + * @param key register containing address of the key data + * @param t0 Register for input value t0 + * @param t1 Register for input value t1 + * @param t2 Register for input value t2 + * @param t3 Register for input value t3 + * @param a Register for output value + * @param tmp1 Temporary register 1 + * @param tmp2 Temporary register 2 + */ +void MacroAssembler::kernel_aescrypt_round(Register table_te, Register key, + Register t0, Register t1, Register t2, Register t3, + Register a, Register tmp1, Register tmp2) { + + ldr(a, Address(post(key, 4))); // K + uxtb(tmp1, t0, ror(24)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T1 + uxtb(tmp2, t1, ror(16)); + eor(a, a, tmp1); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T2 + uxtb(tmp1, t2, ror(8)); + eor(a, a, tmp2, ror(8)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T3 + uxtb(tmp2, t3); + eor(a, a, tmp1, ror(16)); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T4 + eor(a, a, tmp2, ror(24)); // a0 +}; + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound( + Register table_te, Register key, Register to, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t7, Address(post(key, 4))); // tt + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + eor(t4, t4, t7, lsr(24)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + eor(t6, t6, t7, lsr(16)); + uxtb(t6, t6); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + eor(t5, t5, t7, lsr(8)); + uxtb(t5, t5); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + eor(t6, t6, t7); + uxtb(t6, t6); + add(t4, t4, t6, lsl(24)); + + str(t4, Address(post(to, 4))); + +} + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = 
K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound_cbc( + Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + add(t4, t4, t6, lsl(24)); +} + +/** + * AES ECB encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_encryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t0, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t2); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t1, t2, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t2, t3, t0, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t3, t0, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t0, t1, t2, + t4, t5, t6, t7); +} + +/** + * AES ECB decryption + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_decryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + push(key, 
sp); + + add(key, key, 16); + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t2, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t0); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + pop(key, sp); + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t3, t2, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t0, t3, t2, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t1, t0, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t2, t1, t0, + t4, t5, t6, t7); +} + +/** + * AES CBC encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_encrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + sub(keylen, keylen, 8); + + add(t4, key, keylen, lsl(2)); + vld1_64(d8, d9, Address(t4), Assembler::ALIGN_STD); // read last key bytes to q4 + vrev32_128_8(q4, q4); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d0, d1, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q0 + veor_128(q0, q0, q2); + vrev32_128_8(q0, q0); + veor_128(q0, q0, q1); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + push(RegSet::of(key, from), sp); + push(RegSet::of(to, keylen), sp); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t0, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t2); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, 
to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t1, t2, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t2, t3, t0, + t5, t6, from); + vmov_f64(d6, t4, t5); + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t3, t0, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t0, t1, t2, + t5, t6, from); + vmov_f64(d7, t4, t5); + veor_128(q2, q4, q3); + + pop(RegSet::of(to, keylen), sp); + sub(table_te, table_te, 4 * 256); //Te + vst1_64(d4, Address(post(to, 8)), Assembler::ALIGN_STD); + pop(RegSet::of(key, from), sp); + vst1_64(d5, Address(post(to, 8)), Assembler::ALIGN_STD); + + subs(len, len, 16); + b(L_loop2, Assembler::NE); + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/** + * AES CBC decryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_decrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d10, d11, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q5 + vrev32_128_8(q1, q1); + sub(keylen, keylen, 8); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d8, d9, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q4 + + push(RegSet::of(to, key, from, keylen), sp); + vrev32_128_8(q0, q4); + veor_128(q0, q0, q5); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t2, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t0); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t3, t2, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t0, t3, t2, + t5, t6, to); + vmov_f64(d6, t4, t5); //q3 + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t1, t0, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t2, t1, t0, + t5, t6, to); + vmov_f64(d7, t4, t5); 
//q3 + pop(RegSet::of(to, key, from, keylen), sp); + veor_128(q3, q1, q3); + veor_128(q3, q3, q2); + vshl_128_64(q2, q4, 0); + + sub(table_te, table_te, 4 * 256); //Te + + vst1_64(d6, Address(post(to, 8)), Assembler::ALIGN_STD); + subs(len, len, 16); + vst1_64(d7, Address(post(to, 8)), Assembler::ALIGN_STD); + + b(L_loop2, Assembler::NE); + + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/* + * First round of SHA1 algorithm + */ +void MacroAssembler::sha_round1(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_d, st_c, ror(32-sh)); + } else { + eor(st_f, st_d, st_c); + } + andr(st_f, st_f, st_b); + eor(st_f, st_f, st_d); +} + +/* + * Second and fourth rounds of SHA1 algorithm + */ +void MacroAssembler::sha_round2(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_b, st_c, ror(32-sh)); + } else { + eor(st_f, st_b, st_c); + } + eor(st_f, st_f, st_d); +} + +/* + * Third round of SHA1 algorithm + */ +void MacroAssembler::sha_round3(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + andr(st_f, st_b, st_c, ror(32-sh)); + orr(tmp, st_b, st_c, ror(32-sh)); + } else { + andr(st_f, st_b, st_c); + orr(tmp, st_b, st_c); + } + andr(tmp, st_d, tmp); + orr(st_f, st_f, tmp); +} + +/* + * Calculate Deltas w[i] and w[i+1] + * w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) rotl 1 + */ +void MacroAssembler::sha_w0(FloatRegister w16, FloatRegister w14, + FloatRegister w8, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, bool update) { + vadd_64_32(st_kw, st_k, w16); + if(update) { + veor_64(tmp1, w16, w14); + vext_64(tmp2, w2, w4, 4); + veor_64(tmp3, tmp1, w8); + veor_64(tmp4, tmp3, tmp2); + + vshr_64_u32(tmp1, tmp4, 31); + vshl_64_32(tmp2, tmp4, 1); + vorr_64(w16, tmp1, tmp2); + } +} +/* + * Calculate Deltas w[i] and w[i+1] + */ +void MacroAssembler::sha_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp, + bool update) { + Label L_7, L_6, L_5, L_4, L_3, L_2, L_1, L_done; + andr(rtmp, counter, 0x7); + add(counter, counter, 1); + cmp(rtmp, 7); + b(L_7, Assembler::EQ); + cmp(rtmp, 6); + b(L_6, Assembler::EQ); + cmp(rtmp, 5); + b(L_5, Assembler::EQ); + cmp(rtmp, 4); + b(L_4, Assembler::EQ); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + sha_w0(w16, w14, w8, w4, w2, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + BIND(L_1); { + sha_w0(w14, w12, w6, w2, w16, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_2); { + sha_w0(w12, w10, w4, w16, w14, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_3); { + sha_w0(w10, w8, w2, w14, w12, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_4); { + sha_w0(w8, w6, w16, w12, w10, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_5); { + sha_w0(w6, w4, w14, w10, w8, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_6); { + sha_w0(w4, w2, w12, w8, w6, tmp1, tmp2, tmp3, 
tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_7); { + sha_w0(w2, w16, w10, w6, w4, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + } + BIND(L_done); +} + +/** + * SHA1 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha_implCompress(Register from, Register state, + Register counter, Register table_k, + Register st_a, Register st_b, + Register st_c, Register st_d, Register st_e, + Register tmp, Register counter2, Register st_new_a, Register st_w) { + Label L_round_1, L_round_2, L_round_3, L_round_4, L_round_4_cont, L_hash_no_w; + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w14 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp1 = w2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp2 = wtmp1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp3 = wtmp2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp4 = wtmp3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k1 = wtmp4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k2 = st_k1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k = st_k2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_kw = st_k->successor(FloatRegisterImpl::DOUBLE); + + + assert_different_registers(st_a,st_b,st_c,st_d,st_e,tmp,counter2, st_new_a, st_w); + assert_different_registers(w2,w4,w6,w8,w10,w12,w14,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha1_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w14, w12, w10, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w6, w4, w2, Address(from), Assembler::ALIGN_STD); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w12, w12); + vrev64_128_8(w8, w8); + vrev64_128_8(w4, w4); + // load state + ldr(st_a, Address(post(state, 4))); + ldr(st_b, Address(post(state, 4))); + ldr(st_c, Address(post(state, 4))); + ldr(st_d, Address(post(state, 4))); + ldr(st_e, Address(state)); + sub(state, state, 16); + + mov(counter2, 0); + mov(counter, 10); + // first round + vld1_64(st_k1, st_k2, Address(table_k), Assembler::ALIGN_128); + vdup_64_32(st_k, st_k1, 0); + + BIND(L_round_1); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round1(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round1(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_1); + + mov(counter, 10); + // second round + vdup_64_32(st_k, st_k1, 1); + + BIND(L_round_2); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, 
st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_2); + + mov(counter, 10); + vdup_64_32(st_k, st_k2, 0); + // third round + + BIND(L_round_3); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round3(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round3(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_3); + + mov(counter, 10); + // fourth round + vdup_64_32(st_k, st_k2, 1); + + BIND(L_round_4); { + sub(counter, counter, 1); + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + b(L_round_4_cont); + BIND(L_hash_no_w); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp, false); + BIND(L_round_4_cont); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + }cbnz(counter, L_round_4); + + // load state + ldr(tmp, Address(post(state, 4))); + add(st_a, st_a, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_b, st_b, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_c, st_c, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_d, st_d, tmp); + ldr(tmp, Address(state)); + add(st_e, st_e, tmp); + sub(state, state, 16); + + // save state + str(st_a, Address(post(state, 4))); + str(st_b, Address(post(state, 4))); + str(st_c, Address(post(state, 4))); + str(st_d, Address(post(state, 4))); + str(st_e, Address(state)); +} +/** + * One iteration of SHA256 algorithm + * Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + * Ma := (a and b) xor (a and c) xor (b and c) + * t2 := Σ0 + Ma + * Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + * Ch := (e and f) xor ((not e) and g) + * t1 := h + Σ1 + Ch + k[i] + w[i] + * h := g + * g := f + * f := e + * e := d + t1 + * d := c + * c := b + * b := a + * a := t1 + t2 + */ +void MacroAssembler::sha256_implCompress_iter0( + Register Da, Register Db, Register Dc, Register Dd, + Register De, Register Df, Register Dg, Register Dh, + FloatRegister Dkw, int index, + Register Dtmp, + Register Dnew_a, Register Dnew_e + ) { + assert_different_registers(Da, Db, Dc, Dd, De, Df, Dg, Dh); + + // Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + // Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + andr(Dnew_a, Da, Db); + andr(Dnew_e, Da, Dc); + eor(Dnew_a, Dnew_a, Dnew_e); + andr(Dnew_e, Db, Dc); + eor(Dnew_e, Dnew_a, Dnew_e); //Ma + + mov(Dnew_a, Da, ror(2)); + eor(Dnew_a, 
Dnew_a, Da, ror(13)); + eor(Dnew_a, Dnew_a, Da, ror(22)); //Σ0 + + add(Dnew_a, Dnew_a, Dnew_e); //t2 + + andr(Dnew_e, De, Df); + mvn(Dtmp, De); + andr(Dtmp, Dtmp, Dg); + eor(Dtmp, Dnew_e, Dtmp); //Ch + + mov(Dnew_e, De, ror(6)); + eor(Dnew_e, Dnew_e, De, ror(11)); + eor(Dnew_e, Dnew_e, De, ror(25)); //Σ1 + + add(Dnew_e, Dnew_e, Dtmp); + vmov_32(Dtmp, Dkw, index); + add(Dnew_e, Dnew_e, Dh); + + add(Dtmp, Dnew_e, Dtmp); //t1 + + add(Dnew_e, Dtmp, Dd); //new_e + add(Dnew_a, Dtmp, Dnew_a); //new_a +}; +/** + * Four iterations of SHA256 algorithm + */ +void MacroAssembler::sha256_implCompress_iter( + Register ra, Register rb, Register rc, Register rd, + Register re, Register rf, Register rg, Register rh, + FloatRegister Dkw1, FloatRegister Dkw2, + Register step, + Register tmp, + Register ra2, Register re2 + ) { + Label L_4, L_3, L_2, L_1, L_done; + cmp(step, 4); + b(L_4, Assembler::EQ); + cmp(step, 3); + b(L_3, Assembler::EQ); + cmp(step, 2); + b(L_2, Assembler::EQ); + cmp(step, 1); + b(L_1, Assembler::EQ); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 1, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 1, tmp, rb, rf); + mov(step, 4); + b(L_done); + BIND(L_1); { + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 1, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 1, tmp, ra, re); + mov(step, 0); + b(L_done); + } + BIND(L_2); { + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 1, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 1, tmp, ra2, re2); + mov(step, 1); + b(L_done); + } + BIND(L_3); { + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 1, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 1, tmp, rd, rh); + mov(step, 2); + b(L_done); + } + BIND(L_4); { + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 1, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 1, tmp, rc, rg); + mov(step, 3); + } + BIND(L_done); +}; + + /* + * Calculate Deltas w[i] and w[i+1] + * s0 := (w[i-15] rotr 7) xor (w[i-15] rotr 18) xor (w[i-15] shr 3) + * s1 := (w[i-2] rotr 17) xor (w[i-2] rotr 19) xor (w[i-2] shr 10) + * w[i] := w[i-16] + s0 + w[i-7] + s1 + */ +void MacroAssembler::sha256_w0( + FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14, + FloatRegister w_m7, FloatRegister w_m6, + FloatRegister w_m2, + FloatRegister Qtmp_S0, FloatRegister Qtmp_S1, + FloatRegister Qtmp1){ + + vmov_64(Qtmp1, w_m15); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m14); + vshr_128_u64(Qtmp_S0, Qtmp1, 7); + vshr_128_u64(Qtmp_S1, 
Qtmp1, 18); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); + vshr_128_u64(Qtmp_S1, Qtmp1, 35); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); //S0 + + vshr_128_u64(Qtmp_S1, w_m2, 17); + vshr_128_u64(Qtmp1, w_m2, 19); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); + vshr_128_u64(Qtmp1, w_m2, 42); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); //S1 + + vmov_64(Qtmp1, w_m7); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m6); + vadd_128_32(Qtmp1, Qtmp1, w_m16); + vadd_128_32(Qtmp1, Qtmp1, Qtmp_S0); + vadd_128_32(w_m16, Qtmp1, Qtmp_S1); // w[i/i+1] + + vdup_64_32(w_m16, w_m16, 0); + vdup_64_32(w_m15, w_m15, 0); +} + +/* + * Calculate Deltas w[i] ... w[i+3] + */ +void MacroAssembler::sha256_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, + FloatRegister st_kw, Register counter, Register rtmp) { + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister Dtmp1 = as_FloatRegister(tmp1->encoding()); + FloatRegister Dtmp2 = Dtmp1->successor(FloatRegisterImpl::DOUBLE); + Label L_3, L_2, L_1, L_done; + + andr(rtmp, counter, 0x3); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + vext_64(Dtmp1, w16, w15, 4); + vext_64(Dtmp2, w14, w13, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w16, w15, w14, w7, w6, w2, tmp1, tmp2, tmp3); + sha256_w0(w14, w13, w12, w5, w4, w16, tmp1, tmp2, tmp3); + b(L_done); + BIND(L_3); { + vext_64(Dtmp1, w12, w11, 4); + vext_64(Dtmp2, w10, w9, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w12, w11, w10, w3, w2, w14, tmp1, tmp2, tmp3); + sha256_w0(w10, w9, w8, w1, w16, w12, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_2); { + vext_64(Dtmp1, w8, w7, 4); + vext_64(Dtmp2, w6, w5, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w8, w7, w6, w15, w14, w10, tmp1, tmp2, tmp3); + sha256_w0(w6, w5, w4, w13, w12, w8, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_1); { + vext_64(Dtmp1, w4, w3, 4); + vext_64(Dtmp2, w2, w1, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w4, w3, w2, w11, w10, w6, tmp1, tmp2, tmp3); + sha256_w0(w2, w1, w16, w9, w8, w4, tmp1, tmp2, tmp3); + } + BIND(L_done); +} + +/** + * SHA256 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha256_implCompress(Register from, Register state, + Register counter, Register table_k, + Register ra, Register rb, Register rc, Register rd, Register re, + Register rf, Register rg, Register rh, + Register ra2, Register re2) { + + Label L_hash_loop, L_hash_loop_done, L_hash_no_w; + lea(table_k, ExternalAddress(StubRoutines::sha256_table_addr())); + + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + // read initial 16 
W elements in q8-q11 + vld1_64(d16, d17, d18, d19, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + vld1_64(d20, d21, d22, d23, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + // revert W + vrev32_128_8(q8, q8); + vrev32_128_8(q9, q9); + vrev32_128_8(q10, q10); + vrev32_128_8(q11, q11); + + vadd_128_32(q7, q7, q8); // k + w + + vdup_64_32(d31, d23, 1); //w1 + vdup_64_32(d30, d23, 0); //w2 + vdup_64_32(d29, d22, 1); //w3 + vdup_64_32(d28, d22, 0); //w4 + vdup_64_32(d27, d21, 1); //w5 + vdup_64_32(d26, d21, 0); //w6 + vdup_64_32(d25, d20, 1); //w7 + vdup_64_32(d24, d20, 0); //w8 + vdup_64_32(d23, d19, 1); //w9 + vdup_64_32(d22, d19, 0); //w10 + vdup_64_32(d21, d18, 1); //w11 + vdup_64_32(d20, d18, 0); //w12 + vdup_64_32(d19, d17, 1); //w13 + vdup_64_32(d18, d17, 0); //w14 + vdup_64_32(d17, d16, 1); //w15 + vdup_64_32(d16, d16, 0); //w16 + + mov(counter, 16); + // load state + push(state, sp); + ldr(ra, Address(post(state, 4))); + ldr(rb, Address(post(state, 4))); + ldr(rc, Address(post(state, 4))); + ldr(rd, Address(post(state, 4))); + ldr(re, Address(post(state, 4))); + ldr(rf, Address(post(state, 4))); + ldr(rg, Address(post(state, 4))); + ldr(rh, Address(state)); + + const Register tmp = from; + const Register step = state; + + // calculate deltas + sha256_w0(d16, d17, d18, d25, d26, d30, q0, q1, q2); + sha256_w0(d18, d19, d20, d27, d28, d16, q0, q1, q2); + + mov(step, 0); // use state for internal counter + sub(counter, counter, 1); + + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + + BIND(L_hash_loop); { + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + //calculate deltas + sha256_w(q8, q9, q10, q11, q12, q13, q14, q15, + q0, q1, q2, + q7, + counter, tmp); + + //calculate state + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + sub(counter, counter, 1); + } cbnz(counter, L_hash_loop); + + pop(state, sp); + + // load initial state and add to current state + ldr(tmp, Address(post(state, 4))); + add(rb, rb, tmp); + ldr(tmp, Address(post(state, 4))); + add(rc, rc, tmp); + ldr(tmp, Address(post(state, 4))); + add(rd, rd, tmp); + ldr(tmp, Address(post(state, 4))); + add(ra2, ra2, tmp); + ldr(tmp, Address(post(state, 4))); + add(rf, rf, tmp); + ldr(tmp, Address(post(state, 4))); + add(rg, rg, tmp); + ldr(tmp, Address(post(state, 4))); + add(rh, rh, tmp); + ldr(tmp, Address(state)); + add(re2, re2, tmp); + sub(state, state, 28); + + // save state + str(rb, Address(post(state, 4))); + str(rc, Address(post(state, 4))); + str(rd, Address(post(state, 4))); + str(ra2, Address(post(state, 4))); + str(rf, Address(post(state, 4))); + str(rg, Address(post(state, 4))); + str(rh, Address(post(state, 4))); + str(re2, Address(post(state, 4))); +} + +/** + * SHA512 Sigma + * Sigma(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR ROTR(x, sh3) + */ +void MacroAssembler::sha512_sigma(FloatRegister x, + FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Dsigma); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Dsigma, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + vshl_64_64(Dtmp1, x, 64-sh3); + vorr_64(Dtmp0, 
Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); +} + +/** + * SHA512 Delta + * Delta(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR SHR(x, sh3) + */ +void MacroAssembler::sha512_delta(FloatRegister x, + FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Ddelta); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Ddelta, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Ddelta, Ddelta, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + + veor_64(Ddelta, Ddelta, Dtmp0); +} + +/** + * SHA512 Ch + * Ch(x, y, z) = (x AND y) XOR ( NOT x AND z) + */ +void MacroAssembler::sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dch) { + assert_different_registers(x, Dtmp, Dch); + + vmvn_64(Dtmp, x); + vand_64(Dtmp, Dtmp, z); + + vand_64(Dch, x, y); + veor_64(Dch, Dtmp, Dch); +} + +/** + * SHA512 Maj + * Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) + */ +void MacroAssembler::sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dmaj) { + assert_different_registers(x, Dtmp, Dmaj); + + vand_64(Dmaj, x, y); + vand_64(Dtmp, x, z); + veor_64(Dmaj, Dmaj, Dtmp); + vand_64(Dtmp, y, z); + veor_64(Dmaj, Dmaj, Dtmp); +} + +/** + * SHA512 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha512_implCompress(Register from, Register state, + Register counter, Register table_k) { + Label L_hash_loop, L_hash_no_w; + FloatRegister st_a = d18; //q9-q12 + FloatRegister st_b = st_a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_c = st_b->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_d = st_c->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_e = st_d->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_f = st_e->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_g = st_f->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_h = st_g->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w14 = w15->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w13->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w11->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w9->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w7->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w5->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister t1 = d26; + FloatRegister t2 = d27; + FloatRegister new_a = st_h; + FloatRegister new_e = st_d; + FloatRegister new_new_a = st_g; + FloatRegister new_new_e = st_c; + + FloatRegister w0 = w1->successor(FloatRegisterImpl::DOUBLE); + 
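// Editor's note: a minimal scalar sketch of the round the vector sequence + // below implements (illustration only, not part of the generated stub; + // ROTR64, K and w are hypothetical names for the round constant/schedule): + //   uint64_t S1  = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); + //   uint64_t ch  = (e & f) ^ (~e & g); + //   uint64_t t1  = h + S1 + ch + K[i] + w[i]; + //   uint64_t S0  = ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39); + //   uint64_t maj = (a & b) ^ (a & c) ^ (b & c); + //   uint64_t t2  = S0 + maj; + //   h = g; g = f; f = e; e = d + t1; + //   d = c; c = b; b = a; a = t1 + t2; + // where ROTR64(x, n) = (x >> n) | (x << (64 - n)); each trip through the + // L_hash_loop below retires two such rounds (new_a/new_e, then + // new_new_a/new_new_e). + 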
assert_different_registers(st_a,st_b,st_c,st_d,st_e,st_f,st_g,st_h); + assert_different_registers(w0,w1,w2,w3,w4,w5,w6,w7); + assert_different_registers(w8,w9,w10,w11,w12,w13,w14,w15,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha512_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w15, w14, w13, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w7, w6, w5, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w4, w3, w2, w1, Address(from), Assembler::ALIGN_STD); + // read initial state to a,b,c,d,e,f,g,h + vld1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w14, w14); + vrev64_128_8(w12, w12); + vrev64_128_8(w10, w10); + vrev64_128_8(w8, w8); + vrev64_128_8(w6, w6); + vrev64_128_8(w4, w4); + vrev64_128_8(w2, w2); + + + mov(counter, 40); + BIND(L_hash_loop); { + sub(counter, counter, 1); + // first iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_h, w16); + sha512_ch(st_e, st_f, st_g, t2, t1); + sha512_sigma(st_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(st_a, st_b, st_c, d30, d31); + sha512_sigma(st_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_a, t1, t2); + vadd_64_64(new_e, st_d, t1); + + // second iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_g, w15); + sha512_ch(new_e, st_e, st_f, t2, t1); + sha512_sigma(new_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(new_a, st_a, st_b, d30, d31); + sha512_sigma(new_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_new_a, t1, t2); + vadd_64_64(new_new_e, st_c, t1); + + // restore a,b,c,d,e,f,g,h sequence + vswp_128(st_g, st_a); + vswp_128(st_g, st_c); + vswp_128(st_g, st_e); + + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + + // calculate W[+1], W[+2] + sha512_delta(w15, q14, t1, 1, 8, 7); + sha512_delta(w2, q14, d30, 19, 61, 6); + sha512_delta(w14, q14, t2, 1, 8, 7); + sha512_delta(w1, q14, d31, 19, 61, 6); + + vadd_128_64(w16, w16, t1); + vadd_128_64(w16, w16, q15); + vadd_64_64(w16, w16, w7); + vadd_64_64(w15, w15, w6); + + BIND(L_hash_no_w); + + vswp_128(w16, w14); + vswp_128(w14, w12); + vswp_128(w12, w10); + vswp_128(w10, w8); + vswp_128(w8, w6); + vswp_128(w6, w4); + vswp_128(w4, w2); + } cbnz(counter, L_hash_loop); + // read initial state to w16 - w9 + vld1_64(w16, w15, w14, w13, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // update state + vadd_128_64(st_a, st_a, w16); + vadd_128_64(st_c, st_c, w14); + vadd_128_64(st_e, st_e, w12); + vadd_128_64(st_g, st_g, w10); + + // store state + vst1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vst1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); +} + +void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) { + if (width > 15 && lsb == 0) { + lsr(Rd, Rd, width); + lsl(Rd, Rd, width); + } else if (width > 15 && lsb + width == 32) { + lsl(Rd, Rd, 32 - lsb); + lsr(Rd, Rd, 32 - lsb); + } else { + 
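// Editor's note (illustration; the masks below are an assumed example, not + // emitted literally): an ARM data-processing immediate is an 8-bit value + // rotated right by an even amount, so a wide field is cleared in chunks of + // at most 8 bits, trimming the first chunk so every later chunk starts at + // an even bit position. E.g. lsb = 3, width = 12 clears via two encodable + // masks: + //   bic(Rd, Rd, 0x7f << 3);   // 7-bit chunk, bits [9:3] + //   bic(Rd, Rd, 0x1f << 10);  // 5-bit chunk, bits [14:10] + 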
const int lsb1 = (lsb & 1); + int w1 = width <= 8 - lsb1 ? width : 8 - lsb1; + while (width) { + bic(Rd, Rd, ((1 << w1) - 1) << lsb); + width -= w1; + lsb += w1; + w1 = width > 8 ? 8 : width; + } + } +} + +// get_thread can be called anywhere inside generated code so we need +// to save whatever non-callee save context might get clobbered by the +// call to the C thread_local lookup call or, indeed, the call setup +// code. x86 appears to save C arg registers. + +void MacroAssembler::get_thread(Register dst) { + // call pthread_getspecific + // void * pthread_getspecific(pthread_key_t key); + + // Save all call-clobbered regs except dst, plus rscratch1 and rscratch2. + RegSet saved_regs = RegSet::range(r0, r3) + rscratch1 + rscratch2 + lr - dst; + push(saved_regs, sp); + + // Align stack and save value for return + mov(c_rarg1, sp); + sub(sp, sp, wordSize); + bic(sp, sp, 7); + str(c_rarg1, Address(sp)); + + mov(rscratch2, CAST_FROM_FN_PTR(address, Thread::current)); + + bl(rscratch2); + // undo alignment + ldr(sp, Address(sp)); + + if (dst != c_rarg0) { + mov(dst, c_rarg0); + } + + // restore pushed registers + pop(saved_regs, sp); +} + +#ifdef COMPILER2 +// 24-bit word range == 26-bit byte range +bool check26(int offset) { + // this could be simplified, but it mimics encoding and decoding + // an actual branch instruction + int off1 = offset << 6 >> 8; + int encoded = off1 & ((1<<24)-1); + int decoded = encoded << 8 >> 6; + return offset == decoded; +} + +// Perform some slight adjustments so the default 32MB code cache +// is fully reachable. +static inline address first_cache_address() { + return CodeCache::low_bound() + sizeof(HeapBlock::Header); +} +static inline address last_cache_address() { + return CodeCache::high_bound() - NativeInstruction::arm_insn_sz; +} + +// Can we reach target using unconditional branch or call from anywhere +// in the code cache (because code can be relocated)? +bool MacroAssembler::_reachable_from_cache(address target) { +#ifdef __thumb__ + if ((1 & (intptr_t)target) != 0) { + // Return false to avoid 'b' if we would need to switch to THUMB mode. + return false; + } +#endif + + address cl = first_cache_address(); + address ch = last_cache_address(); + + if (ForceUnreachable) { + // Only addresses from CodeCache can be treated as reachable. + if (target < CodeCache::low_bound() || CodeCache::high_bound() <= target) { + return false; + } + } + + intptr_t loffset = (intptr_t)target - (intptr_t)cl; + intptr_t hoffset = (intptr_t)target - (intptr_t)ch; + + return check26(loffset - 8) && check26(hoffset - 8); +} + +bool MacroAssembler::_cache_fully_reachable() { + address cl = first_cache_address(); + address ch = last_cache_address(); + return _reachable_from_cache(cl) && _reachable_from_cache(ch); +} + +bool MacroAssembler::reachable_from_cache(address target) { + assert(CodeCache::contains(pc()), "not supported"); + return _reachable_from_cache(target); +} + +bool MacroAssembler::cache_fully_reachable() { + return _cache_fully_reachable(); +} + +// IMPORTANT: does not generate mt-safe patchable code +void MacroAssembler::call(address target, RelocationHolder rspec, Condition cond) { + Register scratch = lr; + assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported"); + if (reachable_from_cache(target)) { + relocate(rspec); + bl(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + bl(scratch, cond); +} + +// IMPORTANT: does not generate mt-safe patchable code. 
C2 only uses this method +// for calls into runtime which do not need mt-safe patching +void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch, Condition cond) { + assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported"); + if (reachable_from_cache(target)) { + relocate(rtype); + b(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + b(scratch, cond); +} + +void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) { + // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM + if (UseStackBanging) { + const int page_size = os::vm_page_size(); + + sub(tmp, sp, StackShadowPages*page_size); + strb(r0, Address(tmp)); + for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) { + strb(r0, pre(tmp, -0xff0)); + } + } +} + +void MacroAssembler::floating_cmp(Register dst) { + vmrs(dst); + orr(dst, dst, 0x08000000); + eor(dst, dst, dst, lsl(3)); + mov(dst, dst, asr(30)); +} + +void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2) { + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label fast_lock, done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + Label failed; + biased_locking_enter(Roop, Rmark, Rscratch, Rscratch2, false, done, &failed); + bind(failed); + } + + ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes())); + tst(Rmark, markOopDesc::unlocked_value); + b(fast_lock, Assembler::NE); + + // Check for recursive lock + // See comments in InterpreterMacroAssembler::lock_object for + // explanations on the fast recursive locking check. + // -1- test low 2 bits + movs(Rscratch, Rmark, lsl(30)); + // -2- test (hdr - SP) if the low two bits are 0 + sub(Rscratch, Rmark, sp, Assembler::EQ); + movs(Rscratch, Rscratch, lsr(exact_log2(os::vm_page_size())), Assembler::EQ); + // If still 'eq' then recursive locking OK + // set to zero if recursive lock, set to non zero otherwise (see discussion in JDK-8153107) + str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + b(done); + + bind(fast_lock); + str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + + membar(StoreStore); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + membar(AnyAny); + + bind(done); +} + +void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2) { + Register Rmark = Rscratch2; + + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + biased_locking_exit(Roop, Rscratch, done); + } + + ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + // If hdr is NULL, we've got recursive locking and there's nothing more to do + cmp(Rmark, 0); + b(done, Assembler::EQ); + + // Restore the object header + membar(AnyAny); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + + membar(StoreLoad); + + bind(done); +} + +#endif
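+ +// Editor's note on fast_lock/fast_unlock above (a hedged pseudo-C sketch, not +// the exact generated sequence; helper names are illustrative): +// +//   markOop mark = obj->mark(); +//   if (mark & unlocked_value) {                            // low bits == 01 +//     box->displaced_header = mark;                         // keep old header +//     CAS(obj->mark_addr(), /*expected*/ mark, /*new*/ box); // ldrex/strex +//   } else if ((mark & 3) == 0 && +//              (uintptr_t)mark - (uintptr_t)sp < os::vm_page_size()) { +//     box->displaced_header = NULL;  // recursive stack lock (see JDK-8153107) +//   } +// +// fast_unlock mirrors this: a NULL displaced header marks a recursive lock +// (nothing to do), otherwise the saved header is CAS'ed back into the mark +// word.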