src/hotspot/cpu/arm/stubGenerator_arm.cpp
*** 83,107 ****
#define IMX515_ARRAYCOPY_CONFIG 2
// Hard coded choices (XXX: could be changed to a command line option)
#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG
- #ifdef AARCH64
- #define ArmCopyCacheLineSize 64
- #else
#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
- #endif // AARCH64
-
- // TODO-AARCH64: tune and revise AArch64 arraycopy optimizations
// configuration for each kind of loop
typedef struct {
int pld_distance; // prefetch distance (0 => no prefetch, <0: prefetch_before);
- #ifndef AARCH64
bool split_ldm; // if true, split each STM in STMs with fewer registers
bool split_stm; // if true, split each LTM in LTMs with fewer registers
- #endif // !AARCH64
} arraycopy_loop_config;
// configuration for all loops
typedef struct {
// const char *description;
--- 83,99 ----
*** 112,129 ****
} arraycopy_platform_config;
// configured platforms
static arraycopy_platform_config arraycopy_configurations[] = {
// configuration parameters for arraycopy loops
- #ifdef AARCH64
- {
- {-256 }, // forward aligned
- {-128 }, // backward aligned
- {-256 }, // forward shifted
- {-128 } // backward shifted
- }
- #else
// Configurations were chosen based on manual analysis of benchmark
// results, minimizing overhead with respect to best results on the
// different test cases.
--- 104,113 ----
*** 169,179 ****
{-160, false, false }, // forward aligned
{-160, false, false }, // backward aligned
{-160, false, false }, // forward shifted
{-160, true, true } // backward shifted
}
- #endif // AARCH64
};
class StubGenerator: public StubCodeGenerator {
#ifdef PRODUCT
--- 153,162 ----
*** 188,291 ****
address generate_call_stub(address& return_address) {
StubCodeMark mark(this, "StubRoutines", "call_stub");
address start = __ pc();
- #ifdef AARCH64
- const int saved_regs_size = 192;
-
- __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed));
- __ mov(FP, SP);
-
- int sp_offset = 16;
- assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code");
- __ stp(R0, ZR, Address(SP, sp_offset)); sp_offset += 16;
-
- const int saved_result_and_result_type_offset = sp_offset;
- __ stp(R1, R2, Address(SP, sp_offset)); sp_offset += 16;
- __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
- __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
- __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
- __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
- __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
-
- __ stp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16;
- __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
- __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
- __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
- assert (sp_offset == saved_regs_size, "adjust this code");
-
- __ mov(Rmethod, R3);
- __ mov(Rthread, R7);
- __ reinit_heapbase();
-
- { // Pass parameters
- Label done_parameters, pass_parameters;
-
- __ mov(Rparams, SP);
- __ cbz_w(R6, done_parameters);
-
- __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord);
- __ align_reg(SP, Rtemp, StackAlignmentInBytes);
- __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord);
-
- __ bind(pass_parameters);
- __ subs_w(R6, R6, 1);
- __ ldr(Rtemp, Address(R5, wordSize, post_indexed));
- __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed));
- __ b(pass_parameters, ne);
-
- __ bind(done_parameters);
-
- #ifdef ASSERT
- {
- Label L;
- __ cmp(SP, Rparams);
- __ b(L, eq);
- __ stop("SP does not match Rparams");
- __ bind(L);
- }
- #endif
- }
-
- __ mov(Rsender_sp, SP);
- __ blr(R4);
- return_address = __ pc();
-
- __ mov(SP, FP);
-
- __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset));
-
- { // Handle return value
- Label cont;
- __ str(R0, Address(R1));
-
- __ cmp_w(R2, T_DOUBLE);
- __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne);
- __ b(cont, ne);
-
- __ str_d(V0, Address(R1));
- __ bind(cont);
- }
-
- sp_offset = saved_result_and_result_type_offset + 16;
- __ ldp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
-
- __ ldp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
- __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
- assert (sp_offset == saved_regs_size, "adjust this code");
-
- __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed));
- __ ret();
-
- #else // AARCH64
assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
__ mov(Rtemp, SP);
__ push(RegisterSet(FP) | RegisterSet(LR));
--- 171,180 ----
*** 356,366 ****
#ifndef __SOFTFP__
__ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
#endif
__ pop(RegisterSet(FP) | RegisterSet(PC));
- #endif // AARCH64
return start;
}
// (in) Rexception_obj: exception oop
--- 245,254 ----
*** 404,414 ****
__ jump(R0); // handler is returned in R0 by runtime function
return start;
}
- #ifndef AARCH64
// Integer division shared routine
// Input:
// R0 - dividend
// R2 - divisor
--- 292,301 ----
*** 793,803 ****
return start;
}
- #endif // AARCH64
#ifdef COMPILER2
// Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super );
// Arguments :
//
--- 680,689 ----
*** 881,896 ****
__ raw_pop(saved_set);
__ ret();
// Return failure
__ bind(L_fail);
- #ifdef AARCH64
- // count_temp is 0, can't use ZR here
- __ adds(R0, count_temp, 1); // sets the flags
- #else
__ movs(R0, 1); // sets the flags
- #endif
__ raw_pop(saved_set);
__ ret();
}
return start;
}
--- 767,777 ----
*** 923,937 ****
assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7);
Label exit, error;
InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr());
- #ifdef AARCH64
- __ mrs(flags, Assembler::SysReg_NZCV);
- #else
__ mrs(Assembler::CPSR, flags);
- #endif // AARCH64
__ ldr_literal(tmp1, verify_oop_count);
__ ldr_s32(tmp2, Address(tmp1));
__ add(tmp2, tmp2, 1);
__ str_32(tmp2, Address(tmp1));
--- 804,814 ----
*** 954,968 ****
__ cbz(klass, error); // if klass is NULL it is broken
// return if everything seems ok
__ bind(exit);
- #ifdef AARCH64
- __ msr(Assembler::SysReg_NZCV, flags);
- #else
__ msr(Assembler::CPSR_f, flags);
- #endif // AARCH64
__ ret();
// handle errors
__ bind(error);
--- 831,841 ----
*** 1004,1131 ****
void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) {
const Register from = R0;
const Register to = R1;
const Register count = R2;
const Register to_from = tmp1; // to - from
- #ifndef AARCH64
const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size
- #endif // AARCH64
assert_different_registers(from, to, count, tmp1, tmp2);
// no_overlap version works if 'to' lower (unsigned) than 'from'
// and or 'to' more than (count*size) from 'from'
BLOCK_COMMENT("Array Overlap Test:");
__ subs(to_from, to, from);
- #ifndef AARCH64
if (log2_elem_size != 0) {
__ mov(byte_count, AsmOperand(count, lsl, log2_elem_size));
}
- #endif // !AARCH64
if (NOLp == NULL)
__ b(no_overlap_target,lo);
else
__ b((*NOLp), lo);
- #ifdef AARCH64
- __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size);
- #else
__ cmp(to_from, byte_count);
- #endif // AARCH64
if (NOLp == NULL)
__ b(no_overlap_target, ge);
else
__ b((*NOLp), ge);
}
- #ifdef AARCH64
- // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace)
-
- // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1]
- // and increases 'from' by count*wordSize.
- void bulk_load_forward(Register from, const Register regs[], int count) {
- assert (count > 0 && count % 2 == 0, "count must be positive even number");
- int bytes = count * wordSize;
-
- int offset = 0;
- __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed));
- offset += 2*wordSize;
-
- for (int i = 2; i < count; i += 2) {
- __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset));
- offset += 2*wordSize;
- }
-
- assert (offset == bytes, "must be");
- }
-
- // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize)
- // and increases 'to' by count*wordSize.
- void bulk_store_forward(Register to, const Register regs[], int count) {
- assert (count > 0 && count % 2 == 0, "count must be positive even number");
- int bytes = count * wordSize;
-
- int offset = 0;
- __ stp(regs[0], regs[1], Address(to, bytes, post_indexed));
- offset += 2*wordSize;
-
- for (int i = 2; i < count; i += 2) {
- __ stp(regs[i], regs[i+1], Address(to, -bytes + offset));
- offset += 2*wordSize;
- }
-
- assert (offset == bytes, "must be");
- }
-
- // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1]
- // and decreases 'from' by count*wordSize.
- // Note that the word with lowest address goes to regs[0].
- void bulk_load_backward(Register from, const Register regs[], int count) {
- assert (count > 0 && count % 2 == 0, "count must be positive even number");
- int bytes = count * wordSize;
-
- int offset = 0;
-
- for (int i = count - 2; i > 0; i -= 2) {
- offset += 2*wordSize;
- __ ldp(regs[i], regs[i+1], Address(from, -offset));
- }
-
- offset += 2*wordSize;
- __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed));
- assert (offset == bytes, "must be");
- }
-
- // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to)
- // and decreases 'to' by count*wordSize.
- // Note that regs[0] value goes into the memory with lowest address.
- void bulk_store_backward(Register to, const Register regs[], int count) {
- assert (count > 0 && count % 2 == 0, "count must be positive even number");
- int bytes = count * wordSize;
-
- int offset = 0;
-
- for (int i = count - 2; i > 0; i -= 2) {
- offset += 2*wordSize;
- __ stp(regs[i], regs[i+1], Address(to, -offset));
- }
-
- offset += 2*wordSize;
- __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed));
-
- assert (offset == bytes, "must be");
- }
- #endif // AARCH64
-
- // TODO-AARCH64: rearrange in-loop prefetches:
// probably we should choose between "prefetch-store before or after store", not "before or after load".
void prefetch(Register from, Register to, int offset, int to_delta = 0) {
__ prefetch_read(Address(from, offset));
- #ifdef AARCH64
- // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120
- // __ prfm(pstl1keep, Address(to, offset + to_delta));
- #endif // AARCH64
}
// Generate the inner loop for forward aligned array copy
//
// Arguments
--- 877,912 ----
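The array overlap test above reduces to the following check. A minimal C++ sketch, for reference only (not part of the patch; the function name and types are illustrative):

  #include <cstddef>
  #include <cstdint>

  // Backward (conjoint) copy is needed only when the destination starts
  // strictly inside the source range.
  static bool needs_backward_copy(uintptr_t from, uintptr_t to,
                                  size_t count, int log2_elem_size) {
    size_t byte_count = count << log2_elem_size;   // count * element size
    if (to < from) return false;                   // the 'lo' branch: forward copy is safe
    if (to - from >= byte_count) return false;     // the 'ge' branch: dst begins past the end of src
    return true;                                   // overlapping: take the conjoint path
  }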
*** 1135,1162 ****
// bytes_per_count: number of bytes for each unit of 'count'
//
// Return the minimum initial value for count
//
// Notes:
! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
// - 'to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
//
// Increases 'from' and 'to' by count*bytes_per_count.
//
// Scratches 'count', R3.
! // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
//
int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
int pld_offset = config->pld_distance;
const int count_per_loop = bytes_per_loop / bytes_per_count;
- #ifndef AARCH64
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
// XXX optim: use VLDM/VSTM when available (Neon) with PLD
// NEONCopyPLD
--- 916,942 ----
// bytes_per_count: number of bytes for each unit of 'count'
//
// Return the minimum initial value for count
//
// Notes:
! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
// - 'to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
//
// Increases 'from' and 'to' by count*bytes_per_count.
//
// Scratches 'count', R3.
! // R4-R10 are preserved (saved/restored).
//
int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
int pld_offset = config->pld_distance;
const int count_per_loop = bytes_per_loop / bytes_per_count;
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
// XXX optim: use VLDM/VSTM when available (Neon) with PLD
// NEONCopyPLD
*** 1165,1175 ****
// VSTM r0!,{d0-d7}
// SUBS r2,r2,#0x40
// BGE NEONCopyPLD
__ push(RegisterSet(R4,R10));
- #endif // !AARCH64
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
Label L_skip_pld;
--- 945,954 ----
*** 1198,1213 ****
prefetch(from, to, offset);
offset += ArmCopyCacheLineSize;
};
}
- #ifdef AARCH64
- const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
- #endif // AARCH64
{
- // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
-
// 32-bit ARM note: we have tried implementing loop unrolling to skip one
// PLD with 64 bytes cache line but the gain was not significant.
Label L_copy_loop;
__ align(OptoLoopAlignment);
--- 977,987 ----
*** 1216,1256 ****
if (prefetch_before) {
prefetch(from, to, bytes_per_loop + pld_offset);
__ BIND(L_skip_pld);
}
- #ifdef AARCH64
- bulk_load_forward(from, data_regs, 8);
- #else
if (split_read) {
// Split the register set in two sets so that there is less
// latency between LDM and STM (R3-R6 available while R7-R10
// still loading) and less register locking issue when iterating
// on the first LDM.
__ ldmia(from, RegisterSet(R3, R6), writeback);
__ ldmia(from, RegisterSet(R7, R10), writeback);
} else {
__ ldmia(from, RegisterSet(R3, R10), writeback);
}
- #endif // AARCH64
__ subs_32(count, count, count_per_loop);
if (prefetch_after) {
prefetch(from, to, pld_offset, bytes_per_loop);
}
- #ifdef AARCH64
- bulk_store_forward(to, data_regs, 8);
- #else
if (split_write) {
__ stmia(to, RegisterSet(R3, R6), writeback);
__ stmia(to, RegisterSet(R7, R10), writeback);
} else {
__ stmia(to, RegisterSet(R3, R10), writeback);
}
- #endif // AARCH64
__ b(L_copy_loop, ge);
if (prefetch_before) {
// the inner loop may end earlier, allowing to skip PLD for the last iterations
--- 990,1022 ----
*** 1262,1335 ****
// still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
// __ add(count, count, ...); // addition useless for the bit tests
assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
- #ifdef AARCH64
- assert (bytes_per_loop == 64, "adjust the code below");
- assert (bytes_per_count <= 8, "adjust the code below");
-
- {
- Label L;
- __ tbz(count, exact_log2(32/bytes_per_count), L);
-
- bulk_load_forward(from, data_regs, 4);
- bulk_store_forward(to, data_regs, 4);
-
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(16/bytes_per_count), L);
-
- bulk_load_forward(from, data_regs, 2);
- bulk_store_forward(to, data_regs, 2);
-
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(8/bytes_per_count), L);
-
- __ ldr(R3, Address(from, 8, post_indexed));
- __ str(R3, Address(to, 8, post_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 4) {
- Label L;
- __ tbz(count, exact_log2(4/bytes_per_count), L);
-
- __ ldr_w(R3, Address(from, 4, post_indexed));
- __ str_w(R3, Address(to, 4, post_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 2) {
- Label L;
- __ tbz(count, exact_log2(2/bytes_per_count), L);
-
- __ ldrh(R3, Address(from, 2, post_indexed));
- __ strh(R3, Address(to, 2, post_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 1) {
- Label L;
- __ tbz(count, 0, L);
-
- __ ldrb(R3, Address(from, 1, post_indexed));
- __ strb(R3, Address(to, 1, post_indexed));
-
- __ bind(L);
- }
- #else
__ tst(count, 16 / bytes_per_count);
__ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
__ stmia(to, RegisterSet(R3, R6), writeback, ne);
__ tst(count, 8 / bytes_per_count);
--- 1028,1037 ----
*** 1353,1363 ****
__ ldrb(R3, Address(from, 1, post_indexed), ne);
__ strb(R3, Address(to, 1, post_indexed), ne);
}
__ pop(RegisterSet(R4,R10));
- #endif // AARCH64
return count_per_loop;
}
--- 1055,1064 ----
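The residual copy above (after the 32-byte main loop) stores each power-of-two chunk at most once, selected by a bit of the remaining count. A plain C++ sketch of the same idea, for reference only (not part of the patch; names are illustrative):

  #include <cstddef>
  #include <cstring>

  static void copy_tail(unsigned char*& from, unsigned char*& to,
                        size_t count, size_t bytes_per_count) {
    size_t remaining = count * bytes_per_count;    // fewer than 32 bytes left here
    for (size_t chunk = 16; chunk >= 1; chunk /= 2) {
      // mirrors __ tst(count, chunk / bytes_per_count) in the stub
      if (remaining & chunk) {
        memcpy(to, from, chunk);
        from += chunk;
        to += chunk;
      }
    }
  }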
*** 1370,1405 ****
// bytes_per_count: number of bytes for each unit of 'count'
//
// Return the minimum initial value for count
//
// Notes:
! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
// - 'end_to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
//
// Decreases 'end_from' and 'end_to' by count*bytes_per_count.
//
// Scratches 'count', R3.
! // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
//
int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
const int count_per_loop = bytes_per_loop / bytes_per_count;
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
int pld_offset = config->pld_distance;
- #ifndef AARCH64
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
// See the forward copy variant for additional comments.
__ push(RegisterSet(R4,R10));
- #endif // !AARCH64
__ sub_32(count, count, count_per_loop);
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
--- 1071,1104 ----
// bytes_per_count: number of bytes for each unit of 'count'
//
// Return the minimum initial value for count
//
// Notes:
! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
// - 'end_to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
//
// Decreases 'end_from' and 'end_to' by count*bytes_per_count.
//
// Scratches 'count', R3.
! // R4-R10 are preserved (saved/restored).
//
int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
const int count_per_loop = bytes_per_loop / bytes_per_count;
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
int pld_offset = config->pld_distance;
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
// See the forward copy variant for additional comments.
__ push(RegisterSet(R4,R10));
__ sub_32(count, count, count_per_loop);
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
*** 1421,1436 ****
prefetch(end_from, end_to, -(wordSize + offset));
offset += ArmCopyCacheLineSize;
};
}
- #ifdef AARCH64
- const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
- #endif // AARCH64
{
- // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
-
// 32-bit ARM note: we have tried implementing loop unrolling to skip one
// PLD with 64 bytes cache line but the gain was not significant.
Label L_copy_loop;
__ align(OptoLoopAlignment);
--- 1120,1130 ----
*** 1439,1475 ****
if (prefetch_before) {
prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
__ BIND(L_skip_pld);
}
- #ifdef AARCH64
- bulk_load_backward(end_from, data_regs, 8);
- #else
if (split_read) {
__ ldmdb(end_from, RegisterSet(R7, R10), writeback);
__ ldmdb(end_from, RegisterSet(R3, R6), writeback);
} else {
__ ldmdb(end_from, RegisterSet(R3, R10), writeback);
}
- #endif // AARCH64
__ subs_32(count, count, count_per_loop);
if (prefetch_after) {
prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
}
- #ifdef AARCH64
- bulk_store_backward(end_to, data_regs, 8);
- #else
if (split_write) {
__ stmdb(end_to, RegisterSet(R7, R10), writeback);
__ stmdb(end_to, RegisterSet(R3, R6), writeback);
} else {
__ stmdb(end_to, RegisterSet(R3, R10), writeback);
}
- #endif // AARCH64
__ b(L_copy_loop, ge);
if (prefetch_before) {
__ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
--- 1133,1161 ----
*** 1480,1553 ****
// still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
// __ add(count, count, ...); // addition useless for the bit tests
assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
- #ifdef AARCH64
- assert (bytes_per_loop == 64, "adjust the code below");
- assert (bytes_per_count <= 8, "adjust the code below");
-
- {
- Label L;
- __ tbz(count, exact_log2(32/bytes_per_count), L);
-
- bulk_load_backward(end_from, data_regs, 4);
- bulk_store_backward(end_to, data_regs, 4);
-
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(16/bytes_per_count), L);
-
- bulk_load_backward(end_from, data_regs, 2);
- bulk_store_backward(end_to, data_regs, 2);
-
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(8/bytes_per_count), L);
-
- __ ldr(R3, Address(end_from, -8, pre_indexed));
- __ str(R3, Address(end_to, -8, pre_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 4) {
- Label L;
- __ tbz(count, exact_log2(4/bytes_per_count), L);
-
- __ ldr_w(R3, Address(end_from, -4, pre_indexed));
- __ str_w(R3, Address(end_to, -4, pre_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 2) {
- Label L;
- __ tbz(count, exact_log2(2/bytes_per_count), L);
-
- __ ldrh(R3, Address(end_from, -2, pre_indexed));
- __ strh(R3, Address(end_to, -2, pre_indexed));
-
- __ bind(L);
- }
-
- if (bytes_per_count <= 1) {
- Label L;
- __ tbz(count, 0, L);
-
- __ ldrb(R3, Address(end_from, -1, pre_indexed));
- __ strb(R3, Address(end_to, -1, pre_indexed));
-
- __ bind(L);
- }
- #else
__ tst(count, 16 / bytes_per_count);
__ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
__ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);
__ tst(count, 8 / bytes_per_count);
--- 1166,1175 ----
*** 1571,1589 ****
__ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
__ strb(R3, Address(end_to, -1, pre_indexed), ne);
}
__ pop(RegisterSet(R4,R10));
- #endif // AARCH64
return count_per_loop;
}
// Generate the inner loop for shifted forward array copy (unaligned copy).
// It can be used when bytes_per_count < wordSize, i.e.
! // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
//
// Arguments
// from: start src address, 64 bits aligned
// to: start dst address, (now) wordSize aligned
// count: number of elements (32-bit int)
--- 1193,1210 ----
__ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
__ strb(R3, Address(end_to, -1, pre_indexed), ne);
}
__ pop(RegisterSet(R4,R10));
return count_per_loop;
}
// Generate the inner loop for shifted forward array copy (unaligned copy).
// It can be used when bytes_per_count < wordSize, i.e.
! // byte/short copy.
//
// Arguments
// from: start src address, 64 bits aligned
// to: start dst address, (now) wordSize aligned
// count: number of elements (32-bit int)
*** 1592,1606 ****
// lsl_shift: shift applied to 'new' value to set the high bytes of the next write
//
// Return the minimum initial value for count
//
// Notes:
! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
// - 'to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
// - 'lsr_shift' + 'lsl_shift' = BitsPerWord
! // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
//
// Increases 'to' by count*bytes_per_count.
//
// Scratches 'from' and 'count', R3-R10, R12
//
--- 1213,1227 ----
// lsl_shift: shift applied to 'new' value to set the high bytes of the next write
//
// Return the minimum initial value for count
//
// Notes:
! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
// - 'to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
// - 'lsr_shift' + 'lsl_shift' = BitsPerWord
! // - 'bytes_per_count' is 1 or 2
//
// Increases 'to' by count*bytes_per_count.
//
// Scratches 'from' and 'count', R3-R10, R12
//
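Since 'lsr_shift' + 'lsl_shift' == BitsPerWord, every destination word in the shifted loop is assembled from two consecutive source words. A minimal sketch of that step, for reference only (not part of the patch; 32-bit words and 0 < lsr_shift, lsl_shift < 32 assumed):

  #include <cstdint>

  // 'carry' holds the most recently loaded source word; the bytes of it that
  // have not been written yet become the low part of the next destination word.
  static uint32_t next_dst_word(uint32_t& carry, uint32_t next_src_word,
                                int lsr_shift, int lsl_shift) {
    uint32_t out = (carry >> lsr_shift) | (next_src_word << lsl_shift);
    carry = next_src_word;
    return out;
  }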
*** 1620,1633 ****
const int count_per_loop = bytes_per_loop / bytes_per_count;
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted;
int pld_offset = config->pld_distance;
- #ifndef AARCH64
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
- #endif // !AARCH64
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
Label L_skip_pld, L_last_read, L_done;
if (pld_offset != 0) {
--- 1241,1252 ----
*** 1664,1679 ****
} else {
__ cmp_32(count, count_per_loop);
__ b(L_last_read, lt);
}
- #ifdef AARCH64
- const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
- __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written
- __ subs_32(count, count, count_per_loop);
- bulk_load_forward(from, &data_regs[1], 8);
- #else
// read 32 bytes
if (split_read) {
// if write is not split, use less registers in first set to reduce locking
RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5);
RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12;
--- 1283,1292 ----
*** 1684,1694 ****
} else {
__ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
__ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4
__ subs(count, count, count_per_loop);
}
- #endif // AARCH64
if (prefetch_after) {
// do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP)
prefetch(from, to, pld_offset, bytes_per_loop);
}
--- 1297,1306 ----
*** 1699,1831 ****
__ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ...
__ logical_shift_right(R5, R5, lsr_shift);
__ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
__ logical_shift_right(R6, R6, lsr_shift);
__ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
- #ifndef AARCH64
if (split_write) {
// write the first half as soon as possible to reduce stm locking
__ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge);
}
- #endif // !AARCH64
__ logical_shift_right(R7, R7, lsr_shift);
__ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift));
__ logical_shift_right(R8, R8, lsr_shift);
__ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift));
__ logical_shift_right(R9, R9, lsr_shift);
__ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift));
__ logical_shift_right(R10, R10, lsr_shift);
__ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift));
- #ifdef AARCH64
- bulk_store_forward(to, data_regs, 8);
- #else
if (split_write) {
__ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge);
} else {
__ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge);
}
- #endif // AARCH64
__ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
if (prefetch_before) {
// the first loop may end earlier, allowing to skip pld at the end
__ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
- #ifndef AARCH64
__ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped
- #endif // !AARCH64
__ b(L_skip_pld, ge);
__ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
}
__ BIND(L_last_read);
__ b(L_done, eq);
- #ifdef AARCH64
- assert(bytes_per_count < 8, "adjust the code below");
-
- __ logical_shift_right(R3, R12, lsr_shift);
-
- {
- Label L;
- __ tbz(count, exact_log2(32/bytes_per_count), L);
- bulk_load_forward(from, &data_regs[1], 4);
- __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
- __ logical_shift_right(R4, R4, lsr_shift);
- __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
- __ logical_shift_right(R5, R5, lsr_shift);
- __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
- __ logical_shift_right(R6, R6, lsr_shift);
- __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
- bulk_store_forward(to, data_regs, 4);
- __ logical_shift_right(R3, R7, lsr_shift);
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(16/bytes_per_count), L);
- bulk_load_forward(from, &data_regs[1], 2);
- __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
- __ logical_shift_right(R4, R4, lsr_shift);
- __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
- bulk_store_forward(to, data_regs, 2);
- __ logical_shift_right(R3, R5, lsr_shift);
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(8/bytes_per_count), L);
- __ ldr(R4, Address(from, 8, post_indexed));
- __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
- __ str(R3, Address(to, 8, post_indexed));
- __ logical_shift_right(R3, R4, lsr_shift);
- __ bind(L);
- }
-
- const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3
-
- // It remains less than wordSize to write.
- // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize).
- if (have_bytes < wordSize - bytes_per_count) {
- Label L;
- __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
- __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
- __ b(L, le);
- __ ldr(R4, Address(from, 8, post_indexed));
- __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(4/bytes_per_count), L);
- __ str_w(R3, Address(to, 4, post_indexed));
- if (bytes_per_count < 4) {
- __ logical_shift_right(R3, R3, 4*BitsPerByte);
- }
- __ bind(L);
- }
-
- if (bytes_per_count <= 2) {
- Label L;
- __ tbz(count, exact_log2(2/bytes_per_count), L);
- __ strh(R3, Address(to, 2, post_indexed));
- if (bytes_per_count < 2) {
- __ logical_shift_right(R3, R3, 2*BitsPerByte);
- }
- __ bind(L);
- }
-
- if (bytes_per_count <= 1) {
- Label L;
- __ tbz(count, exact_log2(1/bytes_per_count), L);
- __ strb(R3, Address(to, 1, post_indexed));
- __ bind(L);
- }
- #else
switch (bytes_per_count) {
case 2:
__ mov(R3, AsmOperand(R12, lsr, lsr_shift));
__ tst(count, 8);
__ ldmia(from, RegisterSet(R4, R7), writeback, ne);
--- 1311,1351 ----
*** 1904,1922 ****
__ tst(count, 1);
__ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
break;
}
- #endif // AARCH64
__ BIND(L_done);
return 0; // no minimum
}
// Generate the inner loop for shifted backward array copy (unaligned copy).
// It can be used when bytes_per_count < wordSize, i.e.
! // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
//
// Arguments
// end_from: end src address, 64 bits aligned
// end_to: end dst address, (now) wordSize aligned
// count: number of elements (32-bit int)
--- 1424,1441 ----
__ tst(count, 1);
__ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
break;
}
__ BIND(L_done);
return 0; // no minimum
}
// Generate the inner loop for shifted backward array copy (unaligned copy).
// It can be used when bytes_per_count < wordSize, i.e.
! // byte/short copy.
//
// Arguments
// end_from: end src address, 64 bits aligned
// end_to: end dst address, (now) wordSize aligned
// count: number of elements (32-bit int)
*** 1925,1939 ****
// lsr_shift: shift applied to 'new' value to set the low bytes of the next write
//
// Return the minimum initial value for count
//
// Notes:
! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
// - 'end_to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
// - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
! // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
//
// Decreases 'end_to' by count*bytes_per_count.
//
// Scratches 'end_from', 'count', R3-R10, R12
//
--- 1444,1458 ----
// lsr_shift: shift applied to 'new' value to set the low bytes of the next write
//
// Return the minimum initial value for count
//
// Notes:
! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
// - 'end_to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
// - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
! // - 'bytes_per_count' is 1 or 2
//
// Decreases 'end_to' by count*bytes_per_count.
//
// Scratches 'end_from', 'count', R3-R10, R12
//
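The backward variant mirrors the forward one: the carry keeps the part of the last word read that has not been stored yet, and each destination word combines it with the next lower source word. Sketch for reference only (not part of the patch):

  #include <cstdint>

  static uint32_t prev_dst_word(uint32_t& carry, uint32_t lower_src_word,
                                int lsr_shift, int lsl_shift) {
    uint32_t out = (carry << lsl_shift) | (lower_src_word >> lsr_shift);
    carry = lower_src_word;
    return out;
  }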
*** 1953,1966 ****
const int count_per_loop = bytes_per_loop / bytes_per_count;
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted;
int pld_offset = config->pld_distance;
- #ifndef AARCH64
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
- #endif // !AARCH64
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
--- 1472,1483 ----
*** 1999,2022 ****
} else {
__ cmp_32(count, count_per_loop);
__ b(L_last_read, lt);
}
- #ifdef AARCH64
- __ logical_shift_left(R12, R3, lsl_shift);
- const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
- bulk_load_backward(end_from, data_regs, 8);
- #else
if (split_read) {
__ ldmdb(end_from, RegisterSet(R7, R10), writeback);
__ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
__ ldmdb(end_from, RegisterSet(R3, R6), writeback);
} else {
__ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
__ ldmdb(end_from, RegisterSet(R3, R10), writeback);
}
- #endif // AARCH64
__ subs_32(count, count, count_per_loop);
if (prefetch_after) { // do prefetch during ldm/ldp latency
prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
--- 1516,1533 ----
*** 2032,2170 ****
__ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
__ logical_shift_left(R7, R7, lsl_shift);
__ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift));
__ logical_shift_left(R6, R6, lsl_shift);
__ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift));
- #ifndef AARCH64
if (split_write) {
// store early to reduce locking issues
__ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge);
}
- #endif // !AARCH64
__ logical_shift_left(R5, R5, lsl_shift);
__ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift));
__ logical_shift_left(R4, R4, lsl_shift);
__ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift));
- #ifdef AARCH64
- bulk_store_backward(end_to, &data_regs[1], 8);
- #else
if (split_write) {
__ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge);
} else {
__ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge);
}
- #endif // AARCH64
__ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
if (prefetch_before) {
// the first loop may end earlier, allowing to skip pld at the end
__ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count));
- #ifndef AARCH64
__ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped
- #endif // !AARCH64
__ b(L_skip_pld, ge);
__ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
}
__ BIND(L_last_read);
__ b(L_done, eq);
- #ifdef AARCH64
- assert(bytes_per_count < 8, "adjust the code below");
-
- __ logical_shift_left(R12, R3, lsl_shift);
-
- {
- Label L;
- __ tbz(count, exact_log2(32/bytes_per_count), L);
- bulk_load_backward(end_from, &data_regs[4], 4);
-
- __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
- __ logical_shift_left(R10, R10, lsl_shift);
- __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
- __ logical_shift_left(R9, R9, lsl_shift);
- __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
- __ logical_shift_left(R8, R8, lsl_shift);
- __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
-
- bulk_store_backward(end_to, &data_regs[5], 4);
- __ logical_shift_left(R12, R7, lsl_shift);
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(16/bytes_per_count), L);
- bulk_load_backward(end_from, &data_regs[6], 2);
-
- __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
- __ logical_shift_left(R10, R10, lsl_shift);
- __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
-
- bulk_store_backward(end_to, &data_regs[7], 2);
- __ logical_shift_left(R12, R9, lsl_shift);
- __ bind(L);
- }
-
- {
- Label L;
- __ tbz(count, exact_log2(8/bytes_per_count), L);
- __ ldr(R10, Address(end_from, -8, pre_indexed));
- __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
- __ str(R12, Address(end_to, -8, pre_indexed));
- __ logical_shift_left(R12, R10, lsl_shift);
- __ bind(L);
- }
-
- const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12
-
- // It remains less than wordSize to write.
- // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize).
- if (have_bytes < wordSize - bytes_per_count) {
- Label L;
- __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
- __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
- __ b(L, le);
- __ ldr(R10, Address(end_from, -8, pre_indexed));
- __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
- __ bind(L);
- }
-
- assert (bytes_per_count <= 4, "must be");
-
- {
- Label L;
- __ tbz(count, exact_log2(4/bytes_per_count), L);
- __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte);
- __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB
- if (bytes_per_count < 4) {
- __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB
- }
- __ bind(L);
- }
-
- if (bytes_per_count <= 2) {
- Label L;
- __ tbz(count, exact_log2(2/bytes_per_count), L);
- __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte);
- __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB
- if (bytes_per_count < 2) {
- __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB
- }
- __ bind(L);
- }
-
- if (bytes_per_count <= 1) {
- Label L;
- __ tbz(count, exact_log2(1/bytes_per_count), L);
- __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte);
- __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB
- __ bind(L);
- }
- #else
switch(bytes_per_count) {
case 2:
__ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
__ tst(count, 8);
__ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
--- 1543,1580 ----
*** 2244,2254 ****
__ mov(R12, AsmOperand(R12, lsr, 24), ne);
__ strb(R12, Address(end_to, -1, pre_indexed), ne);
break;
}
- #endif // AARCH64
__ BIND(L_done);
return 0; // no minimum
}
--- 1654,1663 ----
*** 2259,2289 ****
} else {
return Address(base, -delta, pre_indexed);
}
}
- #ifdef AARCH64
- // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e.
- // if forward: loads value at from and increases from by size
- // if !forward: loads value at from-size_in_bytes and decreases from by size
- void load_one(Register rd, Register from, int size_in_bytes, bool forward) {
- assert_different_registers(from, rd);
- Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
- __ load_sized_value(rd, addr, size_in_bytes, false);
- }
-
- // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one)
- void store_one(Register rd, Register to, int size_in_bytes, bool forward) {
- assert_different_registers(to, rd);
- Address addr = get_addr_with_indexing(to, size_in_bytes, forward);
- __ store_sized_value(rd, addr, size_in_bytes);
- }
- #else
- // load_one and store_one are the same as for AArch64 except for
- // *) Support for condition execution
- // *) Second value register argument for 8-byte values
-
void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
assert_different_registers(from, rd, rd2);
if (size_in_bytes < 8) {
Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
__ load_sized_value(rd, addr, size_in_bytes, false, cond);
--- 1668,1677 ----
*** 2313,2323 ****
} else {
__ stmdb(to, RegisterSet(rd) | rd2, writeback, cond);
}
}
}
- #endif // AARCH64
// Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits.
// (on 32-bit ARM 64-bit alignment is better for LDM).
//
// Arguments:
--- 1701,1710 ----
*** 2334,2373 ****
// decreases 'count' by the number of elements copied
//
// Returns maximum number of bytes which may be copied.
int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
assert_different_registers(from, to, count, tmp);
- #ifdef AARCH64
- // TODO-AARCH64: replace by simple loop?
- Label Laligned_by_2, Laligned_by_4, Laligned_by_8;
-
- if (bytes_per_count == 1) {
- __ tbz(from, 0, Laligned_by_2);
- __ sub_32(count, count, 1);
- load_one(tmp, from, 1, forward);
- store_one(tmp, to, 1, forward);
- }
-
- __ BIND(Laligned_by_2);
-
- if (bytes_per_count <= 2) {
- __ tbz(from, 1, Laligned_by_4);
- __ sub_32(count, count, 2/bytes_per_count);
- load_one(tmp, from, 2, forward);
- store_one(tmp, to, 2, forward);
- }
-
- __ BIND(Laligned_by_4);
-
- if (bytes_per_count <= 4) {
- __ tbz(from, 2, Laligned_by_8);
- __ sub_32(count, count, 4/bytes_per_count);
- load_one(tmp, from, 4, forward);
- store_one(tmp, to, 4, forward);
- }
- __ BIND(Laligned_by_8);
- #else // AARCH64
if (bytes_per_count < 8) {
Label L_align_src;
__ BIND(L_align_src);
__ tst(from, 7);
// ne => not aligned: copy one element and (if bytes_per_count < 4) loop
--- 1721,1730 ----
*** 2376,2386 ****
store_one(tmp, to, bytes_per_count, forward, ne);
if (bytes_per_count < 4) {
__ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
}
}
- #endif // AARCH64
return 7/bytes_per_count;
}
// Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
//
--- 1733,1742 ----
*** 2396,2426 ****
// shifts 'from' and 'to'
void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
assert_different_registers(from, to, count, tmp);
__ align(OptoLoopAlignment);
- #ifdef AARCH64
- Label L_small_array_done, L_small_array_loop;
- __ BIND(entry);
- __ cbz_32(count, L_small_array_done);
-
- __ BIND(L_small_array_loop);
- __ subs_32(count, count, 1);
- load_one(tmp, from, bytes_per_count, forward);
- store_one(tmp, to, bytes_per_count, forward);
- __ b(L_small_array_loop, gt);
-
- __ BIND(L_small_array_done);
- #else
Label L_small_loop;
__ BIND(L_small_loop);
store_one(tmp, to, bytes_per_count, forward, al, tmp2);
__ BIND(entry); // entry point
__ subs(count, count, 1);
load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
__ b(L_small_loop, ge);
- #endif // AARCH64
}
// Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
//
// Arguments:
--- 1752,1768 ----
*** 2498,2508 ****
int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
int to_remainder, int bytes_per_count, bool forward) {
assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
! const Register tmp = forward ? R3 : R12; // TODO-AARCH64: on cojoint_short R4 was used for tmp
assert_different_registers(from, to, count, Rval, tmp);
int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
--- 1840,1850 ----
int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
int to_remainder, int bytes_per_count, bool forward) {
assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
! const Register tmp = forward ? R3 : R12;
assert_different_registers(from, to, count, Rval, tmp);
int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
*** 2532,2645 ****
// 'from' must be aligned by wordSize
// 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
// shifts 'to' by the number of copied bytes
//
// Scratches 'from', 'count', R3 and R12.
! // On AArch64 also scratches R4-R10, on 32-bit ARM saves them to use.
int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
int min_copy = 0;
// Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
// then the remainder of 'to' divided by wordSize is one of elements of {seq}.
- #ifdef AARCH64
- // TODO-AARCH64: simplify, tune
-
- load_one(Rval, from, wordSize, forward);
-
- Label L_loop_finished;
-
- switch (bytes_per_count) {
- case 4:
- min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
- break;
- case 2:
- {
- Label L2, L4, L6;
-
- __ tbz(to, 1, L4);
- __ tbz(to, 2, L2);
-
- __ BIND(L6);
- int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L2);
- int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L4);
- int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
-
- min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6);
- break;
- }
- case 1:
- {
- Label L1, L2, L3, L4, L5, L6, L7;
- Label L15, L26;
- Label L246;
-
- __ tbz(to, 0, L246);
- __ tbz(to, 1, L15);
- __ tbz(to, 2, L3);
-
- __ BIND(L7);
- int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L246);
- __ tbnz(to, 1, L26);
-
- __ BIND(L4);
- int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L15);
- __ tbz(to, 2, L1);
-
- __ BIND(L5);
- int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L3);
- int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L26);
- __ tbz(to, 2, L2);
-
- __ BIND(L6);
- int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L1);
- int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
- __ b(L_loop_finished);
-
- __ BIND(L2);
- int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
-
-
- min_copy = MAX2(min_copy1, min_copy2);
- min_copy = MAX2(min_copy, min_copy3);
- min_copy = MAX2(min_copy, min_copy4);
- min_copy = MAX2(min_copy, min_copy5);
- min_copy = MAX2(min_copy, min_copy6);
- min_copy = MAX2(min_copy, min_copy7);
- break;
- }
- default:
- ShouldNotReachHere();
- break;
- }
- __ BIND(L_loop_finished);
-
- #else
__ push(RegisterSet(R4,R10));
load_one(Rval, from, wordSize, forward);
switch (bytes_per_count) {
case 2:
--- 1874,1893 ----
// 'from' must be aligned by wordSize
// 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
// shifts 'to' by the number of copied bytes
//
// Scratches 'from', 'count', R3 and R12.
! // R4-R10 are saved and restored around their use.
int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
int min_copy = 0;
// Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
// then the remainder of 'to' divided by wordSize is one of elements of {seq}.
__ push(RegisterSet(R4,R10));
load_one(Rval, from, wordSize, forward);
switch (bytes_per_count) {
case 2:
*** 2692,2702 ****
ShouldNotReachHere();
break;
}
__ pop(RegisterSet(R4,R10));
- #endif // AARCH64
return min_copy;
}
#ifndef PRODUCT
--- 1940,1949 ----
*** 2774,2784 ****
// aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop.
const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
Label L_small_array;
__ cmp_32(count, small_copy_limit);
! __ b(L_small_array, le); // TODO-AARCH64: le vs lt
// Otherwise proceed with large implementation.
bool from_is_aligned = (bytes_per_count >= 8);
if (aligned && forward && (HeapWordSize % 8 == 0)) {
--- 2021,2031 ----
// aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop.
const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
Label L_small_array;
__ cmp_32(count, small_copy_limit);
! __ b(L_small_array, le);
// Otherwise proceed with large implementation.
bool from_is_aligned = (bytes_per_count >= 8);
if (aligned && forward && (HeapWordSize % 8 == 0)) {
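For concreteness, with the 32-bit ARM word size of 4 bytes the limit above works out as follows (illustrative arithmetic, not part of the patch):

  // small_copy_limit = (8*wordSize + 7) / bytes_per_count
  //   jbyte  (bytes_per_count = 1): (32 + 7) / 1 = 39 elements
  //   jshort (bytes_per_count = 2): (32 + 7) / 2 = 19 elements
  //   jint   (bytes_per_count = 4): (32 + 7) / 4 =  9 elements

Counts at or below the limit take the small-array loop; larger counts are guaranteed to survive the up-to-7-byte source alignment step and still feed at least one iteration of the 8-word aligned loop.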
*** 2862,2872 ****
// Arguments:
// to: destination pointer after copying.
// if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
// count: total number of copied elements, 32-bit int
//
! // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers.
void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) {
assert_different_registers(to, count, tmp);
if (forward) {
// 'to' is upper bound of the modified region
--- 2109,2119 ----
// Arguments:
// to: destination pointer after copying.
// if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
// count: total number of copied elements, 32-bit int
//
! // Blows all volatile (R0-R3, Rtemp, LR) and 'to', 'count', 'tmp' registers.
void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) {
assert_different_registers(to, count, tmp);
if (forward) {
// 'to' is upper bound of the modified region
*** 2881,2896 ****
if (status) {
__ mov(R0, 0); // OK
}
- #ifdef AARCH64
- __ raw_pop(LR, ZR);
- __ ret();
- #else
__ pop(PC);
- #endif // AARCH64
}
// Generate stub for assign-compatible oop copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
--- 2128,2138 ----
*** 2937,2951 ****
const Register saved_count = LR;
const int callee_saved_regs = 3; // R0-R2
// LR is used later to save barrier args
- #ifdef AARCH64
- __ raw_push(LR, ZR);
- #else
__ push(LR);
- #endif // AARCH64
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (disjoint) {
decorators |= ARRAYCOPY_DISJOINT;
}
--- 2179,2189 ----
*** 3019,3035 ****
oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators);
}
if (!to_is_aligned) {
- // !to_is_aligned <=> UseCompressedOops && AArch64
__ BIND(L_unaligned_dst);
- #ifdef AARCH64
- assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops");
- #else
ShouldNotReachHere();
- #endif // AARCH64
int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators);
}
--- 2257,2268 ----
*** 3058,3071 ****
const Register R3_bits = R3; // test copy of low bits
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- #ifdef AARCH64
- __ NOT_IMPLEMENTED();
- start = NULL;
- #else
const Register tmp = Rtemp;
// bump this on entry, not on exit:
inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp);
--- 2291,2300 ----
*** 3083,3093 ****
__ tst(R3_bits, BytesPerShort-1);
__ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq);
__ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq);
__ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp);
- #endif
return start;
}
// Helper for generating a dynamic type check.
// Smashes only the given temp registers.
--- 2312,2321 ----
*** 3183,3193 ****
// Arguments for generated stub:
// from: R0
// to: R1
// count: R2 treated as signed 32-bit int
// ckoff: R3 (super_check_offset)
! // ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass)
// ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit)
//
address generate_checkcast_copy(const char * name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
--- 2411,2421 ----
// Arguments for generated stub:
// from: R0
// to: R1
// count: R2 treated as signed 32-bit int
// ckoff: R3 (super_check_offset)
! // ckval: SP[0] (super_klass)
// ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit)
//
address generate_checkcast_copy(const char * name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
*** 3198,3261 ****
const Register count = R2; // elements count
const Register R3_ckoff = R3; // super_check_offset
const Register R4_ckval = R4; // super_klass
! const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently
Label load_element, store_element, do_epilogue, fail;
BLOCK_COMMENT("Entry:");
__ zap_high_non_significant_bits(R2);
- #ifdef AARCH64
- __ raw_push(LR, ZR);
- __ raw_push(R19, R20);
- #else
int pushed = 0;
__ push(LR);
pushed+=1;
- #endif // AARCH64
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs);
- #ifndef AARCH64
const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
__ push(caller_saved_regs);
assert(caller_saved_regs.size() == 6, "check the count");
pushed+=6;
__ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack
- #endif // !AARCH64
// Save arguments for barrier generation (after the pre barrier):
// - must be a caller saved register and not LR
// - ARM32: avoid R10 in case RThread is needed
! const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11);
! #ifdef AARCH64
! __ mov_w(saved_count, count);
! __ cbnz_w(count, load_element); // and test count
! #else
__ movs(saved_count, count); // and test count
__ b(load_element,ne);
- #endif // AARCH64
// nothing to copy
__ mov(R0, 0);
- #ifdef AARCH64
- __ raw_pop(R19, R20);
- __ raw_pop(LR, ZR);
- __ ret();
- #else
__ pop(caller_saved_regs);
__ pop(PC);
- #endif // AARCH64
// ======== begin loop ========
// (Loop is rotated; its entry is load_element.)
__ align(OptoLoopAlignment);
__ BIND(store_element);
--- 2426,2471 ----
const Register count = R2; // elements count
const Register R3_ckoff = R3; // super_check_offset
const Register R4_ckval = R4; // super_klass
! const int callee_saved_regs = 4; // LR saved differently
Label load_element, store_element, do_epilogue, fail;
BLOCK_COMMENT("Entry:");
__ zap_high_non_significant_bits(R2);
int pushed = 0;
__ push(LR);
pushed+=1;
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs);
const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
__ push(caller_saved_regs);
assert(caller_saved_regs.size() == 6, "check the count");
pushed+=6;
__ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack
// Save arguments for barrier generation (after the pre barrier):
// - must be a caller saved register and not LR
// - ARM32: avoid R10 in case RThread is needed
! const Register saved_count = altFP_7_11;
__ movs(saved_count, count); // and test count
__ b(load_element,ne);
// nothing to copy
__ mov(R0, 0);
__ pop(caller_saved_regs);
__ pop(PC);
// ======== begin loop ========
// (Loop is rotated; its entry is load_element.)
__ align(OptoLoopAlignment);
__ BIND(store_element);
*** 3288,3298 ****
// Note: fail marked by the fact that count differs from saved_count
__ BIND(do_epilogue);
! Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved
Label L_not_copied;
__ subs_32(copied, saved_count, count); // copied count (in saved reg)
__ b(L_not_copied, eq); // nothing was copied, skip post barrier
__ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
--- 2498,2508 ----
// Note: fail marked by the fact that count differs from saved_count
__ BIND(do_epilogue);
! Register copied = R4; // saved
Label L_not_copied;
__ subs_32(copied, saved_count, count); // copied count (in saved reg)
__ b(L_not_copied, eq); // nothing was copied, skip post barrier
__ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
*** 3304,3324 ****
inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12);
__ BIND(L_not_copied);
__ cmp_32(copied, saved_count); // values preserved in saved registers
- #ifdef AARCH64
- __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied)
- __ raw_pop(R19, R20);
- __ raw_pop(LR, ZR);
- __ ret();
- #else
__ mov(R0, 0, eq); // 0 if all copied
__ mvn(R0, copied, ne); // else NOT(copied)
__ pop(caller_saved_regs);
__ pop(PC);
- #endif // AARCH64
return start;
}
// Perform range checks on the proposed arraycopy.
--- 2514,2527 ----
*** 3358,3368 ****
// Input:
// R0 - src oop
// R1 - src_pos (32-bit int)
// R2 - dst oop
// R3 - dst_pos (32-bit int)
! // R4 (AArch64) / SP[0] (32-bit ARM) - element count (32-bit int)
//
// Output: (32-bit int)
// R0 == 0 - success
// R0 < 0 - need to call System.arraycopy
//
--- 2561,2571 ----
// Input:
// R0 - src oop
// R1 - src_pos (32-bit int)
// R2 - dst oop
// R3 - dst_pos (32-bit int)
! // SP[0] - element count (32-bit int)
//
// Output: (32-bit int)
// R0 == 0 - success
// R0 < 0 - need to call System.arraycopy
//
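// Illustrative caller-side view (an assumption about usage, not generated code):
//
//   int r = generic_arraycopy(src, src_pos, dst, dst_pos, length);
//   if (r != 0) {
//     // r < 0: the stub declined; fall back to the System.arraycopy slow path
//   }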
*** 3376,3411 ****
const Register dst_pos = R3; // destination position
// registers used as temp
const Register R5_src_klass = R5; // source array klass
const Register R6_dst_klass = R6; // destination array klass
! const Register R_lh = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler
const Register R8_temp = R8;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ zap_high_non_significant_bits(R1);
__ zap_high_non_significant_bits(R3);
__ zap_high_non_significant_bits(R4);
- #ifndef AARCH64
int pushed = 0;
const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
__ push(saved_regs);
assert(saved_regs.size() == 6, "check the count");
pushed+=6;
- #endif // !AARCH64
// bump this on entry, not on exit:
inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12);
const Register length = R4; // elements count
- #ifndef AARCH64
__ ldr(length, Address(SP,4*pushed));
- #endif // !AARCH64
//-----------------------------------------------------------------------
// Assembler stubs will be used for this call to arraycopy
// if the following conditions are met:
--- 2579,2610 ----
const Register dst_pos = R3; // destination position
// registers used as temp
const Register R5_src_klass = R5; // source array klass
const Register R6_dst_klass = R6; // destination array klass
! const Register R_lh = altFP_7_11; // layout handler
const Register R8_temp = R8;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ zap_high_non_significant_bits(R1);
__ zap_high_non_significant_bits(R3);
__ zap_high_non_significant_bits(R4);
int pushed = 0;
const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
__ push(saved_regs);
assert(saved_regs.size() == 6, "check the count");
pushed+=6;
// bump this on entry, not on exit:
inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12);
const Register length = R4; // elements count
__ ldr(length, Address(SP,4*pushed));
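// Worked offset, for illustration: the element count is the fifth argument and so
// arrives at SP[0] on 32-bit ARM; after pushing the 6 saved_regs above it is read
// back from SP + 4 * pushed == SP + 24.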
//-----------------------------------------------------------------------
// Assembler stubs will be used for this call to arraycopy
// if the following conditions are met:
*** 3494,3540 ****
const Register count = R2; // elements count
// 'from', 'to', 'count' registers should be set in this order
// since they are the same as 'src', 'src_pos', 'dst'.
- #ifdef AARCH64
-
- BLOCK_COMMENT("choose copy loop based on element size and scale indexes");
- Label Lbyte, Lshort, Lint, Llong;
-
- __ cbz(R12_elsize, Lbyte);
-
- assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be");
- __ cmp(R12_elsize, LogBytesPerInt);
- __ b(Lint, eq);
- __ b(Llong, gt);
-
- __ BIND(Lshort);
- __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort);
- __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerShort);
- __ mov(count, length);
- __ b(StubRoutines::_jshort_arraycopy);
-
- __ BIND(Lint);
- __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt);
- __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerInt);
- __ mov(count, length);
- __ b(StubRoutines::_jint_arraycopy);
-
- __ BIND(Lbyte);
- __ add_ptr_scaled_int32(from, src, src_pos, 0);
- __ add_ptr_scaled_int32(to, dst, dst_pos, 0);
- __ mov(count, length);
- __ b(StubRoutines::_jbyte_arraycopy);
-
- __ BIND(Llong);
- __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong);
- __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerLong);
- __ mov(count, length);
- __ b(StubRoutines::_jlong_arraycopy);
-
- #else // AARCH64
BLOCK_COMMENT("scale indexes to element size");
__ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr
__ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr
--- 2693,2702 ----
*** 3554,3564 ****
__ cmp(R12_elsize, LogBytesPerInt);
__ b(StubRoutines::_jint_arraycopy,eq);
__ b(StubRoutines::_jlong_arraycopy);
- #endif // AARCH64
}
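// Illustrative summary of the dispatch above (a sketch, not generated code):
// R12_elsize holds log2 of the element size taken from the array layout helper
// and selects the matching primitive copy stub:
//
//   switch (log2_elsize) {
//     case 0: goto jbyte_arraycopy;    // 1-byte elements
//     case 1: goto jshort_arraycopy;   // 2-byte elements
//     case 2: goto jint_arraycopy;     // 4-byte elements
//     case 3: goto jlong_arraycopy;    // 8-byte elements
//   }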
// ObjArrayKlass
__ BIND(L_objArray);
// live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length
--- 2716,2725 ----
*** 3584,3596 ****
__ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr
__ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr
__ BIND(L_plain_copy);
__ mov(count, length);
- #ifndef AARCH64
__ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
- #endif // !AARCH64
__ b(StubRoutines::_oop_arraycopy);
}
{
__ BIND(L_checkcast_copy);
--- 2745,2755 ----
*** 3626,3657 ****
// Generate the type check.
int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset));
generate_type_check(R5_src_klass, sco_temp, R6_dst_klass,
R8_temp, R9,
! AARCH64_ONLY(R10) NOT_AARCH64(R12),
L_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header.
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
// the checkcast_copy loop needs two extra arguments:
! const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3);
__ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass
- #ifndef AARCH64
__ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
__ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument
- #endif // !AARCH64
__ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass
__ b(StubRoutines::_checkcast_arraycopy);
}
__ BIND(L_failed);
- #ifndef AARCH64
__ pop(saved_regs);
- #endif // !AARCH64
__ mvn(R0, 0); // failure, with 0 copied
__ ret();
return start;
}
--- 2785,2812 ----
// Generate the type check.
int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset));
generate_type_check(R5_src_klass, sco_temp, R6_dst_klass,
R8_temp, R9,
! R12,
L_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header.
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
// the checkcast_copy loop needs two extra arguments:
! const Register Rdst_elem_klass = R3;
__ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass
__ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
__ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument
__ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass
__ b(StubRoutines::_checkcast_arraycopy);
}
__ BIND(L_failed);
__ pop(saved_regs);
__ mvn(R0, 0); // failure, with 0 copied
__ ret();
return start;
}
*** 3681,3695 ****
case 4: // int32_t
__ ldr_s32(R1, Address(R0));
break;
case 8: // int64_t
- #ifdef AARCH64
- __ ldr(R1, Address(R0));
- #else
Unimplemented();
- #endif // AARCH64
break;
default:
ShouldNotReachHere();
}
--- 2836,2846 ----
*** 3763,4233 ****
StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy");
}
- #ifndef AARCH64
#define COMPILE_CRYPTO
#include "stubRoutinesCrypto_arm.cpp"
- #else
-
- #ifdef COMPILER2
- // Arguments:
- //
- // Inputs:
- // c_rarg0 - source byte array address
- // c_rarg1 - destination byte array address
- // c_rarg2 - K (key) in little endian int array
- //
- address generate_aescrypt_encryptBlock() {
- __ align(CodeEntryAlignment);
- StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
-
- Label L_doLast;
-
- const Register from = c_rarg0; // source array address
- const Register to = c_rarg1; // destination array address
- const Register key = c_rarg2; // key array address
- const Register keylen = R8;
-
- address start = __ pc();
- __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
- __ mov(FP, SP);
-
- __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-
- __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
-
- __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
-
- int quad = 1;
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
- __ aese(V0, V1);
- __ aesmc(V0, V0);
- __ aese(V0, V2);
- __ aesmc(V0, V0);
- __ aese(V0, V3);
- __ aesmc(V0, V0);
- __ aese(V0, V4);
- __ aesmc(V0, V0);
-
- __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
- __ aese(V0, V1);
- __ aesmc(V0, V0);
- __ aese(V0, V2);
- __ aesmc(V0, V0);
- __ aese(V0, V3);
- __ aesmc(V0, V0);
- __ aese(V0, V4);
- __ aesmc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ cmp_w(keylen, 44);
- __ b(L_doLast, eq);
-
- __ aese(V0, V1);
- __ aesmc(V0, V0);
- __ aese(V0, V2);
- __ aesmc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ cmp_w(keylen, 52);
- __ b(L_doLast, eq);
-
- __ aese(V0, V1);
- __ aesmc(V0, V0);
- __ aese(V0, V2);
- __ aesmc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ BIND(L_doLast);
-
- __ aese(V0, V1);
- __ aesmc(V0, V0);
- __ aese(V0, V2);
-
- __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ mov(R0, 0);
-
- __ mov(SP, FP);
- __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
- __ ret(LR);
-
- return start;
- }
-
- // Arguments:
- //
- // Inputs:
- // c_rarg0 - source byte array address
- // c_rarg1 - destination byte array address
- // c_rarg2 - K (key) in little endian int array
- //
- address generate_aescrypt_decryptBlock() {
- assert(UseAES, "need AES instructions and misaligned SSE support");
- __ align(CodeEntryAlignment);
- StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
- Label L_doLast;
-
- const Register from = c_rarg0; // source array address
- const Register to = c_rarg1; // destination array address
- const Register key = c_rarg2; // key array address
- const Register keylen = R8;
-
- address start = __ pc();
- __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
- __ mov(FP, SP);
-
- __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-
- __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
-
- __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
-
- int quad = 1;
- __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
- __ aesd(V0, V1);
- __ aesimc(V0, V0);
- __ aesd(V0, V2);
- __ aesimc(V0, V0);
- __ aesd(V0, V3);
- __ aesimc(V0, V0);
- __ aesd(V0, V4);
- __ aesimc(V0, V0);
-
- __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
- __ aesd(V0, V1);
- __ aesimc(V0, V0);
- __ aesd(V0, V2);
- __ aesimc(V0, V0);
- __ aesd(V0, V3);
- __ aesimc(V0, V0);
- __ aesd(V0, V4);
- __ aesimc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ cmp_w(keylen, 44);
- __ b(L_doLast, eq);
-
- __ aesd(V0, V1);
- __ aesimc(V0, V0);
- __ aesd(V0, V2);
- __ aesimc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ cmp_w(keylen, 52);
- __ b(L_doLast, eq);
-
- __ aesd(V0, V1);
- __ aesimc(V0, V0);
- __ aesd(V0, V2);
- __ aesimc(V0, V0);
-
- __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ BIND(L_doLast);
-
- __ aesd(V0, V1);
- __ aesimc(V0, V0);
- __ aesd(V0, V2);
-
- __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ mov(R0, 0);
-
- __ mov(SP, FP);
- __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
- __ ret(LR);
-
-
- return start;
- }
-
- // Arguments:
- //
- // Inputs:
- // c_rarg0 - source byte array address
- // c_rarg1 - destination byte array address
- // c_rarg2 - K (key) in little endian int array
- // c_rarg3 - r vector byte array address
- // c_rarg4 - input length
- //
- // Output:
- // x0 - input length
- //
- address generate_cipherBlockChaining_encryptAESCrypt() {
- assert(UseAES, "need AES instructions and misaligned SSE support");
- __ align(CodeEntryAlignment);
- StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
-
- Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
-
- const Register from = c_rarg0; // source array address
- const Register to = c_rarg1; // destination array address
- const Register key = c_rarg2; // key array address
- const Register rvec = c_rarg3; // r byte array initialized from initvector array address
- // and left with the results of the last encryption block
- const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
- const Register keylen = R8;
-
- address start = __ pc();
- __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
- __ mov(FP, SP);
-
- __ mov(R9, len_reg);
- __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-
- __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ cmp_w(keylen, 52);
- __ b(L_loadkeys_44, cc);
- __ b(L_loadkeys_52, eq);
-
- __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
-
- int quad = 1;
- __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
- __ BIND(L_loadkeys_52);
- __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
- __ BIND(L_loadkeys_44);
- __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
- __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
- __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ BIND(L_aes_loop);
- __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ b(L_rounds_44, cc);
- __ b(L_rounds_52, eq);
-
- __ aese(V0, V17);
- __ aesmc(V0, V0);
- __ aese(V0, V18);
- __ aesmc(V0, V0);
- __ BIND(L_rounds_52);
- __ aese(V0, V19);
- __ aesmc(V0, V0);
- __ aese(V0, V20);
- __ aesmc(V0, V0);
- __ BIND(L_rounds_44);
- __ aese(V0, V21);
- __ aesmc(V0, V0);
- __ aese(V0, V22);
- __ aesmc(V0, V0);
- __ aese(V0, V23);
- __ aesmc(V0, V0);
- __ aese(V0, V24);
- __ aesmc(V0, V0);
- __ aese(V0, V25);
- __ aesmc(V0, V0);
- __ aese(V0, V26);
- __ aesmc(V0, V0);
- __ aese(V0, V27);
- __ aesmc(V0, V0);
- __ aese(V0, V28);
- __ aesmc(V0, V0);
- __ aese(V0, V29);
- __ aesmc(V0, V0);
- __ aese(V0, V30);
- __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ sub(len_reg, len_reg, 16);
- __ cbnz(len_reg, L_aes_loop);
-
- __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ mov(R0, R9);
-
- __ mov(SP, FP);
- __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
- __ ret(LR);
-
- return start;
- }
-
- // Arguments:
- //
- // Inputs:
- // c_rarg0 - source byte array address
- // c_rarg1 - destination byte array address
- // c_rarg2 - K (key) in little endian int array
- // c_rarg3 - r vector byte array address
- // c_rarg4 - input length
- //
- // Output:
- // R0 - input length
- //
- address generate_cipherBlockChaining_decryptAESCrypt() {
- assert(UseAES, "need AES instructions and misaligned SSE support");
- __ align(CodeEntryAlignment);
- StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
-
- Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
-
- const Register from = c_rarg0; // source array address
- const Register to = c_rarg1; // destination array address
- const Register key = c_rarg2; // key array address
- const Register rvec = c_rarg3; // r byte array initialized from initvector array address
- // and left with the results of the last encryption block
- const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
- const Register keylen = R8;
-
- address start = __ pc();
- __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
- __ mov(FP, SP);
-
- __ mov(R9, len_reg);
- __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-
- __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
-
- int quad = 1;
- __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ cmp_w(keylen, 52);
- __ b(L_loadkeys_44, cc);
- __ b(L_loadkeys_52, eq);
-
- __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
- __ BIND(L_loadkeys_52);
- __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
- __ BIND(L_loadkeys_44);
- __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
- __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
- __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
- __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
- __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ BIND(L_aes_loop);
- __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ b(L_rounds_44, cc);
- __ b(L_rounds_52, eq);
-
- __ aesd(V0, V17);
- __ aesimc(V0, V0);
- __ aesd(V0, V18);
- __ aesimc(V0, V0);
- __ BIND(L_rounds_52);
- __ aesd(V0, V19);
- __ aesimc(V0, V0);
- __ aesd(V0, V20);
- __ aesimc(V0, V0);
- __ BIND(L_rounds_44);
- __ aesd(V0, V21);
- __ aesimc(V0, V0);
- __ aesd(V0, V22);
- __ aesimc(V0, V0);
- __ aesd(V0, V23);
- __ aesimc(V0, V0);
- __ aesd(V0, V24);
- __ aesimc(V0, V0);
- __ aesd(V0, V25);
- __ aesimc(V0, V0);
- __ aesd(V0, V26);
- __ aesimc(V0, V0);
- __ aesd(V0, V27);
- __ aesimc(V0, V0);
- __ aesd(V0, V28);
- __ aesimc(V0, V0);
- __ aesd(V0, V29);
- __ aesimc(V0, V0);
- __ aesd(V0, V30);
- __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
- __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
- __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
-
- __ sub(len_reg, len_reg, 16);
- __ cbnz(len_reg, L_aes_loop);
-
- __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
-
- __ mov(R0, R9);
-
- __ mov(SP, FP);
- __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
- __ ret(LR);
-
- return start;
- }
-
- #endif // COMPILER2
- #endif // AARCH64
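// Illustrative note on the AES stubs removed above (an inference, not generated
// code): the expanded key length in 32-bit words is 4 * (rounds + 1), so the
// compares against 44 and 52 distinguish the key sizes:
//   keylen == 44  =>  10 rounds (AES-128)
//   keylen == 52  =>  12 rounds (AES-192)
//   keylen == 60  =>  14 rounds (AES-256, the fall-through case)
// which is why the extra pairs of rounds are skipped for shorter keys.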
private:
#undef __
#define __ masm->
--- 2914,2925 ----
*** 4296,4316 ****
StubRoutines::_catch_exception_entry = generate_catch_exception();
// stub for throwing stack overflow error used both by interpreter and compiler
StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
- #ifndef AARCH64
// integer division used both by interpreter and compiler
StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem();
StubRoutines::_atomic_add_entry = generate_atomic_add();
StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
StubRoutines::_atomic_load_long_entry = generate_atomic_load_long();
StubRoutines::_atomic_store_long_entry = generate_atomic_store_long();
- #endif // !AARCH64
}
void generate_all() {
// Generates all stubs and initializes the entry points
--- 2988,3006 ----
*** 4336,4363 ****
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
&StubRoutines::_safefetch32_continuation_pc);
- #ifdef AARCH64
- generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry,
- &StubRoutines::_safefetchN_fault_pc,
- &StubRoutines::_safefetchN_continuation_pc);
- #ifdef COMPILER2
- if (UseAESIntrinsics) {
- StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
- StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
- StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
- StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
- }
- #endif
- #else
assert (sizeof(int) == wordSize, "32-bit architecture");
StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry;
StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc;
StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
- #endif // AARCH64
#ifdef COMPILE_CRYPTO
// generate AES intrinsics code
if (UseAESIntrinsics) {
aes_init();
--- 3026,3039 ----