src/hotspot/cpu/arm/stubGenerator_arm.cpp

*** 83,107 **** #define IMX515_ARRAYCOPY_CONFIG 2 // Hard coded choices (XXX: could be changed to a command line option) #define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG - #ifdef AARCH64 - #define ArmCopyCacheLineSize 64 - #else #define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains - #endif // AARCH64 - - // TODO-AARCH64: tune and revise AArch64 arraycopy optimizations // configuration for each kind of loop typedef struct { int pld_distance; // prefetch distance (0 => no prefetch, <0: prefetch_before); - #ifndef AARCH64 bool split_ldm; // if true, split each STM in STMs with fewer registers bool split_stm; // if true, split each LTM in LTMs with fewer registers - #endif // !AARCH64 } arraycopy_loop_config; // configuration for all loops typedef struct { // const char *description; --- 83,99 ----
*** 112,129 **** } arraycopy_platform_config; // configured platforms static arraycopy_platform_config arraycopy_configurations[] = { // configuration parameters for arraycopy loops - #ifdef AARCH64 - { - {-256 }, // forward aligned - {-128 }, // backward aligned - {-256 }, // forward shifted - {-128 } // backward shifted - } - #else // Configurations were chosen based on manual analysis of benchmark // results, minimizing overhead with respect to best results on the // different test cases. --- 104,113 ----
*** 169,179 **** {-160, false, false }, // forward aligned {-160, false, false }, // backward aligned {-160, false, false }, // forward shifted {-160, true, true } // backward shifted } - #endif // AARCH64 }; class StubGenerator: public StubCodeGenerator { #ifdef PRODUCT --- 153,162 ----
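
With the AArch64 entries gone, each retained configuration entry carries three knobs: pld_distance (negative selects the prefetch_before scheme, positive selects prefetch_after, zero disables prefetching) plus the split_ldm/split_stm flags that break a single LDM/STM into two smaller ones. A minimal stand-alone sketch of how such a table is consumed (plain C++, not HotSpot code; the struct and values merely mirror the entries shown above):

    #include <cstdio>

    // Illustrative mirror of one platform entry in arraycopy_configurations[].
    struct LoopConfig {
      int  pld_distance; // < 0: prefetch_before, > 0: prefetch_after, 0: no prefetch
      bool split_ldm;    // split each LDM into two smaller LDMs
      bool split_stm;    // split each STM into two smaller STMs
    };

    struct PlatformConfig {
      LoopConfig forward_aligned, backward_aligned, forward_shifted, backward_shifted;
    };

    static const PlatformConfig demo_config = {
      {-160, false, false},  // forward aligned
      {-160, false, false},  // backward aligned
      {-160, false, false},  // forward shifted
      {-160, true,  true }   // backward shifted
    };

    int main() {
      const LoopConfig& c = demo_config.forward_aligned;
      const bool prefetch_before = c.pld_distance < 0;   // same sign test the stubs use
      const bool prefetch_after  = c.pld_distance > 0;
      std::printf("pld=%d before=%d after=%d split_ldm=%d split_stm=%d\n",
                  c.pld_distance, (int)prefetch_before, (int)prefetch_after,
                  (int)c.split_ldm, (int)c.split_stm);
      return 0;
    }
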
*** 188,291 **** address generate_call_stub(address& return_address) { StubCodeMark mark(this, "StubRoutines", "call_stub"); address start = __ pc(); - #ifdef AARCH64 - const int saved_regs_size = 192; - - __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed)); - __ mov(FP, SP); - - int sp_offset = 16; - assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code"); - __ stp(R0, ZR, Address(SP, sp_offset)); sp_offset += 16; - - const int saved_result_and_result_type_offset = sp_offset; - __ stp(R1, R2, Address(SP, sp_offset)); sp_offset += 16; - __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16; - __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; - __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; - __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; - __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; - - __ stp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; - __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; - __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; - __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; - assert (sp_offset == saved_regs_size, "adjust this code"); - - __ mov(Rmethod, R3); - __ mov(Rthread, R7); - __ reinit_heapbase(); - - { // Pass parameters - Label done_parameters, pass_parameters; - - __ mov(Rparams, SP); - __ cbz_w(R6, done_parameters); - - __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord); - __ align_reg(SP, Rtemp, StackAlignmentInBytes); - __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord); - - __ bind(pass_parameters); - __ subs_w(R6, R6, 1); - __ ldr(Rtemp, Address(R5, wordSize, post_indexed)); - __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed)); - __ b(pass_parameters, ne); - - __ bind(done_parameters); - - #ifdef ASSERT - { - Label L; - __ cmp(SP, Rparams); - __ b(L, eq); - __ stop("SP does not match Rparams"); - __ bind(L); - } - #endif - } - - __ mov(Rsender_sp, SP); - __ blr(R4); - return_address = __ pc(); - - __ mov(SP, FP); - - __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset)); - - { // Handle return value - Label cont; - __ str(R0, Address(R1)); - - __ cmp_w(R2, T_DOUBLE); - __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne); - __ b(cont, ne); - - __ str_d(V0, Address(R1)); - __ bind(cont); - } - - sp_offset = saved_result_and_result_type_offset + 16; - __ ldp(R19, R20, Address(SP, sp_offset)); sp_offset += 16; - __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; - __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; - __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; - __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; - - __ ldp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; - __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; - __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; - __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; - assert (sp_offset == saved_regs_size, "adjust this code"); - - __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed)); - __ ret(); - - #else // AARCH64 assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code"); __ mov(Rtemp, SP); __ push(RegisterSet(FP) | RegisterSet(LR)); --- 171,180 ----
*** 356,366 **** #ifndef __SOFTFP__ __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback); #endif __ pop(RegisterSet(FP) | RegisterSet(PC)); - #endif // AARCH64 return start; } // (in) Rexception_obj: exception oop --- 245,254 ----
*** 404,414 **** __ jump(R0); // handler is returned in R0 by runtime function return start; } - #ifndef AARCH64 // Integer division shared routine // Input: // R0 - dividend // R2 - divisor --- 292,301 ----
*** 793,803 **** return start; } - #endif // AARCH64 #ifdef COMPILER2 // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super ); // Arguments : // --- 680,689 ----
*** 881,896 **** __ raw_pop(saved_set); __ ret(); // Return failure __ bind(L_fail); - #ifdef AARCH64 - // count_temp is 0, can't use ZR here - __ adds(R0, count_temp, 1); // sets the flags - #else __ movs(R0, 1); // sets the flags - #endif __ raw_pop(saved_set); __ ret(); } return start; } --- 767,777 ----
*** 923,937 **** assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7); Label exit, error; InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr()); - #ifdef AARCH64 - __ mrs(flags, Assembler::SysReg_NZCV); - #else __ mrs(Assembler::CPSR, flags); - #endif // AARCH64 __ ldr_literal(tmp1, verify_oop_count); __ ldr_s32(tmp2, Address(tmp1)); __ add(tmp2, tmp2, 1); __ str_32(tmp2, Address(tmp1)); --- 804,814 ----
*** 954,968 **** __ cbz(klass, error); // if klass is NULL it is broken // return if everything seems ok __ bind(exit); - #ifdef AARCH64 - __ msr(Assembler::SysReg_NZCV, flags); - #else __ msr(Assembler::CPSR_f, flags); - #endif // AARCH64 __ ret(); // handle errors __ bind(error); --- 831,841 ----
*** 1004,1131 **** void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) { const Register from = R0; const Register to = R1; const Register count = R2; const Register to_from = tmp1; // to - from - #ifndef AARCH64 const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size - #endif // AARCH64 assert_different_registers(from, to, count, tmp1, tmp2); // no_overlap version works if 'to' lower (unsigned) than 'from' // and or 'to' more than (count*size) from 'from' BLOCK_COMMENT("Array Overlap Test:"); __ subs(to_from, to, from); - #ifndef AARCH64 if (log2_elem_size != 0) { __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); } - #endif // !AARCH64 if (NOLp == NULL) __ b(no_overlap_target,lo); else __ b((*NOLp), lo); - #ifdef AARCH64 - __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size); - #else __ cmp(to_from, byte_count); - #endif // AARCH64 if (NOLp == NULL) __ b(no_overlap_target, ge); else __ b((*NOLp), ge); } - #ifdef AARCH64 - // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace) - - // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1] - // and increases 'from' by count*wordSize. - void bulk_load_forward(Register from, const Register regs[], int count) { - assert (count > 0 && count % 2 == 0, "count must be positive even number"); - int bytes = count * wordSize; - - int offset = 0; - __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed)); - offset += 2*wordSize; - - for (int i = 2; i < count; i += 2) { - __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset)); - offset += 2*wordSize; - } - - assert (offset == bytes, "must be"); - } - - // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize) - // and increases 'to' by count*wordSize. - void bulk_store_forward(Register to, const Register regs[], int count) { - assert (count > 0 && count % 2 == 0, "count must be positive even number"); - int bytes = count * wordSize; - - int offset = 0; - __ stp(regs[0], regs[1], Address(to, bytes, post_indexed)); - offset += 2*wordSize; - - for (int i = 2; i < count; i += 2) { - __ stp(regs[i], regs[i+1], Address(to, -bytes + offset)); - offset += 2*wordSize; - } - - assert (offset == bytes, "must be"); - } - - // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1] - // and decreases 'from' by count*wordSize. - // Note that the word with lowest address goes to regs[0]. - void bulk_load_backward(Register from, const Register regs[], int count) { - assert (count > 0 && count % 2 == 0, "count must be positive even number"); - int bytes = count * wordSize; - - int offset = 0; - - for (int i = count - 2; i > 0; i -= 2) { - offset += 2*wordSize; - __ ldp(regs[i], regs[i+1], Address(from, -offset)); - } - - offset += 2*wordSize; - __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed)); - assert (offset == bytes, "must be"); - } - - // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to) - // and decreases 'to' by count*wordSize. - // Note that regs[0] value goes into the memory with lowest address. 
- void bulk_store_backward(Register to, const Register regs[], int count) { - assert (count > 0 && count % 2 == 0, "count must be positive even number"); - int bytes = count * wordSize; - - int offset = 0; - - for (int i = count - 2; i > 0; i -= 2) { - offset += 2*wordSize; - __ stp(regs[i], regs[i+1], Address(to, -offset)); - } - - offset += 2*wordSize; - __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed)); - - assert (offset == bytes, "must be"); - } - #endif // AARCH64 - - // TODO-AARCH64: rearrange in-loop prefetches: // probably we should choose between "prefetch-store before or after store", not "before or after load". void prefetch(Register from, Register to, int offset, int to_delta = 0) { __ prefetch_read(Address(from, offset)); - #ifdef AARCH64 - // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120 - // __ prfm(pstl1keep, Address(to, offset + to_delta)); - #endif // AARCH64 } // Generate the inner loop for forward aligned array copy // // Arguments --- 877,912 ----
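
The retained overlap test takes the forward (disjoint) path in two cases: 'to' is below 'from' when compared unsigned, or 'to - from' is at least count << log2_elem_size bytes, meaning the destination begins at or past the end of the source. A minimal host-side model of that predicate (illustrative C++ only; the function name is made up):

    #include <cstdint>
    #include <cstdio>

    // True when a simple ascending (forward) copy is safe, mirroring the two
    // branches of array_overlap_test: 'to' below 'from' (unsigned), or
    // 'to - from' at least 'count << log2_elem_size' bytes.
    static bool forward_copy_safe(uintptr_t from, uintptr_t to,
                                  uint32_t count, int log2_elem_size) {
      uintptr_t to_from    = to - from;  // wraps around just like the subs in the stub
      uintptr_t byte_count = (uintptr_t)count << log2_elem_size;
      return (to < from) || (to_from >= byte_count);
    }

    int main() {
      std::printf("%d\n", (int)forward_copy_safe(0x1000, 0x1004, 16, 2)); // 0: overlap, copy backward
      std::printf("%d\n", (int)forward_copy_safe(0x1000, 0x2000, 16, 2)); // 1: disjoint, copy forward
      return 0;
    }
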
*** 1135,1162 **** // bytes_per_count: number of bytes for each unit of 'count' // // Return the minimum initial value for count // // Notes: ! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) // - 'to' aligned on wordSize // - 'count' must be greater or equal than the returned value // // Increases 'from' and 'to' by count*bytes_per_count. // // Scratches 'count', R3. ! // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). // int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) { assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; int pld_offset = config->pld_distance; const int count_per_loop = bytes_per_loop / bytes_per_count; - #ifndef AARCH64 bool split_read= config->split_ldm; bool split_write= config->split_stm; // XXX optim: use VLDM/VSTM when available (Neon) with PLD // NEONCopyPLD --- 916,942 ---- // bytes_per_count: number of bytes for each unit of 'count' // // Return the minimum initial value for count // // Notes: ! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) // - 'to' aligned on wordSize // - 'count' must be greater or equal than the returned value // // Increases 'from' and 'to' by count*bytes_per_count. // // Scratches 'count', R3. ! // R4-R10 are preserved (saved/restored). // int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) { assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; int pld_offset = config->pld_distance; const int count_per_loop = bytes_per_loop / bytes_per_count; bool split_read= config->split_ldm; bool split_write= config->split_stm; // XXX optim: use VLDM/VSTM when available (Neon) with PLD // NEONCopyPLD
*** 1165,1175 **** // VSTM r0!,{d0-d7} // SUBS r2,r2,#0x40 // BGE NEONCopyPLD __ push(RegisterSet(R4,R10)); - #endif // !AARCH64 const bool prefetch_before = pld_offset < 0; const bool prefetch_after = pld_offset > 0; Label L_skip_pld; --- 945,954 ----
*** 1198,1213 **** prefetch(from, to, offset); offset += ArmCopyCacheLineSize; }; } - #ifdef AARCH64 - const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; - #endif // AARCH64 { - // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes - // 32-bit ARM note: we have tried implementing loop unrolling to skip one // PLD with 64 bytes cache line but the gain was not significant. Label L_copy_loop; __ align(OptoLoopAlignment); --- 977,987 ----
*** 1216,1256 **** if (prefetch_before) { prefetch(from, to, bytes_per_loop + pld_offset); __ BIND(L_skip_pld); } - #ifdef AARCH64 - bulk_load_forward(from, data_regs, 8); - #else if (split_read) { // Split the register set in two sets so that there is less // latency between LDM and STM (R3-R6 available while R7-R10 // still loading) and less register locking issue when iterating // on the first LDM. __ ldmia(from, RegisterSet(R3, R6), writeback); __ ldmia(from, RegisterSet(R7, R10), writeback); } else { __ ldmia(from, RegisterSet(R3, R10), writeback); } - #endif // AARCH64 __ subs_32(count, count, count_per_loop); if (prefetch_after) { prefetch(from, to, pld_offset, bytes_per_loop); } - #ifdef AARCH64 - bulk_store_forward(to, data_regs, 8); - #else if (split_write) { __ stmia(to, RegisterSet(R3, R6), writeback); __ stmia(to, RegisterSet(R7, R10), writeback); } else { __ stmia(to, RegisterSet(R3, R10), writeback); } - #endif // AARCH64 __ b(L_copy_loop, ge); if (prefetch_before) { // the inner loop may end earlier, allowing to skip PLD for the last iterations --- 990,1022 ----
*** 1262,1335 **** // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes // __ add(count, count, ...); // addition useless for the bit tests assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); - #ifdef AARCH64 - assert (bytes_per_loop == 64, "adjust the code below"); - assert (bytes_per_count <= 8, "adjust the code below"); - - { - Label L; - __ tbz(count, exact_log2(32/bytes_per_count), L); - - bulk_load_forward(from, data_regs, 4); - bulk_store_forward(to, data_regs, 4); - - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(16/bytes_per_count), L); - - bulk_load_forward(from, data_regs, 2); - bulk_store_forward(to, data_regs, 2); - - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(8/bytes_per_count), L); - - __ ldr(R3, Address(from, 8, post_indexed)); - __ str(R3, Address(to, 8, post_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 4) { - Label L; - __ tbz(count, exact_log2(4/bytes_per_count), L); - - __ ldr_w(R3, Address(from, 4, post_indexed)); - __ str_w(R3, Address(to, 4, post_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 2) { - Label L; - __ tbz(count, exact_log2(2/bytes_per_count), L); - - __ ldrh(R3, Address(from, 2, post_indexed)); - __ strh(R3, Address(to, 2, post_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 1) { - Label L; - __ tbz(count, 0, L); - - __ ldrb(R3, Address(from, 1, post_indexed)); - __ strb(R3, Address(to, 1, post_indexed)); - - __ bind(L); - } - #else __ tst(count, 16 / bytes_per_count); __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes __ stmia(to, RegisterSet(R3, R6), writeback, ne); __ tst(count, 8 / bytes_per_count); --- 1028,1037 ----
*** 1353,1363 **** __ ldrb(R3, Address(from, 1, post_indexed), ne); __ strb(R3, Address(to, 1, post_indexed), ne); } __ pop(RegisterSet(R4,R10)); - #endif // AARCH64 return count_per_loop; } --- 1055,1064 ----
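
On the retained 32-bit path the aligned forward loop moves eight registers (32 bytes) per LDMIA/STMIA iteration, optionally split into two halves, then drains the remainder with conditional 16/8/4/2/1-byte copies selected by bit tests on the residual count. A rough C++ sketch of that shape for byte-sized elements (illustrative only, not the generated stub):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Structure of generate_forward_aligned_copy_loop, modelled for bytes_per_count == 1:
    // a 32-byte main loop followed by power-of-two tail copies chosen by bit tests.
    static void forward_aligned_copy(uint8_t* to, const uint8_t* from, size_t byte_count) {
      const size_t bytes_per_loop = 32;                  // 8 registers * wordSize on 32-bit ARM
      while (byte_count >= bytes_per_loop) {             // main unrolled loop
        std::memcpy(to, from, bytes_per_loop);
        from += bytes_per_loop; to += bytes_per_loop; byte_count -= bytes_per_loop;
      }
      for (size_t chunk = 16; chunk >= 1; chunk /= 2) {  // tail: tst(count, chunk)
        if (byte_count & chunk) {
          std::memcpy(to, from, chunk);
          from += chunk; to += chunk;
        }
      }
    }
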
*** 1370,1405 **** // bytes_per_count: number of bytes for each unit of 'count' // // Return the minimum initial value for count // // Notes: ! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) // - 'end_to' aligned on wordSize // - 'count' must be greater or equal than the returned value // // Decreases 'end_from' and 'end_to' by count*bytes_per_count. // // Scratches 'count', R3. ! // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). // int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) { assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration const int count_per_loop = bytes_per_loop / bytes_per_count; arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; int pld_offset = config->pld_distance; - #ifndef AARCH64 bool split_read= config->split_ldm; bool split_write= config->split_stm; // See the forward copy variant for additional comments. __ push(RegisterSet(R4,R10)); - #endif // !AARCH64 __ sub_32(count, count, count_per_loop); const bool prefetch_before = pld_offset < 0; const bool prefetch_after = pld_offset > 0; --- 1071,1104 ---- // bytes_per_count: number of bytes for each unit of 'count' // // Return the minimum initial value for count // // Notes: ! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) // - 'end_to' aligned on wordSize // - 'count' must be greater or equal than the returned value // // Decreases 'end_from' and 'end_to' by count*bytes_per_count. // // Scratches 'count', R3. ! // ARM R4-R10 are preserved (saved/restored). // int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) { assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration const int count_per_loop = bytes_per_loop / bytes_per_count; arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; int pld_offset = config->pld_distance; bool split_read= config->split_ldm; bool split_write= config->split_stm; // See the forward copy variant for additional comments. __ push(RegisterSet(R4,R10)); __ sub_32(count, count, count_per_loop); const bool prefetch_before = pld_offset < 0; const bool prefetch_after = pld_offset > 0;
*** 1421,1436 **** prefetch(end_from, end_to, -(wordSize + offset)); offset += ArmCopyCacheLineSize; }; } - #ifdef AARCH64 - const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; - #endif // AARCH64 { - // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes - // 32-bit ARM note: we have tried implementing loop unrolling to skip one // PLD with 64 bytes cache line but the gain was not significant. Label L_copy_loop; __ align(OptoLoopAlignment); --- 1120,1130 ----
*** 1439,1475 **** if (prefetch_before) { prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); __ BIND(L_skip_pld); } - #ifdef AARCH64 - bulk_load_backward(end_from, data_regs, 8); - #else if (split_read) { __ ldmdb(end_from, RegisterSet(R7, R10), writeback); __ ldmdb(end_from, RegisterSet(R3, R6), writeback); } else { __ ldmdb(end_from, RegisterSet(R3, R10), writeback); } - #endif // AARCH64 __ subs_32(count, count, count_per_loop); if (prefetch_after) { prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); } - #ifdef AARCH64 - bulk_store_backward(end_to, data_regs, 8); - #else if (split_write) { __ stmdb(end_to, RegisterSet(R7, R10), writeback); __ stmdb(end_to, RegisterSet(R3, R6), writeback); } else { __ stmdb(end_to, RegisterSet(R3, R10), writeback); } - #endif // AARCH64 __ b(L_copy_loop, ge); if (prefetch_before) { __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); --- 1133,1161 ----
*** 1480,1553 **** // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes // __ add(count, count, ...); // addition useless for the bit tests assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); - #ifdef AARCH64 - assert (bytes_per_loop == 64, "adjust the code below"); - assert (bytes_per_count <= 8, "adjust the code below"); - - { - Label L; - __ tbz(count, exact_log2(32/bytes_per_count), L); - - bulk_load_backward(end_from, data_regs, 4); - bulk_store_backward(end_to, data_regs, 4); - - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(16/bytes_per_count), L); - - bulk_load_backward(end_from, data_regs, 2); - bulk_store_backward(end_to, data_regs, 2); - - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(8/bytes_per_count), L); - - __ ldr(R3, Address(end_from, -8, pre_indexed)); - __ str(R3, Address(end_to, -8, pre_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 4) { - Label L; - __ tbz(count, exact_log2(4/bytes_per_count), L); - - __ ldr_w(R3, Address(end_from, -4, pre_indexed)); - __ str_w(R3, Address(end_to, -4, pre_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 2) { - Label L; - __ tbz(count, exact_log2(2/bytes_per_count), L); - - __ ldrh(R3, Address(end_from, -2, pre_indexed)); - __ strh(R3, Address(end_to, -2, pre_indexed)); - - __ bind(L); - } - - if (bytes_per_count <= 1) { - Label L; - __ tbz(count, 0, L); - - __ ldrb(R3, Address(end_from, -1, pre_indexed)); - __ strb(R3, Address(end_to, -1, pre_indexed)); - - __ bind(L); - } - #else __ tst(count, 16 / bytes_per_count); __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne); __ tst(count, 8 / bytes_per_count); --- 1166,1175 ----
*** 1571,1589 **** __ ldrb(R3, Address(end_from, -1, pre_indexed), ne); __ strb(R3, Address(end_to, -1, pre_indexed), ne); } __ pop(RegisterSet(R4,R10)); - #endif // AARCH64 return count_per_loop; } // Generate the inner loop for shifted forward array copy (unaligned copy). // It can be used when bytes_per_count < wordSize, i.e. ! // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64. // // Arguments // from: start src address, 64 bits aligned // to: start dst address, (now) wordSize aligned // count: number of elements (32-bit int) --- 1193,1210 ---- __ ldrb(R3, Address(end_from, -1, pre_indexed), ne); __ strb(R3, Address(end_to, -1, pre_indexed), ne); } __ pop(RegisterSet(R4,R10)); return count_per_loop; } // Generate the inner loop for shifted forward array copy (unaligned copy). // It can be used when bytes_per_count < wordSize, i.e. ! // byte/short copy // // Arguments // from: start src address, 64 bits aligned // to: start dst address, (now) wordSize aligned // count: number of elements (32-bit int)
*** 1592,1606 **** // lsl_shift: shift applied to 'new' value to set the high bytes of the next write // // Return the minimum initial value for count // // Notes: ! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) // - 'to' aligned on wordSize // - 'count' must be greater or equal than the returned value // - 'lsr_shift' + 'lsl_shift' = BitsPerWord ! // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64 // // Increases 'to' by count*bytes_per_count. // // Scratches 'from' and 'count', R3-R10, R12 // --- 1213,1227 ---- // lsl_shift: shift applied to 'new' value to set the high bytes of the next write // // Return the minimum initial value for count // // Notes: ! // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) // - 'to' aligned on wordSize // - 'count' must be greater or equal than the returned value // - 'lsr_shift' + 'lsl_shift' = BitsPerWord ! // - 'bytes_per_count' is 1 or 2 // // Increases 'to' by count*bytes_per_count. // // Scratches 'from' and 'count', R3-R10, R12 //
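
The core of the shifted copy is word recombination: the not-yet-written part of the previous aligned source word is shifted right by lsr_shift and OR-ed with the next word shifted left by lsl_shift, where lsr_shift + lsl_shift == BitsPerWord. A small C++ illustration for the 32-bit word size and little-endian byte order (hypothetical helper, not the stub itself):

    #include <cstdint>
    #include <cstdio>

    // Combine two aligned 32-bit source words into one destination word when the
    // source is offset by 'offset_bytes' (1..3) from word alignment, as the
    // shifted copy loops do with their lsr/lsl shift pair.
    static uint32_t combine(uint32_t prev, uint32_t next, int offset_bytes) {
      const int lsr_shift = offset_bytes * 8;  // bits already consumed from 'prev'
      const int lsl_shift = 32 - lsr_shift;    // bits taken from 'next'
      return (prev >> lsr_shift) | (next << lsl_shift);
    }

    int main() {
      // Source bytes 0x11 0x22 0x33 0x44 0x55 ... starting one byte past alignment.
      uint32_t w0 = 0x44332211, w1 = 0x88776655;
      std::printf("0x%08x\n", combine(w0, w1, 1));  // prints 0x55443322
      return 0;
    }
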
*** 1620,1633 **** const int count_per_loop = bytes_per_loop / bytes_per_count; arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted; int pld_offset = config->pld_distance; - #ifndef AARCH64 bool split_read= config->split_ldm; bool split_write= config->split_stm; - #endif // !AARCH64 const bool prefetch_before = pld_offset < 0; const bool prefetch_after = pld_offset > 0; Label L_skip_pld, L_last_read, L_done; if (pld_offset != 0) { --- 1241,1252 ----
*** 1664,1679 **** } else { __ cmp_32(count, count_per_loop); __ b(L_last_read, lt); } - #ifdef AARCH64 - const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12}; - __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written - __ subs_32(count, count, count_per_loop); - bulk_load_forward(from, &data_regs[1], 8); - #else // read 32 bytes if (split_read) { // if write is not split, use less registers in first set to reduce locking RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5); RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12; --- 1283,1292 ----
*** 1684,1694 **** } else { __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4 __ subs(count, count, count_per_loop); } - #endif // AARCH64 if (prefetch_after) { // do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP) prefetch(from, to, pld_offset, bytes_per_loop); } --- 1297,1306 ----
*** 1699,1831 **** __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ... __ logical_shift_right(R5, R5, lsr_shift); __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); __ logical_shift_right(R6, R6, lsr_shift); __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); - #ifndef AARCH64 if (split_write) { // write the first half as soon as possible to reduce stm locking __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge); } - #endif // !AARCH64 __ logical_shift_right(R7, R7, lsr_shift); __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift)); __ logical_shift_right(R8, R8, lsr_shift); __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift)); __ logical_shift_right(R9, R9, lsr_shift); __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift)); __ logical_shift_right(R10, R10, lsr_shift); __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); - #ifdef AARCH64 - bulk_store_forward(to, data_regs, 8); - #else if (split_write) { __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge); } else { __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge); } - #endif // AARCH64 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) if (prefetch_before) { // the first loop may end earlier, allowing to skip pld at the end __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); - #ifndef AARCH64 __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped - #endif // !AARCH64 __ b(L_skip_pld, ge); __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); } __ BIND(L_last_read); __ b(L_done, eq); - #ifdef AARCH64 - assert(bytes_per_count < 8, "adjust the code below"); - - __ logical_shift_right(R3, R12, lsr_shift); - - { - Label L; - __ tbz(count, exact_log2(32/bytes_per_count), L); - bulk_load_forward(from, &data_regs[1], 4); - __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); - __ logical_shift_right(R4, R4, lsr_shift); - __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); - __ logical_shift_right(R5, R5, lsr_shift); - __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); - __ logical_shift_right(R6, R6, lsr_shift); - __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); - bulk_store_forward(to, data_regs, 4); - __ logical_shift_right(R3, R7, lsr_shift); - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(16/bytes_per_count), L); - bulk_load_forward(from, &data_regs[1], 2); - __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); - __ logical_shift_right(R4, R4, lsr_shift); - __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); - bulk_store_forward(to, data_regs, 2); - __ logical_shift_right(R3, R5, lsr_shift); - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(8/bytes_per_count), L); - __ ldr(R4, Address(from, 8, post_indexed)); - __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); - __ str(R3, Address(to, 8, post_indexed)); - __ logical_shift_right(R3, R4, lsr_shift); - __ bind(L); - } - - const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3 - - // It remains less than wordSize to write. - // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize). - if (have_bytes < wordSize - bytes_per_count) { - Label L; - __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact - __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 
- __ b(L, le); - __ ldr(R4, Address(from, 8, post_indexed)); - __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(4/bytes_per_count), L); - __ str_w(R3, Address(to, 4, post_indexed)); - if (bytes_per_count < 4) { - __ logical_shift_right(R3, R3, 4*BitsPerByte); - } - __ bind(L); - } - - if (bytes_per_count <= 2) { - Label L; - __ tbz(count, exact_log2(2/bytes_per_count), L); - __ strh(R3, Address(to, 2, post_indexed)); - if (bytes_per_count < 2) { - __ logical_shift_right(R3, R3, 2*BitsPerByte); - } - __ bind(L); - } - - if (bytes_per_count <= 1) { - Label L; - __ tbz(count, exact_log2(1/bytes_per_count), L); - __ strb(R3, Address(to, 1, post_indexed)); - __ bind(L); - } - #else switch (bytes_per_count) { case 2: __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); __ tst(count, 8); __ ldmia(from, RegisterSet(R4, R7), writeback, ne); --- 1311,1351 ----
*** 1904,1922 **** __ tst(count, 1); __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte break; } - #endif // AARCH64 __ BIND(L_done); return 0; // no minimum } // Generate the inner loop for shifted backward array copy (unaligned copy). // It can be used when bytes_per_count < wordSize, i.e. ! // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64. // // Arguments // end_from: end src address, 64 bits aligned // end_to: end dst address, (now) wordSize aligned // count: number of elements (32-bit int) --- 1424,1441 ---- __ tst(count, 1); __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte break; } __ BIND(L_done); return 0; // no minimum } // Generate the inner loop for shifted backward array copy (unaligned copy). // It can be used when bytes_per_count < wordSize, i.e. ! // byte/short copy // // Arguments // end_from: end src address, 64 bits aligned // end_to: end dst address, (now) wordSize aligned // count: number of elements (32-bit int)
*** 1925,1939 **** // lsr_shift: shift applied to 'new' value to set the low bytes of the next write // // Return the minimum initial value for count // // Notes: ! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) // - 'end_to' aligned on wordSize // - 'count' must be greater or equal than the returned value // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord' ! // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64 // // Decreases 'end_to' by count*bytes_per_count. // // Scratches 'end_from', 'count', R3-R10, R12 // --- 1444,1458 ---- // lsr_shift: shift applied to 'new' value to set the low bytes of the next write // // Return the minimum initial value for count // // Notes: ! // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) // - 'end_to' aligned on wordSize // - 'count' must be greater or equal than the returned value // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord' ! // - 'bytes_per_count' is 1 or 2 on 32-bit ARM // // Decreases 'end_to' by count*bytes_per_count. // // Scratches 'end_from', 'count', R3-R10, R12 //
*** 1953,1966 **** const int count_per_loop = bytes_per_loop / bytes_per_count; arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted; int pld_offset = config->pld_distance; - #ifndef AARCH64 bool split_read= config->split_ldm; bool split_write= config->split_stm; - #endif // !AARCH64 const bool prefetch_before = pld_offset < 0; const bool prefetch_after = pld_offset > 0; --- 1472,1483 ----
*** 1999,2022 **** } else { __ cmp_32(count, count_per_loop); __ b(L_last_read, lt); } - #ifdef AARCH64 - __ logical_shift_left(R12, R3, lsl_shift); - const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12}; - bulk_load_backward(end_from, data_regs, 8); - #else if (split_read) { __ ldmdb(end_from, RegisterSet(R7, R10), writeback); __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written __ ldmdb(end_from, RegisterSet(R3, R6), writeback); } else { __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written __ ldmdb(end_from, RegisterSet(R3, R10), writeback); } - #endif // AARCH64 __ subs_32(count, count, count_per_loop); if (prefetch_after) { // do prefetch during ldm/ldp latency prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); --- 1516,1533 ----
*** 2032,2170 **** __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); __ logical_shift_left(R7, R7, lsl_shift); __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift)); __ logical_shift_left(R6, R6, lsl_shift); __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift)); - #ifndef AARCH64 if (split_write) { // store early to reduce locking issues __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge); } - #endif // !AARCH64 __ logical_shift_left(R5, R5, lsl_shift); __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift)); __ logical_shift_left(R4, R4, lsl_shift); __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); - #ifdef AARCH64 - bulk_store_backward(end_to, &data_regs[1], 8); - #else if (split_write) { __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge); } else { __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge); } - #endif // AARCH64 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) if (prefetch_before) { // the first loop may end earlier, allowing to skip pld at the end __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count)); - #ifndef AARCH64 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped - #endif // !AARCH64 __ b(L_skip_pld, ge); __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); } __ BIND(L_last_read); __ b(L_done, eq); - #ifdef AARCH64 - assert(bytes_per_count < 8, "adjust the code below"); - - __ logical_shift_left(R12, R3, lsl_shift); - - { - Label L; - __ tbz(count, exact_log2(32/bytes_per_count), L); - bulk_load_backward(end_from, &data_regs[4], 4); - - __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); - __ logical_shift_left(R10, R10, lsl_shift); - __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); - __ logical_shift_left(R9, R9, lsl_shift); - __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); - __ logical_shift_left(R8, R8, lsl_shift); - __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); - - bulk_store_backward(end_to, &data_regs[5], 4); - __ logical_shift_left(R12, R7, lsl_shift); - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(16/bytes_per_count), L); - bulk_load_backward(end_from, &data_regs[6], 2); - - __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); - __ logical_shift_left(R10, R10, lsl_shift); - __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); - - bulk_store_backward(end_to, &data_regs[7], 2); - __ logical_shift_left(R12, R9, lsl_shift); - __ bind(L); - } - - { - Label L; - __ tbz(count, exact_log2(8/bytes_per_count), L); - __ ldr(R10, Address(end_from, -8, pre_indexed)); - __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); - __ str(R12, Address(end_to, -8, pre_indexed)); - __ logical_shift_left(R12, R10, lsl_shift); - __ bind(L); - } - - const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12 - - // It remains less than wordSize to write. - // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize). - if (have_bytes < wordSize - bytes_per_count) { - Label L; - __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact - __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 
- __ b(L, le); - __ ldr(R10, Address(end_from, -8, pre_indexed)); - __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); - __ bind(L); - } - - assert (bytes_per_count <= 4, "must be"); - - { - Label L; - __ tbz(count, exact_log2(4/bytes_per_count), L); - __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte); - __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB - if (bytes_per_count < 4) { - __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB - } - __ bind(L); - } - - if (bytes_per_count <= 2) { - Label L; - __ tbz(count, exact_log2(2/bytes_per_count), L); - __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte); - __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB - if (bytes_per_count < 2) { - __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB - } - __ bind(L); - } - - if (bytes_per_count <= 1) { - Label L; - __ tbz(count, exact_log2(1/bytes_per_count), L); - __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte); - __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB - __ bind(L); - } - #else switch(bytes_per_count) { case 2: __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written __ tst(count, 8); __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); --- 1543,1580 ----
*** 2244,2254 **** __ mov(R12, AsmOperand(R12, lsr, 24), ne); __ strb(R12, Address(end_to, -1, pre_indexed), ne); break; } - #endif // AARCH64 __ BIND(L_done); return 0; // no minimum } --- 1654,1663 ----
*** 2259,2289 **** } else { return Address(base, -delta, pre_indexed); } } - #ifdef AARCH64 - // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e. - // if forward: loads value at from and increases from by size - // if !forward: loads value at from-size_in_bytes and decreases from by size - void load_one(Register rd, Register from, int size_in_bytes, bool forward) { - assert_different_registers(from, rd); - Address addr = get_addr_with_indexing(from, size_in_bytes, forward); - __ load_sized_value(rd, addr, size_in_bytes, false); - } - - // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one) - void store_one(Register rd, Register to, int size_in_bytes, bool forward) { - assert_different_registers(to, rd); - Address addr = get_addr_with_indexing(to, size_in_bytes, forward); - __ store_sized_value(rd, addr, size_in_bytes); - } - #else - // load_one and store_one are the same as for AArch64 except for - // *) Support for condition execution - // *) Second value register argument for 8-byte values - void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { assert_different_registers(from, rd, rd2); if (size_in_bytes < 8) { Address addr = get_addr_with_indexing(from, size_in_bytes, forward); __ load_sized_value(rd, addr, size_in_bytes, false, cond); --- 1668,1677 ----
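
get_addr_with_indexing keeps the direction handling in one place: forward copies use post-indexed addressing (access the current address, then advance by delta), while backward copies use pre-indexed addressing with a negative offset (step back by delta first, then access). A plain C++ analogue of the two access patterns (illustrative helpers, not HotSpot code):

    #include <cstdint>
    #include <cstring>

    // Post-indexed access (forward direction): read at 'p', then advance it.
    static uint32_t load_word_post_indexed(const uint8_t*& p, int delta) {
      uint32_t v;
      std::memcpy(&v, p, sizeof v);
      p += delta;
      return v;
    }

    // Pre-indexed access with -delta (backward direction): step back first, then read.
    static uint32_t load_word_pre_indexed(const uint8_t*& p, int delta) {
      p -= delta;
      uint32_t v;
      std::memcpy(&v, p, sizeof v);
      return v;
    }
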
*** 2313,2323 **** } else { __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond); } } } - #endif // AARCH64 // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits. // (on 32-bit ARM 64-bit alignment is better for LDM). // // Arguments: --- 1701,1710 ----
*** 2334,2373 **** // decreases 'count' by the number of elements copied // // Returns maximum number of bytes which may be copied. int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) { assert_different_registers(from, to, count, tmp); - #ifdef AARCH64 - // TODO-AARCH64: replace by simple loop? - Label Laligned_by_2, Laligned_by_4, Laligned_by_8; - - if (bytes_per_count == 1) { - __ tbz(from, 0, Laligned_by_2); - __ sub_32(count, count, 1); - load_one(tmp, from, 1, forward); - store_one(tmp, to, 1, forward); - } - - __ BIND(Laligned_by_2); - - if (bytes_per_count <= 2) { - __ tbz(from, 1, Laligned_by_4); - __ sub_32(count, count, 2/bytes_per_count); - load_one(tmp, from, 2, forward); - store_one(tmp, to, 2, forward); - } - - __ BIND(Laligned_by_4); - - if (bytes_per_count <= 4) { - __ tbz(from, 2, Laligned_by_8); - __ sub_32(count, count, 4/bytes_per_count); - load_one(tmp, from, 4, forward); - store_one(tmp, to, 4, forward); - } - __ BIND(Laligned_by_8); - #else // AARCH64 if (bytes_per_count < 8) { Label L_align_src; __ BIND(L_align_src); __ tst(from, 7); // ne => not aligned: copy one element and (if bytes_per_count < 4) loop --- 1721,1730 ----
*** 2376,2386 **** store_one(tmp, to, bytes_per_count, forward, ne); if (bytes_per_count < 4) { __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough } } - #endif // AARCH64 return 7/bytes_per_count; } // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction. // --- 1733,1742 ----
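
The retained align_src simply copies one element at a time until 'from' reaches 64-bit alignment, consuming at most 7/bytes_per_count elements, which is the value returned to the caller. A compact C++ model of that prologue (illustrative; the helper name is made up and the caller is assumed to guarantee a sufficiently large count):

    #include <cstddef>
    #include <cstdint>

    // Copy single elements until 'from' is 8-byte aligned, as align_src does.
    // Returns the number of elements consumed (at most 7 / bytes_per_count).
    static size_t align_src_model(const uint8_t*& from, uint8_t*& to, size_t& count,
                                  size_t bytes_per_count) {
      size_t copied = 0;
      while ((reinterpret_cast<uintptr_t>(from) & 7) != 0 && count != 0) {
        for (size_t b = 0; b < bytes_per_count; b++) *to++ = *from++;  // one element
        count--; copied++;
      }
      return copied;
    }
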
*** 2396,2426 **** // shifts 'from' and 'to' void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) { assert_different_registers(from, to, count, tmp); __ align(OptoLoopAlignment); - #ifdef AARCH64 - Label L_small_array_done, L_small_array_loop; - __ BIND(entry); - __ cbz_32(count, L_small_array_done); - - __ BIND(L_small_array_loop); - __ subs_32(count, count, 1); - load_one(tmp, from, bytes_per_count, forward); - store_one(tmp, to, bytes_per_count, forward); - __ b(L_small_array_loop, gt); - - __ BIND(L_small_array_done); - #else Label L_small_loop; __ BIND(L_small_loop); store_one(tmp, to, bytes_per_count, forward, al, tmp2); __ BIND(entry); // entry point __ subs(count, count, 1); load_one(tmp, from, bytes_per_count, forward, ge, tmp2); __ b(L_small_loop, ge); - #endif // AARCH64 } // Aligns 'to' by reading one word from 'from' and writting its part to 'to'. // // Arguments: --- 1752,1768 ----
*** 2498,2508 **** int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval, int to_remainder, int bytes_per_count, bool forward) { assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid"); ! const Register tmp = forward ? R3 : R12; // TODO-AARCH64: on cojoint_short R4 was used for tmp assert_different_registers(from, to, count, Rval, tmp); int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward); int lsr_shift = (wordSize - to_remainder) * BitsPerByte; --- 1840,1850 ---- int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval, int to_remainder, int bytes_per_count, bool forward) { assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid"); ! const Register tmp = forward ? R3 : R12; assert_different_registers(from, to, count, Rval, tmp); int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward); int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
*** 2532,2645 **** // 'from' must be aligned by wordSize // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize // shifts 'to' by the number of copied bytes // // Scratches 'from', 'count', R3 and R12. ! // On AArch64 also scratches R4-R10, on 32-bit ARM saves them to use. int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) { const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect int min_copy = 0; // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point, // then the remainder of 'to' divided by wordSize is one of elements of {seq}. - #ifdef AARCH64 - // TODO-AARCH64: simplify, tune - - load_one(Rval, from, wordSize, forward); - - Label L_loop_finished; - - switch (bytes_per_count) { - case 4: - min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); - break; - case 2: - { - Label L2, L4, L6; - - __ tbz(to, 1, L4); - __ tbz(to, 2, L2); - - __ BIND(L6); - int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L2); - int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L4); - int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); - - min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6); - break; - } - case 1: - { - Label L1, L2, L3, L4, L5, L6, L7; - Label L15, L26; - Label L246; - - __ tbz(to, 0, L246); - __ tbz(to, 1, L15); - __ tbz(to, 2, L3); - - __ BIND(L7); - int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L246); - __ tbnz(to, 1, L26); - - __ BIND(L4); - int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L15); - __ tbz(to, 2, L1); - - __ BIND(L5); - int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L3); - int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L26); - __ tbz(to, 2, L2); - - __ BIND(L6); - int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L1); - int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); - __ b(L_loop_finished); - - __ BIND(L2); - int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); - - - min_copy = MAX2(min_copy1, min_copy2); - min_copy = MAX2(min_copy, min_copy3); - min_copy = MAX2(min_copy, min_copy4); - min_copy = MAX2(min_copy, min_copy5); - min_copy = MAX2(min_copy, min_copy6); - min_copy = MAX2(min_copy, min_copy7); - break; - } - default: - ShouldNotReachHere(); - break; - } - __ BIND(L_loop_finished); - - #else __ push(RegisterSet(R4,R10)); load_one(Rval, from, wordSize, forward); switch (bytes_per_count) { case 2: --- 1874,1893 ---- // 'from' must be aligned by wordSize // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize // shifts 'to' by the number of copied bytes // // 
Scratches 'from', 'count', R3 and R12. ! // R4-R10 saved for use. int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) { const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect int min_copy = 0; // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point, // then the remainder of 'to' divided by wordSize is one of elements of {seq}. __ push(RegisterSet(R4,R10)); load_one(Rval, from, wordSize, forward); switch (bytes_per_count) { case 2:
*** 2692,2702 **** ShouldNotReachHere(); break; } __ pop(RegisterSet(R4,R10)); - #endif // AARCH64 return min_copy; } #ifndef PRODUCT --- 1940,1949 ----
*** 2774,2784 **** // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop. const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; Label L_small_array; __ cmp_32(count, small_copy_limit); ! __ b(L_small_array, le); // TODO-AARCH64: le vs lt // Otherwise proceed with large implementation. bool from_is_aligned = (bytes_per_count >= 8); if (aligned && forward && (HeapWordSize % 8 == 0)) { --- 2021,2031 ---- // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop. const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; Label L_small_array; __ cmp_32(count, small_copy_limit); ! __ b(L_small_array, le); // Otherwise proceed with large implementation. bool from_is_aligned = (bytes_per_count >= 8); if (aligned && forward && (HeapWordSize % 8 == 0)) {
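
The small_copy_limit guard reserves enough elements for the source-alignment prologue (up to 7 bytes) plus one full 8-word iteration of the aligned loop, so with wordSize == 4 on 32-bit ARM it is (8*4 + 7) / bytes_per_count: 39 byte elements, 19 shorts, 9 ints or 4 longs. Anything at or below the limit is routed to the element-wise small-array loop. A trivial check of the arithmetic:

    #include <cstdio>

    int main() {
      const int wordSize = 4;  // 32-bit ARM
      const int element_sizes[] = {1, 2, 4, 8};
      for (int bytes_per_count : element_sizes) {
        int small_copy_limit = (8 * wordSize + 7) / bytes_per_count;
        std::printf("element size %d -> small_copy_limit %d\n",
                    bytes_per_count, small_copy_limit);  // 39, 19, 9, 4
      }
      return 0;
    }
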
*** 2862,2872 **** // Arguments: // to: destination pointer after copying. // if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region // count: total number of copied elements, 32-bit int // ! // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers. void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) { assert_different_registers(to, count, tmp); if (forward) { // 'to' is upper bound of the modified region --- 2109,2119 ---- // Arguments: // to: destination pointer after copying. // if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region // count: total number of copied elements, 32-bit int // ! // Blows all volatile registers (R0-R3, Rtemp, LR) and 'to', 'count', 'tmp' registers. void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) { assert_different_registers(to, count, tmp); if (forward) { // 'to' is upper bound of the modified region
*** 2881,2896 **** if (status) { __ mov(R0, 0); // OK } - #ifdef AARCH64 - __ raw_pop(LR, ZR); - __ ret(); - #else __ pop(PC); - #endif // AARCH64 } // Generate stub for assign-compatible oop copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. --- 2128,2138 ----
*** 2937,2951 **** const Register saved_count = LR; const int callee_saved_regs = 3; // R0-R2 // LR is used later to save barrier args - #ifdef AARCH64 - __ raw_push(LR, ZR); - #else __ push(LR); - #endif // AARCH64 DecoratorSet decorators = IN_HEAP | IS_ARRAY; if (disjoint) { decorators |= ARRAYCOPY_DISJOINT; } --- 2179,2189 ----
*** 3019,3035 **** oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); } if (!to_is_aligned) { - // !to_is_aligned <=> UseCompressedOops && AArch64 __ BIND(L_unaligned_dst); - #ifdef AARCH64 - assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops"); - #else ShouldNotReachHere(); - #endif // AARCH64 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); } --- 2257,2268 ----
*** 3058,3071 **** const Register R3_bits = R3; // test copy of low bits __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); - #ifdef AARCH64 - __ NOT_IMPLEMENTED(); - start = NULL; - #else const Register tmp = Rtemp; // bump this on entry, not on exit: inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp); --- 2291,2300 ----
*** 3083,3093 **** __ tst(R3_bits, BytesPerShort-1); __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq); __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq); __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp); - #endif return start; } // Helper for generating a dynamic type check. // Smashes only the given temp registers. --- 2312,2321 ----
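
The visible tail of generate_unsafe_copy dispatches on a register holding combined low bits and rescales the byte count when a wider element stub can be used. Assuming (not shown in this hunk) that R3_bits is the OR of the two addresses and the byte count, the selection amounts to the following C++ sketch (hypothetical model only, not the stub code):

    #include <cstddef>
    #include <cstdint>

    enum class CopyKind { Bytes, Shorts, Ints, Longs };

    // Pick the widest element size that both addresses and the byte count are
    // aligned to, scaling the count down accordingly (assumed dispatch model).
    static CopyKind pick_copy_kind(uintptr_t from, uintptr_t to, size_t& count_in_bytes) {
      uintptr_t bits = from | to | count_in_bytes;       // "test copy of low bits"
      if ((bits & 7) == 0) { count_in_bytes >>= 3; return CopyKind::Longs;  }
      if ((bits & 3) == 0) { count_in_bytes >>= 2; return CopyKind::Ints;   }
      if ((bits & 1) == 0) { count_in_bytes >>= 1; return CopyKind::Shorts; }
      return CopyKind::Bytes;                            // no scaling needed
    }
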
*** 3183,3193 **** // Arguments for generated stub: // from: R0 // to: R1 // count: R2 treated as signed 32-bit int // ckoff: R3 (super_check_offset) ! // ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass) // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) // address generate_checkcast_copy(const char * name) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); --- 2411,2421 ---- // Arguments for generated stub: // from: R0 // to: R1 // count: R2 treated as signed 32-bit int // ckoff: R3 (super_check_offset) ! // ckval: SP[0] (super_klass) // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) // address generate_checkcast_copy(const char * name) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name);
*** 3198,3261 **** const Register count = R2; // elements count const Register R3_ckoff = R3; // super_check_offset const Register R4_ckval = R4; // super_klass ! const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently Label load_element, store_element, do_epilogue, fail; BLOCK_COMMENT("Entry:"); __ zap_high_non_significant_bits(R2); - #ifdef AARCH64 - __ raw_push(LR, ZR); - __ raw_push(R19, R20); - #else int pushed = 0; __ push(LR); pushed+=1; - #endif // AARCH64 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); - #ifndef AARCH64 const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; __ push(caller_saved_regs); assert(caller_saved_regs.size() == 6, "check the count"); pushed+=6; __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack - #endif // !AARCH64 // Save arguments for barrier generation (after the pre barrier): // - must be a caller saved register and not LR // - ARM32: avoid R10 in case RThread is needed ! const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11); ! #ifdef AARCH64 ! __ mov_w(saved_count, count); ! __ cbnz_w(count, load_element); // and test count ! #else __ movs(saved_count, count); // and test count __ b(load_element,ne); - #endif // AARCH64 // nothing to copy __ mov(R0, 0); - #ifdef AARCH64 - __ raw_pop(R19, R20); - __ raw_pop(LR, ZR); - __ ret(); - #else __ pop(caller_saved_regs); __ pop(PC); - #endif // AARCH64 // ======== begin loop ======== // (Loop is rotated; its entry is load_element.) __ align(OptoLoopAlignment); __ BIND(store_element); --- 2426,2471 ---- const Register count = R2; // elements count const Register R3_ckoff = R3; // super_check_offset const Register R4_ckval = R4; // super_klass ! const int callee_saved_regs = 4; // LR saved differently Label load_element, store_element, do_epilogue, fail; BLOCK_COMMENT("Entry:"); __ zap_high_non_significant_bits(R2); int pushed = 0; __ push(LR); pushed+=1; DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; __ push(caller_saved_regs); assert(caller_saved_regs.size() == 6, "check the count"); pushed+=6; __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack // Save arguments for barrier generation (after the pre barrier): // - must be a caller saved register and not LR // - ARM32: avoid R10 in case RThread is needed ! const Register saved_count = altFP_7_11; __ movs(saved_count, count); // and test count __ b(load_element,ne); // nothing to copy __ mov(R0, 0); __ pop(caller_saved_regs); __ pop(PC); // ======== begin loop ======== // (Loop is rotated; its entry is load_element.) __ align(OptoLoopAlignment); __ BIND(store_element);
*** 3288,3298 **** // Note: fail marked by the fact that count differs from saved_count __ BIND(do_epilogue); ! Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved Label L_not_copied; __ subs_32(copied, saved_count, count); // copied count (in saved reg) __ b(L_not_copied, eq); // nothing was copied, skip post barrier __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value --- 2498,2508 ---- // Note: fail marked by the fact that count differs from saved_count __ BIND(do_epilogue); ! Register copied = R4; // saved Label L_not_copied; __ subs_32(copied, saved_count, count); // copied count (in saved reg) __ b(L_not_copied, eq); // nothing was copied, skip post barrier __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
*** 3304,3324 **** inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); __ BIND(L_not_copied); __ cmp_32(copied, saved_count); // values preserved in saved registers - #ifdef AARCH64 - __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied) - __ raw_pop(R19, R20); - __ raw_pop(LR, ZR); - __ ret(); - #else __ mov(R0, 0, eq); // 0 if all copied __ mvn(R0, copied, ne); // else NOT(copied) __ pop(caller_saved_regs); __ pop(PC); - #endif // AARCH64 return start; } // Perform range checks on the proposed arraycopy. --- 2514,2527 ----
*** 3358,3368 **** // Input: // R0 - src oop // R1 - src_pos (32-bit int) // R2 - dst oop // R3 - dst_pos (32-bit int) ! // R4 (AArch64) / SP[0] (32-bit ARM) - element count (32-bit int) // // Output: (32-bit int) // R0 == 0 - success // R0 < 0 - need to call System.arraycopy // --- 2561,2571 ---- // Input: // R0 - src oop // R1 - src_pos (32-bit int) // R2 - dst oop // R3 - dst_pos (32-bit int) ! // R4 - element count (32-bit int) // // Output: (32-bit int) // R0 == 0 - success // R0 < 0 - need to call System.arraycopy //
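Reviewer note, not part of the webrev: per the header above, 0 means the stub handled the copy and any negative value tells the runtime to fall back to System.arraycopy; the stub's own bail-out path (L_failed further down) produces that value with mvn(R0, 0), i.e. -1 with nothing copied. Minimal caller-side sketch (the name is mine):

    static inline bool generic_copy_handled(int r0) {
      return r0 == 0;   // 0: stub did the copy; negative: fall back to System.arraycopy
    }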
*** 3376,3411 **** const Register dst_pos = R3; // destination position // registers used as temp const Register R5_src_klass = R5; // source array klass const Register R6_dst_klass = R6; // destination array klass ! const Register R_lh = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler const Register R8_temp = R8; __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); __ zap_high_non_significant_bits(R1); __ zap_high_non_significant_bits(R3); __ zap_high_non_significant_bits(R4); - #ifndef AARCH64 int pushed = 0; const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; __ push(saved_regs); assert(saved_regs.size() == 6, "check the count"); pushed+=6; - #endif // !AARCH64 // bump this on entry, not on exit: inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); const Register length = R4; // elements count - #ifndef AARCH64 __ ldr(length, Address(SP,4*pushed)); - #endif // !AARCH64 //----------------------------------------------------------------------- // Assembler stubs will be used for this call to arraycopy // if the following conditions are met: --- 2579,2610 ---- const Register dst_pos = R3; // destination position // registers used as temp const Register R5_src_klass = R5; // source array klass const Register R6_dst_klass = R6; // destination array klass ! const Register R_lh = altFP_7_11; // layout handler const Register R8_temp = R8; __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); __ zap_high_non_significant_bits(R1); __ zap_high_non_significant_bits(R3); __ zap_high_non_significant_bits(R4); int pushed = 0; const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; __ push(saved_regs); assert(saved_regs.size() == 6, "check the count"); pushed+=6; // bump this on entry, not on exit: inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); const Register length = R4; // elements count __ ldr(length, Address(SP,4*pushed)); //----------------------------------------------------------------------- // Assembler stubs will be used for this call to arraycopy // if the following conditions are met:
*** 3494,3540 **** const Register count = R2; // elements count // 'from', 'to', 'count' registers should be set in this order // since they are the same as 'src', 'src_pos', 'dst'. - #ifdef AARCH64 - - BLOCK_COMMENT("choose copy loop based on element size and scale indexes"); - Label Lbyte, Lshort, Lint, Llong; - - __ cbz(R12_elsize, Lbyte); - - assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be"); - __ cmp(R12_elsize, LogBytesPerInt); - __ b(Lint, eq); - __ b(Llong, gt); - - __ BIND(Lshort); - __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort); - __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerShort); - __ mov(count, length); - __ b(StubRoutines::_jshort_arraycopy); - - __ BIND(Lint); - __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt); - __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerInt); - __ mov(count, length); - __ b(StubRoutines::_jint_arraycopy); - - __ BIND(Lbyte); - __ add_ptr_scaled_int32(from, src, src_pos, 0); - __ add_ptr_scaled_int32(to, dst, dst_pos, 0); - __ mov(count, length); - __ b(StubRoutines::_jbyte_arraycopy); - - __ BIND(Llong); - __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong); - __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerLong); - __ mov(count, length); - __ b(StubRoutines::_jlong_arraycopy); - - #else // AARCH64 BLOCK_COMMENT("scale indexes to element size"); __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr --- 2693,2702 ----
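Reviewer note, not part of the webrev: the two "add ... lsl R12_elsize" instructions kept in the 32-bit path compute element addresses by shifting the 32-bit index left by log2(element size). Equivalent C++, with names of my own choosing:

    // from = element_addr(src, src_pos, log2_elsize); to = element_addr(dst, dst_pos, log2_elsize)
    static inline char* element_addr(char* base, uint32_t pos, int log2_elsize) {
      return base + ((size_t)pos << log2_elsize);   // element index -> byte offset
    }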
*** 3554,3564 **** __ cmp(R12_elsize, LogBytesPerInt); __ b(StubRoutines::_jint_arraycopy,eq); __ b(StubRoutines::_jlong_arraycopy); - #endif // AARCH64 } // ObjArrayKlass __ BIND(L_objArray); // live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length --- 2716,2725 ----
*** 3584,3596 **** __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr __ BIND(L_plain_copy); __ mov(count, length); - #ifndef AARCH64 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? - #endif // !AARCH64 __ b(StubRoutines::_oop_arraycopy); } { __ BIND(L_checkcast_copy); --- 2745,2755 ----
*** 3626,3657 **** // Generate the type check. int sco_offset = in_bytes(Klass::super_check_offset_offset()); __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, R8_temp, R9, ! AARCH64_ONLY(R10) NOT_AARCH64(R12), L_plain_copy); // Fetch destination element klass from the ObjArrayKlass header. int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); // the checkcast_copy loop needs two extra arguments: ! const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3); __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass - #ifndef AARCH64 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument - #endif // !AARCH64 __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass __ b(StubRoutines::_checkcast_arraycopy); } __ BIND(L_failed); - #ifndef AARCH64 __ pop(saved_regs); - #endif // !AARCH64 __ mvn(R0, 0); // failure, with 0 copied __ ret(); return start; } --- 2785,2812 ---- // Generate the type check. int sco_offset = in_bytes(Klass::super_check_offset_offset()); __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, R8_temp, R9, ! R12, L_plain_copy); // Fetch destination element klass from the ObjArrayKlass header. int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); // the checkcast_copy loop needs two extra arguments: ! const Register Rdst_elem_klass = R3; __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass __ b(StubRoutines::_checkcast_arraycopy); } __ BIND(L_failed); __ pop(saved_regs); __ mvn(R0, 0); // failure, with 0 copied __ ret(); return start; }
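Reviewer note, not part of the webrev: the loads and the str above marshal the extra checkcast arguments into exactly the layout documented at generate_checkcast_copy, R3 = super_check_offset and SP[0] = super_klass. In HotSpot C++ terms the values being passed are roughly the following (sketch only, assuming the usual Klass/ObjArrayKlass API and a dst_klass already in hand):

    Klass* dst_elem_klass = ObjArrayKlass::cast(dst_klass)->element_klass(); // stored at SP[0]
    juint  ckoff          = dst_elem_klass->super_check_offset();            // loaded into R3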
*** 3681,3695 **** case 4: // int32_t __ ldr_s32(R1, Address(R0)); break; case 8: // int64_t - #ifdef AARCH64 - __ ldr(R1, Address(R0)); - #else Unimplemented(); - #endif // AARCH64 break; default: ShouldNotReachHere(); } --- 2836,2846 ----
*** 3763,4233 **** StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); } - #ifndef AARCH64 #define COMPILE_CRYPTO #include "stubRoutinesCrypto_arm.cpp" - #else - - #ifdef COMPILER2 - // Arguments: - // - // Inputs: - // c_rarg0 - source byte array address - // c_rarg1 - destination byte array address - // c_rarg2 - K (key) in little endian int array - // - address generate_aescrypt_encryptBlock() { - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); - - Label L_doLast; - - const Register from = c_rarg0; // source array address - const Register to = c_rarg1; // destination array address - const Register key = c_rarg2; // key array address - const Register keylen = R8; - - address start = __ pc(); - __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); - __ mov(FP, SP); - - __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - - __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input - - __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - - int quad = 1; - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); - __ aese(V0, V1); - __ aesmc(V0, V0); - __ aese(V0, V2); - __ aesmc(V0, V0); - __ aese(V0, V3); - __ aesmc(V0, V0); - __ aese(V0, V4); - __ aesmc(V0, V0); - - __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); - __ aese(V0, V1); - __ aesmc(V0, V0); - __ aese(V0, V2); - __ aesmc(V0, V0); - __ aese(V0, V3); - __ aesmc(V0, V0); - __ aese(V0, V4); - __ aesmc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ cmp_w(keylen, 44); - __ b(L_doLast, eq); - - __ aese(V0, V1); - __ aesmc(V0, V0); - __ aese(V0, V2); - __ aesmc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ cmp_w(keylen, 52); - __ b(L_doLast, eq); - - __ aese(V0, V1); - __ aesmc(V0, V0); - __ aese(V0, V2); - __ aesmc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ BIND(L_doLast); - - __ aese(V0, V1); - __ aesmc(V0, V0); - __ aese(V0, V2); - - __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); - - __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); - - __ mov(R0, 0); - - __ mov(SP, FP); - __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); - __ ret(LR); - - return start; - } - - // Arguments: - // - // Inputs: - // c_rarg0 - source byte array address - // c_rarg1 - destination byte array address - // c_rarg2 - K (key) in little endian int array - // - address 
generate_aescrypt_decryptBlock() { - assert(UseAES, "need AES instructions and misaligned SSE support"); - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); - Label L_doLast; - - const Register from = c_rarg0; // source array address - const Register to = c_rarg1; // destination array address - const Register key = c_rarg2; // key array address - const Register keylen = R8; - - address start = __ pc(); - __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); - __ mov(FP, SP); - - __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - - __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input - - __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - - int quad = 1; - __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad); - - __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); - __ aesd(V0, V1); - __ aesimc(V0, V0); - __ aesd(V0, V2); - __ aesimc(V0, V0); - __ aesd(V0, V3); - __ aesimc(V0, V0); - __ aesd(V0, V4); - __ aesimc(V0, V0); - - __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); - __ aesd(V0, V1); - __ aesimc(V0, V0); - __ aesd(V0, V2); - __ aesimc(V0, V0); - __ aesd(V0, V3); - __ aesimc(V0, V0); - __ aesd(V0, V4); - __ aesimc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ cmp_w(keylen, 44); - __ b(L_doLast, eq); - - __ aesd(V0, V1); - __ aesimc(V0, V0); - __ aesd(V0, V2); - __ aesimc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ cmp_w(keylen, 52); - __ b(L_doLast, eq); - - __ aesd(V0, V1); - __ aesimc(V0, V0); - __ aesd(V0, V2); - __ aesimc(V0, V0); - - __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ BIND(L_doLast); - - __ aesd(V0, V1); - __ aesimc(V0, V0); - __ aesd(V0, V2); - - __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad); - - __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); - - __ mov(R0, 0); - - __ mov(SP, FP); - __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); - __ ret(LR); - - - return start; - } - - // Arguments: - // - // Inputs: - // c_rarg0 - source byte array address - // c_rarg1 - destination byte array address - // c_rarg2 - K (key) in little endian int array - // c_rarg3 - r vector byte array address - // c_rarg4 - input length - // - // Output: - // x0 - input length - // - address generate_cipherBlockChaining_encryptAESCrypt() { - assert(UseAES, "need AES instructions and misaligned SSE support"); - __ align(CodeEntryAlignment); - StubCodeMark mark(this, 
"StubRoutines", "cipherBlockChaining_encryptAESCrypt"); - - Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; - - const Register from = c_rarg0; // source array address - const Register to = c_rarg1; // destination array address - const Register key = c_rarg2; // key array address - const Register rvec = c_rarg3; // r byte array initialized from initvector array address - // and left with the results of the last encryption block - const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) - const Register keylen = R8; - - address start = __ pc(); - __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); - __ mov(FP, SP); - - __ mov(R9, len_reg); - __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - - __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); - - __ cmp_w(keylen, 52); - __ b(L_loadkeys_44, cc); - __ b(L_loadkeys_52, eq); - - __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - - int quad = 1; - __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); - __ BIND(L_loadkeys_52); - __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); - __ BIND(L_loadkeys_44); - __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad); - __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); - __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); - - __ BIND(L_aes_loop); - __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); - - __ b(L_rounds_44, cc); - __ b(L_rounds_52, eq); - - __ aese(V0, V17); - __ aesmc(V0, V0); - __ aese(V0, V18); - __ aesmc(V0, V0); - __ BIND(L_rounds_52); - __ aese(V0, V19); - __ aesmc(V0, V0); - __ aese(V0, V20); - __ aesmc(V0, V0); - __ BIND(L_rounds_44); - __ aese(V0, V21); - __ aesmc(V0, V0); - __ aese(V0, V22); - __ aesmc(V0, V0); - __ aese(V0, V23); - __ aesmc(V0, V0); - __ aese(V0, V24); - __ aesmc(V0, V0); - __ aese(V0, V25); - __ aesmc(V0, V0); - __ aese(V0, V26); - __ aesmc(V0, V0); - __ aese(V0, V27); - __ aesmc(V0, V0); - __ aese(V0, V28); - __ aesmc(V0, V0); - __ aese(V0, V29); - __ aesmc(V0, V0); - __ aese(V0, V30); - __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); - - __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ sub(len_reg, len_reg, 16); - __ cbnz(len_reg, L_aes_loop); - - __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); - - __ mov(R0, R9); - - __ mov(SP, FP); - __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); - __ ret(LR); - - 
return start; - } - - // Arguments: - // - // Inputs: - // c_rarg0 - source byte array address - // c_rarg1 - destination byte array address - // c_rarg2 - K (key) in little endian int array - // c_rarg3 - r vector byte array address - // c_rarg4 - input length - // - // Output: - // rax - input length - // - address generate_cipherBlockChaining_decryptAESCrypt() { - assert(UseAES, "need AES instructions and misaligned SSE support"); - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); - - Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; - - const Register from = c_rarg0; // source array address - const Register to = c_rarg1; // destination array address - const Register key = c_rarg2; // key array address - const Register rvec = c_rarg3; // r byte array initialized from initvector array address - // and left with the results of the last encryption block - const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) - const Register keylen = R8; - - address start = __ pc(); - __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); - __ mov(FP, SP); - - __ mov(R9, len_reg); - __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); - - __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); - - __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - - int quad = 1; - __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); - - __ cmp_w(keylen, 52); - __ b(L_loadkeys_44, cc); - __ b(L_loadkeys_52, eq); - - __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); - __ BIND(L_loadkeys_52); - __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); - __ BIND(L_loadkeys_44); - __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad); - __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); - __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128); - __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); - __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); - - __ BIND(L_aes_loop); - __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad); - - __ b(L_rounds_44, cc); - __ b(L_rounds_52, eq); - - __ aesd(V0, V17); - __ aesimc(V0, V0); - __ aesd(V0, V17); - __ aesimc(V0, V0); - __ BIND(L_rounds_52); - __ aesd(V0, V19); - __ aesimc(V0, V0); - __ aesd(V0, V20); - __ aesimc(V0, V0); - __ BIND(L_rounds_44); - __ aesd(V0, V21); - __ aesimc(V0, V0); - __ aesd(V0, V22); - __ aesimc(V0, V0); - __ aesd(V0, V23); - __ aesimc(V0, V0); - __ aesd(V0, V24); - __ aesimc(V0, V0); - __ aesd(V0, V25); - __ 
aesimc(V0, V0); - __ aesd(V0, V26); - __ aesimc(V0, V0); - __ aesd(V0, V27); - __ aesimc(V0, V0); - __ aesd(V0, V28); - __ aesimc(V0, V0); - __ aesd(V0, V29); - __ aesimc(V0, V0); - __ aesd(V0, V30); - __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); - __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad); - - __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); - __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad); - - __ sub(len_reg, len_reg, 16); - __ cbnz(len_reg, L_aes_loop); - - __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); - - __ mov(R0, R9); - - __ mov(SP, FP); - __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); - __ ret(LR); - - return start; - } - - #endif // COMPILER2 - #endif // AARCH64 private: #undef __ #define __ masm-> --- 2914,2925 ----
*** 4296,4316 **** StubRoutines::_catch_exception_entry = generate_catch_exception(); // stub for throwing stack overflow error used both by interpreter and compiler StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); - #ifndef AARCH64 // integer division used both by interpreter and compiler StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem(); StubRoutines::_atomic_add_entry = generate_atomic_add(); StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); StubRoutines::_atomic_load_long_entry = generate_atomic_load_long(); StubRoutines::_atomic_store_long_entry = generate_atomic_store_long(); - #endif // !AARCH64 } void generate_all() { // Generates all stubs and initializes the entry points --- 2988,3006 ----
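Reviewer note, not part of the webrev: on 32-bit ARM these atomic entries back the VM's Atomic operations with ldrex/strex retry loops. Purely as an illustration of the read-modify-write semantics they provide (this is not the stub code, and the helper names are mine):

    #include <stdint.h>
    // GCC/Clang builtin equivalents, shown only to illustrate the operations.
    static inline int32_t illustrate_atomic_add(volatile int32_t* dest, int32_t add_value) {
      return __atomic_add_fetch(dest, add_value, __ATOMIC_SEQ_CST);
    }
    static inline int32_t illustrate_atomic_xchg(volatile int32_t* dest, int32_t new_value) {
      return __atomic_exchange_n(dest, new_value, __ATOMIC_SEQ_CST);
    }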
*** 4336,4363 **** // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, &StubRoutines::_safefetch32_continuation_pc); - #ifdef AARCH64 - generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry, - &StubRoutines::_safefetchN_fault_pc, - &StubRoutines::_safefetchN_continuation_pc); - #ifdef COMPILER2 - if (UseAESIntrinsics) { - StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); - StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); - StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); - StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); - } - #endif - #else assert (sizeof(int) == wordSize, "32-bit architecture"); StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry; StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc; StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc; - #endif // AARCH64 #ifdef COMPILE_CRYPTO // generate AES intrinsics code if (UseAESIntrinsics) { aes_init(); --- 3026,3039 ----
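Reviewer note, not part of the webrev: with the AArch64 branches gone only the 32-bit path remains, where a word and an int are both four bytes, so the SafeFetchN entries can simply alias SafeFetch32. SafeFetch32 loads from a possibly unmapped address and returns the supplied error value instead of faulting; the fault and continuation PCs registered above are what make that recovery possible. Usage sketch (the address is made up):

    int* p = (int*) 0x1000;                    // may or may not be mapped
    int v = SafeFetch32(p, /*errValue=*/ -1);  // *p if readable, otherwise -1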