< prev index next >
src/cpu/x86/vm/stubGenerator_x86_32.cpp
Print this page
*** 2149,2158 ****
--- 2149,2169 ----
__ emit_data(0x08090a0b, relocInfo::none, 0 );
__ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
return start;
}
+ // Emits the 16-byte, 16-byte-aligned constant "counter_shuffle_mask" into the stub
+ // code and returns its address. Stored little-endian, the four words below form the
+ // byte sequence 0x0f, 0x0e, ..., 0x00, i.e. a pshufb control that reverses all
+ // 16 bytes of an XMM register.
+ address generate_counter_shuffle_mask() {
+   __ align(16);
+   StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+   address start = __ pc();
+   // Mask words in emission order (lowest address first).
+   const int mask_words[4] = { 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203 };
+   for (int w = 0; w < 4; w++) {
+     __ emit_data(mask_words[w], relocInfo::none, 0);
+   }
+   return start;
+ }
+
// Utility routine for loading a 128-bit key word in little endian format
// can optionally specify that the shuffle mask is already in an xmmregister
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
__ movdqu(xmmdst, Address(key, offset));
if (xmm_shuf_mask != NULL) {
*** 2174,2183 ****
--- 2185,2219 ----
// Emits one aesdec step: loads the 128-bit key word at key + offset into xmmtmp via
// load_key (forwarding the optional shuffle-mask register), then applies aesdec to
// xmmdst using that key. xmmtmp is overwritten.
void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
load_key(xmmtmp, key, offset, xmm_shuf_mask);
__ aesdec(xmmdst, xmmtmp);
}
+ // Utility routine that emits code to increase a 128-bit counter (the iv in CTR mode).
+ // The counter in xmmdst is handled as four 32-bit dwords D3, D2, D1, D0:
+ // inc_delta is added to D0 and, while an add produces a carry, 1 is added to the
+ // next higher dword. 'reg' is used as scratch and is overwritten. 'next_block' is
+ // bound at the end of the emitted sequence and is the target of the early exits
+ // taken as soon as an add produces no carry.
+ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+   const int last_dword = 3;
+   for (int dword = 0; dword <= last_dword; dword++) {
+     // D0 receives the requested delta, higher dwords only receive the carry
+     const int addend = (dword == 0) ? inc_delta : 0x01;
+     __ pextrd(reg, xmmdst, dword);
+     __ addl(reg, addend);
+     __ pinsrd(xmmdst, reg, dword);
+     if (dword != last_dword) {
+       // the branch consumes the carry flag produced by the addl above
+       __ jcc(Assembler::carryClear, next_block); // jump if no carry
+     }
+   }
+
+   __ BIND(next_block); // next instruction
+ }
+
// Arguments:
//
// Inputs:
// c_rarg0 - source byte array address
*** 2749,2758 ****
--- 2785,2998 ----
__ jmp(L_exit);
return start;
}
+
+ // CTR AES crypt.
+ // In 32-bit stub, parallelize 4 blocks at a time
+ //
+ // The counter is kept byte-shuffled in xmm_curr_counter while processing so that it
+ // can be increased with 32-bit adds (see inc_counter); each per-block copy is
+ // shuffled back before it is encrypted, and the running counter is shuffled back
+ // and stored to the counter array on exit.
+ //
+ // Arguments:
+ //
+ // Inputs (loaded from the caller's frame at rbp + 8 + {0, 4, 8, 12, 16}):
+ //   c_rarg0   - source byte array address
+ //   c_rarg1   - destination byte array address
+ //   c_rarg2   - K (key) in little endian int array
+ //   c_rarg3   - counter vector byte array address
+ //   c_rarg4   - input length
+ //
+ // Output:
+ //   rax       - input length
+ //
+ address generate_counterMode_AESCrypt_Parallel() {
+   assert(UseAES, "need AES instructions and misaligned SSE support");
+   __ align(CodeEntryAlignment);
+   StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+   address start = __ pc();
+   const Register from    = rsi; // source array address
+   const Register to      = rdx; // destination array address
+   const Register key     = rcx; // key array address
+   const Register counter = rdi; // counter byte array: initial counter on entry,
+                                 // updated with the current counter on exit
+   const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
+   const Register pos     = rax; // byte offset into from/to
+
+   __ enter(); // required for proper stackwalking of RuntimeStub frame
+   handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
+
+   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+   // context for the registers used, where all instructions below are using 128-bit mode
+   // On EVEX without VL and BW, these instructions will all be AVX.
+   // rdx is only a scratch register here; 'to' is loaded into it afterwards.
+   if (VM_Version::supports_avx512vlbw()) {
+     __ movl(rdx, 0xffff);
+     __ kmovdl(k1, rdx);
+   }
+
+   // load registers from incoming parameters
+   const Address from_param(rbp, 8+0);
+   const Address to_param  (rbp, 8+4);
+   const Address key_param (rbp, 8+8);
+   const Address rvec_param (rbp, 8+12);
+   const Address len_param  (rbp, 8+16);
+   __ movptr(from , from_param);
+   __ movptr(to , to_param);
+   __ movptr(key , key_param);
+   __ movptr(counter , rvec_param);
+   __ movptr(len_reg , len_param);
+
+   // xmm register assignments for the loops below
+   // NOTE: xmm1..xmm4 are shared between the masks/key/result0 and the xmm_from*
+   // registers, so the masks are only valid until the input text has been loaded.
+   const XMMRegister xmm_curr_counter = xmm0;
+   const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded
+   const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded
+   const XMMRegister xmm_key = xmm3;
+   const XMMRegister xmm_result0 = xmm4;
+   const XMMRegister xmm_result1 = xmm5;
+   const XMMRegister xmm_result2 = xmm6;
+   const XMMRegister xmm_result3 = xmm7;
+   const XMMRegister xmm_from0 = xmm1; //reuse XMM register
+   const XMMRegister xmm_from1 = xmm2;
+   const XMMRegister xmm_from2 = xmm3;
+   const XMMRegister xmm_from3 = xmm4;
+
+   //for key_128, key_192, key_256
+   const int rounds[3] = {10, 12, 14};
+   Label L_singleBlockLoopTop[3];
+   Label L_multiBlock_loopTop[3];
+   Label L_key192_top, L_key256_top;
+   Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time
+   Label L_incCounter_single[3]; //for single block, key128, key192, key256
+   Label L_exit;
+   const int PARALLEL_FACTOR = 4; //because of the limited register number
+
+   // initialize counter with initial counter
+   __ movdqu(xmm_curr_counter, Address(counter, 0x00));
+   __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+   __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
+
+   // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+   __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+   __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+   __ cmpl(rax, 52);
+   __ jcc(Assembler::equal, L_key192_top);
+   __ cmpl(rax, 60);
+   __ jcc(Assembler::equal, L_key256_top);
+
+   //key128 begins here
+   __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+
+ #define CTR_DoFour(opc, src_reg) \
+   __ opc(xmm_result0, src_reg); \
+   __ opc(xmm_result1, src_reg); \
+   __ opc(xmm_result2, src_reg); \
+   __ opc(xmm_result3, src_reg);
+
+   // k == 0 : generate code for key_128
+   // k == 1 : generate code for key_192
+   // k == 2 : generate code for key_256
+   for (int k = 0; k < 3; ++k) {
+     //multi blocks starts here
+     __ align(OptoLoopAlignment);
+     __ BIND(L_multiBlock_loopTop[k]);
+     __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+     __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+
+     // both masks were overwritten by input text in the previous iteration
+     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+
+     //load, then increase counters
+     // result0..result3 become counter+0..counter+3, the running counter becomes counter+4
+     CTR_DoFour(movdqa, xmm_curr_counter);
+     __ push(rbx); // rbx holds len_reg; preserve it while it serves as scratch
+     inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
+     inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
+     inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
+     inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
+     __ pop (rbx);
+
+     load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
+
+     CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
+     CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key
+
+     for (int i = 1; i < rounds[k]; ++i) {
+       load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+       CTR_DoFour(aesenc, xmm_key);
+     }
+     load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+     CTR_DoFour(aesenclast, xmm_key);
+
+     // get next PARALLEL_FACTOR blocks into xmm_from registers
+     // (this clobbers the shuffle masks and the key register)
+     __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+     __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+     __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+
+     // PXOR with input text
+     __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
+     __ pxor(xmm_result1, xmm_from1);
+     __ pxor(xmm_result2, xmm_from2);
+
+     // store PARALLEL_FACTOR results into the next 64 bytes of output
+     __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+     __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+     __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+
+     //do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
+     __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+     __ pxor(xmm_result3, xmm_from3);
+     __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+     __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
+     __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+     __ jmp(L_multiBlock_loopTop[k]);
+
+     // singleBlock starts here
+     __ align(OptoLoopAlignment);
+     __ BIND(L_singleBlockLoopTop[k]);
+     __ cmpptr(len_reg, 0);
+     __ jcc(Assembler::equal, L_exit);
+     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+     __ movdqa(xmm_result0, xmm_curr_counter);
+     load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
+     __ push(rbx);//rbx is used for increasing counter
+     inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
+     __ pop (rbx);
+     __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+     __ pxor(xmm_result0, xmm_key);
+     for (int i = 1; i < rounds[k]; i++) {
+       load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+       __ aesenc(xmm_result0, xmm_key);
+     }
+     load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+     __ aesenclast(xmm_result0, xmm_key);
+     __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // clobbers the counter shuffle mask
+     __ pxor(xmm_result0, xmm_from0);
+     __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+     __ addptr(pos, AESBlockSize);
+     __ subptr(len_reg, AESBlockSize);
+     __ jmp(L_singleBlockLoopTop[k]);
+   }
+
+   __ BIND(L_exit);
+   // xmm_counter_shuf_mask shares xmm1 with xmm_from0, which both loops overwrite with
+   // input text before branching back to their loop tops. Reload the mask here so the
+   // counter is not shuffled with plaintext when at least one block was processed.
+   __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+   __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+   __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+   handleSOERegisters(false /*restoring*/);
+   __ movptr(rax, len_param); // return length
+   __ leave(); // required for proper stackwalking of RuntimeStub frame
+   __ ret(0);
+
+   // Out-of-line entry points for the longer keys: reset pos (rax still holds the
+   // key length from the dispatch above) and enter the matching generated loop.
+   __ BIND (L_key192_top);
+   __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+   __ jmp(L_multiBlock_loopTop[1]); //key192
+
+   __ BIND (L_key256_top);
+   __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+   __ jmp(L_multiBlock_loopTop[2]); //key256
+
+   return start;
+ }
+
+
// byte swap x86 long
address generate_ghash_long_swap_mask() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
address start = __ pc();
*** 3339,3348 ****
--- 3579,3593 ----
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
}
+ if (UseCTRAESIntrinsics) {
+ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+ }
+
// Generate GHASH intrinsics code
if (UseGHASHIntrinsics) {
StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
< prev index next >