--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-11-24 10:02:27.303863600 -0800
+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-11-24 10:02:26.901540300 -0800
@@ -3054,6 +3054,15 @@
     return start;
   }
 
+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -3064,7 +3073,19 @@
       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     }
   }
-
+
+  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrq(reg, xmmdst, 0x0);
+    __ addq(reg, inc_delta);
+    __ pinsrq(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+    __ pextrq(reg, xmmdst, 0x01);              // carry into the high qword
+    __ addq(reg, 0x01);
+    __ pinsrq(xmmdst, reg, 0x01);              // carry handled
+    __ BIND(next_block);                       // next instruction
+  }
+
   // Arguments:
   //
   // Inputs:
@@ -3715,6 +3736,219 @@
     return start;
   }
 
+  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from = c_rarg0;    // source array address
+    const Register to = c_rarg1;      // destination array address
+    const Register key = c_rarg2;     // key array address
+    const Register counter = c_rarg3; // counter byte array initialized from counter array address
+                                      // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
+#else
+    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+    const Register len_reg = r10;     // pick the first volatile windows register
+#endif
+    const Register pos = rax;         // pos is the return value
+
+    const int PARALLEL_FACTOR = 6;
+    const XMMRegister xmm_counter_shuf_mask = xmm0;
+    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+    const XMMRegister xmm_curr_counter = xmm2;
+
+    const XMMRegister xmm_key_tmp0 = xmm3;
+    const XMMRegister xmm_key_tmp1 = xmm4;
+
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm5;
+    const XMMRegister xmm_result1 = xmm6;
+    const XMMRegister xmm_result2 = xmm7;
+    const XMMRegister xmm_result3 = xmm8;
+    const XMMRegister xmm_result4 = xmm9;
+    const XMMRegister xmm_result5 = xmm10;
+
+    const XMMRegister xmm_from0 = xmm11;
+    const XMMRegister xmm_from1 = xmm12;
+    const XMMRegister xmm_from2 = xmm13;
+    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
+    const XMMRegister xmm_from4 = xmm3; // reuse xmm3~4; xmm_key_tmp0~1 are no longer needed once the input text is loaded
+    const XMMRegister xmm_from5 = xmm4;
+
+    // for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_multiBlock_loopTop[3];
+    Label L_singleBlockLoopTop[3];
+    Label L__incCounter[3][6];     // for 6 blocks
+    Label L__incCounter_single[3]; // for single block, key128, key192, key256
+
+    Label L_exit;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode.
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-14
+    const int XMM_REG_NUM_KEY_LAST = 14;
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#else
+    __ push(len_reg); // Save
+#endif
+    __ push(rbx); // Save RBX
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled
+    __ movptr(pos, 0);
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
+
+#define CTR_DoSix(opc, src_reg)          \
+    __ opc(xmm_result0, src_reg);        \
+    __ opc(xmm_result1, src_reg);        \
+    __ opc(xmm_result2, src_reg);        \
+    __ opc(xmm_result3, src_reg);        \
+    __ opc(xmm_result4, src_reg);        \
+    __ opc(xmm_result5, src_reg);
+
+    // k == 0 : generate code for key_128
+    // k == 1 : generate code for key_192
+    // k == 2 : generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks are left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+
+      // load, then increase counters
+      CTR_DoSix(movdqa, xmm_curr_counter);
+      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+      inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
+      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for the PXOR
+      CTR_DoSix(pxor, xmm_key_tmp0);            // PXOR with Round 0 key
+
+      // load two ROUND_KEYs at a time
+      for (int i = 1; i < rounds[k]; ) {
+        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+        CTR_DoSix(aesenc, xmm_key_tmp1);
+        i++;
+        if (i != rounds[k]) {
+          CTR_DoSix(aesenc, xmm_key_tmp0);
+        } else {
+          CTR_DoSix(aesenclast, xmm_key_tmp0);
+        }
+        i++;
+      }
+
+      // get next PARALLEL_FACTOR blocks into xmm_result registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+
+      __ pxor(xmm_result0, xmm_from0);
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+      __ pxor(xmm_result3, xmm_from3);
+      __ pxor(xmm_result4, xmm_from4);
+      __ pxor(xmm_result5, xmm_from5);
+
+      // store 6 results into the next 96 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);     // advance the position in the crypt text
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // single-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::equal, L_exit);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key_tmp0);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key_tmp0);
+      }
+      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key_tmp0);
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result0, xmm_from0);
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jmp(L_singleBlockLoopTop[k]);
+    }
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // shuffle the counter back
+    __ movdqu(Address(counter, 0), xmm_curr_counter);   // save the counter back
+    __ pop(rbx); // pop the saved RBX
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+    __ movl(rax, len_mem);
+#else
+    __ pop(rax); // return length
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
@@ -4474,11 +4708,14 @@
     // don't bother generating these AES intrinsic stubs unless global flag is set
     if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
-
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
-      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
+    }
+    if (UseCTRAESIntrinsics) {
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
    }
 
    // Generate GHASH intrinsics code
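
Note on the counter arithmetic: the CTR counter block is a 128-bit big-endian integer (NIST SP
800-38A), so the stub byte-swaps it once with the counter shuffle mask, does plain little-endian
arithmetic on the two qwords, and only swaps back for the AES rounds and the final store.
inc_counter() adds the delta to the low qword with pextrq/addq/pinsrq and, on carry-out, bumps
the high qword. A minimal scalar sketch of the same increment (Counter128 and this scalar
inc_counter are illustrative only, not part of the patch):

    #include <cstdint>

    // Shuffled (little-endian) view of the 128-bit counter.
    struct Counter128 {
      uint64_t lo; // low qword, lane 0 of the xmm register
      uint64_t hi; // high qword, lane 1
    };

    // Mirrors the stub's inc_counter(): add delta to the low qword;
    // on carry-out, take the slow path and bump the high qword.
    inline void inc_counter(Counter128& c, uint64_t delta) {
      uint64_t old_lo = c.lo;
      c.lo += delta;
      if (c.lo < old_lo) { // carry set -> the pextrq/addq/pinsrq carry path
        c.hi += 1;
      }
    }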
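
The dispatch on the key array length uses the fact that an expanded AES key holds
4 * (rounds + 1) ints: 44/52/60 ints select the AES-128/192/256 paths with 10/12/14 rounds,
which is what the rounds[] table and the cmpl(rbx, 52)/cmpl(rbx, 60) checks encode. As a sketch
(the helper name is illustrative only):

    // 4 * (rounds + 1) ints of expanded key -> number of AES rounds.
    inline int rounds_for_key_ints(int key_ints) {
      switch (key_ints) {
        case 44: return 10; // AES-128
        case 52: return 12; // AES-192
        case 60: return 14; // AES-256
        default: return -1; // unreachable for valid expanded keys
      }
    }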
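
The per-block data flow is the standard CTR construction: each output block is the input block
XORed with AES(counter), so the same stub serves both encryption and decryption. A hedged sketch
of the single-block path, reusing the Counter128/inc_counter sketch above (aes_encrypt_block is
an assumed stand-in for the aesenc/aesenclast round sequence, not a routine in this file):

    #include <cstddef>

    // Assumed helper: runs the full AES round sequence on one counter block.
    void aes_encrypt_block(const Counter128& ctr, const uint32_t* key_schedule,
                           uint8_t out[16]);

    void ctr_crypt(const uint8_t* from, uint8_t* to, size_t len,
                   const uint32_t* key_schedule, Counter128& ctr) {
      uint8_t keystream[16];
      for (size_t pos = 0; pos + 16 <= len; pos += 16) {
        aes_encrypt_block(ctr, key_schedule, keystream); // encrypt the counter, not the data
        for (int i = 0; i < 16; i++) {
          to[pos + i] = from[pos + i] ^ keystream[i];    // matches the movdqu/pxor/movdqu pattern
        }
        inc_counter(ctr, 1);                             // next block gets the next counter
      }
    }

The parallel loop is the same flow unrolled six ways (PARALLEL_FACTOR) so that six independent
aesenc dependency chains are in flight at once, hiding the multi-cycle latency of each aesenc.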