    __ aesdec(xmm_result, xmm_key11);
    load_key(xmm_temp, key, 0xc0);
    __ aesdec(xmm_result, xmm_temp);
    load_key(xmm_temp, key, 0xd0);
    __ aesdec(xmm_result, xmm_temp);
    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
    __ aesdec(xmm_result, xmm_temp);
    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
    __ pxor(xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }

  // byte swap x86 long: swaps the two 64-bit halves of a 128-bit value
  address generate_ghash_long_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
    address start = __ pc();
    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none);
    __ emit_data64(0x0706050403020100, relocInfo::none);
    return start;
  }

  // byte swap x86 byte array: reverses all 16 bytes of a 128-bit value
  address generate_ghash_byte_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }
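
  // A minimal C++ model of pshufb, for reference when reading the masks
  // above (pshufb_model is a hypothetical helper for exposition only; it
  // is not used by the generated stubs). Each destination byte selects a
  // source byte by index, so the long-swap mask {8..15, 0..7} exchanges
  // the two 64-bit halves while the byte-swap mask {15..0} reverses all
  // 16 bytes. The case where a mask byte's high bit zeroes the
  // destination is omitted for brevity.
  static void pshufb_model(unsigned char dst[16], const unsigned char src[16],
                           const unsigned char mask[16]) {
    for (int i = 0; i < 16; i++) {
      dst[i] = src[mask[i] & 0x0f];   // low nibble indexes the source byte
    }
  }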

  /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    const Register state   = c_rarg0;
    const Register subkeyH = c_rarg1;
    const Register data    = c_rarg2;
    const Register blocks  = c_rarg3;

#ifdef _WIN64
    const int XMM_REG_LAST = 10;
#endif

    const XMMRegister xmm_temp0  = xmm0;
    const XMMRegister xmm_temp1  = xmm1;
    const XMMRegister xmm_temp2  = xmm2;
    const XMMRegister xmm_temp3  = xmm3;
    const XMMRegister xmm_temp4  = xmm4;
    const XMMRegister xmm_temp5  = xmm5;
    const XMMRegister xmm_temp6  = xmm6;
    const XMMRegister xmm_temp7  = xmm7;
    const XMMRegister xmm_temp8  = xmm8;
    const XMMRegister xmm_temp9  = xmm9;
    const XMMRegister xmm_temp10 = xmm10;

    __ enter();

#ifdef _WIN64
    // save xmm6-xmm10, which are callee-saved on Win64
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, xmm_temp10);

    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, xmm_temp10);

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key
    //
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp4, xmm_temp0);
    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1

    __ movdqu(xmm_temp5, xmm_temp0);
    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
    __ movdqu(xmm_temp6, xmm_temp0);
    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0

    __ movdqu(xmm_temp5, xmm_temp4);    // copy xmm4 into xmm5
    __ psrldq(xmm_temp4, 8);            // shift xmm4 right by 64 bits
    __ pslldq(xmm_temp5, 8);            // shift xmm5 left by 64 bits
    __ pxor(xmm_temp3, xmm_temp5);
    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
                                        // of the carry-less multiplication of
                                        // xmm0 by xmm1.
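
    // For reference, the schoolbook decomposition computed above, with
    // a = a1:a0 and b = b1:b0 split into 64-bit halves and '*' denoting
    // carry-less (GF(2)[x]) multiplication:
    //
    //   a * b = (a1*b1 << 128) ^ ((a0*b1 ^ a1*b0) << 64) ^ (a0*b0)
    //
    // The middle term straddles the two 128-bit halves of the result,
    // which is why xmm4 is split by the psrldq/pslldq pair above.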

    // We shift the result of the multiplication by one bit position
    // to the left to account for the fact that the bits are reversed.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp8, xmm_temp6);
    __ pslld(xmm_temp3, 1);
    __ pslld(xmm_temp6, 1);
    __ psrld(xmm_temp7, 31);
    __ psrld(xmm_temp8, 31);
    __ movdqu(xmm_temp9, xmm_temp7);
    __ pslldq(xmm_temp8, 4);
    __ pslldq(xmm_temp7, 4);
    __ psrldq(xmm_temp9, 12);
    __ por(xmm_temp3, xmm_temp7);
    __ por(xmm_temp6, xmm_temp8);
    __ por(xmm_temp6, xmm_temp9);
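
    // What the sequence above computes, per 32-bit lane (a sketch):
    //
    //   out[i] = (in[i] << 1) | (in[i-1] >> 31)    // with in[-1] == 0
    //
    // pslld supplies the in[i] << 1 part but cannot carry across lanes, so
    // psrld(..., 31) extracts the carry bits and pslldq(..., 4) moves them
    // up one lane before por() merges them back in; xmm9 carries the single
    // bit that crosses from the low half (xmm3) into the high half (xmm6).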

    //
    // First phase of the reduction
    //
    // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
    // independently.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp8, xmm_temp3);
    __ movdqu(xmm_temp9, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed left shift by 31
    __ pslld(xmm_temp8, 30);    // packed left shift by 30
    __ pslld(xmm_temp9, 25);    // packed left shift by 25
    __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
    __ pxor(xmm_temp7, xmm_temp9);
    __ movdqu(xmm_temp8, xmm_temp7);
    __ pslldq(xmm_temp7, 12);
    __ psrldq(xmm_temp8, 4);
    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
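
    // Note the shift amounts: 31 = 32 - 1, 30 = 32 - 2, 25 = 32 - 7,
    // matching the x, x^2 and x^7 terms of the GHASH polynomial
    // g(x) = x^128 + x^7 + x^2 + x + 1 in the bit-reflected
    // representation used throughout.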

    //
    // Second phase of the reduction
    //
    // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
    // shift operations.
    __ movdqu(xmm_temp2, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed right shift by 1
    __ psrld(xmm_temp4, 2);     // packed right shift by 2
    __ psrld(xmm_temp5, 7);     // packed right shift by 7
    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp8);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
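
    // Net effect of the two phases (a sketch): the 256-bit product is
    // reduced modulo g(x) = x^128 + x^7 + x^2 + x + 1 by folding the half
    // that holds the high-degree coefficients (xmm3, because of the
    // reflected bit order) into the other half, using
    // x^128 = x^7 + x^2 + x + 1 (mod g(x)).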

    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
    __ pshufb(xmm_temp6, xmm_temp10);           // Byte swap 16-byte result
    __ movdqu(Address(state, 0), xmm_temp6);    // store the result

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ leave();
    __ ret(0);
    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   rax       - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();
    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
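
    // For reference, a bitwise sketch of the function this stub computes
    // (the stub itself uses a CLMUL-based folding kernel, not this loop):
    //
    //   uint32_t crc32(uint32_t crc, const uint8_t* buf, int len) {
    //     crc = ~crc;
    //     for (int i = 0; i < len; i++) {
    //       crc ^= buf[i];
    //       for (int k = 0; k < 8; k++)
    //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    //     }
    //     return ~crc;
    //   }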
    StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
    StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
    StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
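
    // These masks back the compiled abs/neg idioms on floats and doubles
    // (a sketch of their use, not code emitted here):
    //   xorps xmm, [float_sign_flip]     // negate float:  flip the sign bits
    //   xorpd xmm, [double_sign_flip]    // negate double: flip the sign bit
    //   andpd xmm, [double_sign_mask]    // abs double:    clear the sign bit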

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    generate_math_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                   &StubRoutines::_safefetch32_fault_pc,
                                                   &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
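
    // For reference: SafeFetch32/SafeFetchN perform a load that is allowed
    // to fault. If it does, the VM's signal handler matches the faulting pc
    // against the recorded fault_pc and resumes at the corresponding
    // continuation_pc, which returns the caller-supplied default value
    // instead of the loaded one.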
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
