3664 __ aesdec(xmm_result, xmm_key11); 3665 load_key(xmm_temp, key, 0xc0); 3666 __ aesdec(xmm_result, xmm_temp); 3667 load_key(xmm_temp, key, 0xd0); 3668 __ aesdec(xmm_result, xmm_temp); 3669 load_key(xmm_temp, key, 0xe0); // 256-bit key goes up to e0 3670 __ aesdec(xmm_result, xmm_temp); 3671 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 3672 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3673 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3674 // no need to store r to memory until we exit 3675 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3676 __ addptr(pos, AESBlockSize); 3677 __ subptr(len_reg, AESBlockSize); 3678 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 3679 __ jmp(L_exit); 3680 3681 return start; 3682 } 3683 3684 /** 3685 * Arguments: 3686 * 3687 * Inputs: 3688 * c_rarg0 - int crc 3689 * c_rarg1 - byte* buf 3690 * c_rarg2 - int length 3691 * 3692 * Ouput: 3693 * rax - int crc result 3694 */ 3695 address generate_updateBytesCRC32() { 3696 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions"); 3697 3698 __ align(CodeEntryAlignment); 3699 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3700 3701 address start = __ pc(); 3702 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 3703 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 
4101 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 4102 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); 4103 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); 4104 4105 // support for verify_oop (must happen after universe_init) 4106 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4107 4108 // arraycopy stubs used by compilers 4109 generate_arraycopy_stubs(); 4110 4111 generate_math_stubs(); 4112 4113 // don't bother generating these AES intrinsic stubs unless global flag is set 4114 if (UseAESIntrinsics) { 4115 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others 4116 4117 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4118 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4119 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4120 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 4121 } 4122 4123 // Safefetch stubs. 
4124 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4125 &StubRoutines::_safefetch32_fault_pc, 4126 &StubRoutines::_safefetch32_continuation_pc); 4127 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4128 &StubRoutines::_safefetchN_fault_pc, 4129 &StubRoutines::_safefetchN_continuation_pc); 4130 #ifdef COMPILER2 4131 if (UseMultiplyToLenIntrinsic) { 4132 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4133 } 4134 if (UseSquareToLenIntrinsic) { 4135 StubRoutines::_squareToLen = generate_squareToLen(); 4136 } 4137 if (UseMulAddIntrinsic) { 4138 StubRoutines::_mulAdd = generate_mulAdd(); 4139 } 4140 #endif | 3664 __ aesdec(xmm_result, xmm_key11); 3665 load_key(xmm_temp, key, 0xc0); 3666 __ aesdec(xmm_result, xmm_temp); 3667 load_key(xmm_temp, key, 0xd0); 3668 __ aesdec(xmm_result, xmm_temp); 3669 load_key(xmm_temp, key, 0xe0); // 256-bit key goes up to e0 3670 __ aesdec(xmm_result, xmm_temp); 3671 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 3672 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3673 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3674 // no need to store r to memory until we exit 3675 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3676 __ addptr(pos, AESBlockSize); 3677 __ subptr(len_reg, AESBlockSize); 3678 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 3679 __ jmp(L_exit); 3680 3681 return start; 3682 } 3683 3684 3685 // byte swap x86 long 3686 address generate_ghash_long_swap_mask() { 3687 __ align(CodeEntryAlignment); 3688 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); 3689 address start = __ pc(); 3690 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); 3691 __ emit_data64(0x0706050403020100, relocInfo::none ); 3692 return start; 3693 } 3694 3695 // byte swap x86 
// byte array
address generate_ghash_byte_swap_mask() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
  address start = __ pc();
  __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
  __ emit_data64(0x0001020304050607, relocInfo::none );
  return start;
}

/* Single and multi-block ghash operations */
address generate_ghash_processBlocks() {
  __ align(CodeEntryAlignment);
  Label L_ghash_loop, L_exit;
  StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
  address start = __ pc();

  // Stub arguments:
  //   state   - pointer to the 16-byte GHASH state; read at entry,
  //             updated in place before returning
  //   subkeyH - pointer to the 16-byte hash subkey H
  //   data    - pointer to the input, processed 16 bytes at a time
  //   blocks  - number of 16-byte blocks to process (loop counter)
  const Register state   = c_rarg0;
  const Register subkeyH = c_rarg1;
  const Register data    = c_rarg2;
  const Register blocks  = c_rarg3;

#ifdef _WIN64
  const int XMM_REG_LAST = 10;
#endif

  const XMMRegister xmm_temp0  = xmm0;
  const XMMRegister xmm_temp1  = xmm1;
  const XMMRegister xmm_temp2  = xmm2;
  const XMMRegister xmm_temp3  = xmm3;
  const XMMRegister xmm_temp4  = xmm4;
  const XMMRegister xmm_temp5  = xmm5;
  const XMMRegister xmm_temp6  = xmm6;
  const XMMRegister xmm_temp7  = xmm7;
  const XMMRegister xmm_temp8  = xmm8;
  const XMMRegister xmm_temp9  = xmm9;
  const XMMRegister xmm_temp10 = xmm10;

  __ enter();

#ifdef _WIN64
  // save the xmm registers which must be preserved 6-10
  __ subptr(rsp, -rsp_after_call_off * wordSize);
  for (int i = 6; i <= XMM_REG_LAST; i++) {
    __ movdqu(xmm_save(i), as_XMMRegister(i));
  }
#endif

  // Mask for swapping the two 64-bit halves; loaded once and reused below.
  __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

  // Load the incoming state and put it into the order the multiply expects.
  __ movdqu(xmm_temp0, Address(state, 0));
  __ pshufb(xmm_temp0, xmm_temp10);


  __ BIND(L_ghash_loop);
  // Fetch the next 16-byte data block and byte-swap it.
  __ movdqu(xmm_temp2, Address(data, 0));
  __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  // Fetch and swap the hash subkey H.
  __ movdqu(xmm_temp1, Address(subkeyH, 0));
  __ pshufb(xmm_temp1, xmm_temp10);

  __ pxor(xmm_temp0, xmm_temp2);              // state ^= data block

  //
  // Multiply with the hash key: 128x128 -> 256 bit carry-less product
  // computed schoolbook-style with four PCLMULQDQs.
  //
  __ movdqu(xmm_temp3, xmm_temp0);
  __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
  __ movdqu(xmm_temp4, xmm_temp0);
  __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1

  __ movdqu(xmm_temp5, xmm_temp0);
  __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
  __ movdqu(xmm_temp6, xmm_temp0);
  __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

  __ pxor(xmm_temp4, xmm_temp5);              // xmm4 holds a0*b1 + a1*b0

  // Split the middle term across the 128-bit halves of the product.
  __ movdqu(xmm_temp5, xmm_temp4);            // move the contents of xmm4 to xmm5
  __ psrldq(xmm_temp4, 8);                    // shift xmm4 by 64 bits to the right
  __ pslldq(xmm_temp5, 8);                    // shift xmm5 by 64 bits to the left
  __ pxor(xmm_temp3, xmm_temp5);
  __ pxor(xmm_temp6, xmm_temp4);              // Register pair <xmm6:xmm3> holds the result
                                              // of the carry-less multiplication of
                                              // xmm0 by xmm1.

  // We shift the result of the multiplication by one bit position
  // to the left to cope for the fact that the bits are reversed.
  __ movdqu(xmm_temp7, xmm_temp3);
  __ movdqu(xmm_temp8, xmm_temp6);
  __ pslld(xmm_temp3, 1);                     // each dword << 1
  __ pslld(xmm_temp6, 1);
  __ psrld(xmm_temp7, 31);                    // capture the carry bit of each dword
  __ psrld(xmm_temp8, 31);
  __ movdqu(xmm_temp9, xmm_temp7);
  __ pslldq(xmm_temp8, 4);                    // move carries up to the next dword
  __ pslldq(xmm_temp7, 4);
  __ psrldq(xmm_temp9, 12);                   // carry out of the low 128 bits
  __ por(xmm_temp3, xmm_temp7);
  __ por(xmm_temp6, xmm_temp8);
  __ por(xmm_temp6, xmm_temp9);

  //
  // First phase of the reduction
  //
  // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
  // independently.
  __ movdqu(xmm_temp7, xmm_temp3);
  __ movdqu(xmm_temp8, xmm_temp3);
  __ movdqu(xmm_temp9, xmm_temp3);
  __ pslld(xmm_temp7, 31);                    // packed left shift, each dword << 31
  __ pslld(xmm_temp8, 30);                    // packed left shift, each dword << 30
  __ pslld(xmm_temp9, 25);                    // packed left shift, each dword << 25
  __ pxor(xmm_temp7, xmm_temp8);              // xor the shifted versions
  __ pxor(xmm_temp7, xmm_temp9);
  __ movdqu(xmm_temp8, xmm_temp7);
  __ pslldq(xmm_temp7, 12);
  __ psrldq(xmm_temp8, 4);
  __ pxor(xmm_temp3, xmm_temp7);              // first phase of the reduction complete

  //
  // Second phase of the reduction
  //
  // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
  // shift operations.
  __ movdqu(xmm_temp2, xmm_temp3);
  __ movdqu(xmm_temp4, xmm_temp3);
  __ movdqu(xmm_temp5, xmm_temp3);
  __ psrld(xmm_temp2, 1);                     // packed right shift, each dword >> 1
  __ psrld(xmm_temp4, 2);                     // packed right shift, each dword >> 2
  __ psrld(xmm_temp5, 7);                     // packed right shift, each dword >> 7
  __ pxor(xmm_temp2, xmm_temp4);              // xor the shifted versions
  __ pxor(xmm_temp2, xmm_temp5);
  __ pxor(xmm_temp2, xmm_temp8);
  __ pxor(xmm_temp3, xmm_temp2);
  __ pxor(xmm_temp6, xmm_temp3);              // the result is in xmm6

  __ decrement(blocks);
  __ jcc(Assembler::zero, L_exit);
  __ movdqu(xmm_temp0, xmm_temp6);            // result feeds the next iteration's state
  __ addptr(data, 16);                        // advance to the next 16-byte block
  __ jmp(L_ghash_loop);

  __ BIND(L_exit);
  __ pshufb(xmm_temp6, xmm_temp10);           // Byte swap 16-byte result
  __ movdqu(Address(state, 0), xmm_temp6);    // store the result

#ifdef _WIN64
  // restore xmm regs belonging to calling function
  for (int i = 6; i <= XMM_REG_LAST; i++) {
    __ movdqu(as_XMMRegister(i), xmm_save(i));
  }
#endif
  __ leave();
  __ ret(0);
  return start;
}

/**
 * Arguments:
 *
 * Inputs:
 *   c_rarg0   - int crc
 *   c_rarg1   - byte* buf
 *   c_rarg2   - int length
 *
 * Output:
 *
rax - int crc result 3863 */ 3864 address generate_updateBytesCRC32() { 3865 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions"); 3866 3867 __ align(CodeEntryAlignment); 3868 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3869 3870 address start = __ pc(); 3871 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 3872 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 4270 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 4271 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); 4272 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); 4273 4274 // support for verify_oop (must happen after universe_init) 4275 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4276 4277 // arraycopy stubs used by compilers 4278 generate_arraycopy_stubs(); 4279 4280 generate_math_stubs(); 4281 4282 // don't bother generating these AES intrinsic stubs unless global flag is set 4283 if (UseAESIntrinsics) { 4284 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others 4285 4286 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4287 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4288 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4289 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 4290 } 4291 4292 // Generate GHASH intrinsics code 4293 if (UseGHASHIntrinsics) { 4294 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); 4295 StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); 4296 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4297 } 4298 4299 // Safefetch stubs. 
4300 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4301 &StubRoutines::_safefetch32_fault_pc, 4302 &StubRoutines::_safefetch32_continuation_pc); 4303 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4304 &StubRoutines::_safefetchN_fault_pc, 4305 &StubRoutines::_safefetchN_continuation_pc); 4306 #ifdef COMPILER2 4307 if (UseMultiplyToLenIntrinsic) { 4308 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4309 } 4310 if (UseSquareToLenIntrinsic) { 4311 StubRoutines::_squareToLen = generate_squareToLen(); 4312 } 4313 if (UseMulAddIntrinsic) { 4314 StubRoutines::_mulAdd = generate_mulAdd(); 4315 } 4316 #endif |