src/cpu/x86/vm/stubGenerator_x86_64.cpp

rev 9055 : 8073108: Use x86 and SPARC CPU instructions for GHASH acceleration
Reviewed-by: kvn, jrose


3622     __ aesdec(xmm_result, xmm_key11);
3623     load_key(xmm_temp, key, 0xc0);
3624     __ aesdec(xmm_result, xmm_temp);
3625     load_key(xmm_temp, key, 0xd0);
3626     __ aesdec(xmm_result, xmm_temp);
3627     load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
3628     __ aesdec(xmm_result, xmm_temp);
3629     __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
3630     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3631     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
3632     // no need to store r to memory until we exit
3633     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
3634     __ addptr(pos, AESBlockSize);
3635     __ subptr(len_reg, AESBlockSize);
3636     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
3637     __ jmp(L_exit);
3638 
3639     return start;
3640   }
3641 
3642   /**
3643    *  Arguments:
3644    *
3645    * Inputs:
3646    *   c_rarg0   - int crc
3647    *   c_rarg1   - byte* buf
3648    *   c_rarg2   - int length
3649    *
3650    * Output:
3651    *       rax   - int crc result
3652    */
3653   address generate_updateBytesCRC32() {
3654     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3655 
3656     __ align(CodeEntryAlignment);
3657     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3658 
3659     address start = __ pc();
3660     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
3661     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)


4058     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
4059     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
4060     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
4061 
4062     // support for verify_oop (must happen after universe_init)
4063     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4064 
4065     // arraycopy stubs used by compilers
4066     generate_arraycopy_stubs();
4067 
4068     generate_math_stubs();
4069 
4070     // don't bother generating these AES intrinsic stubs unless global flag is set
4071     if (UseAESIntrinsics) {
4072       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
4073 
4074       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4075       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4076       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4077       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4078     }
4079 
4080     // Safefetch stubs.
4081     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4082                                                        &StubRoutines::_safefetch32_fault_pc,
4083                                                        &StubRoutines::_safefetch32_continuation_pc);
4084     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4085                                                        &StubRoutines::_safefetchN_fault_pc,
4086                                                        &StubRoutines::_safefetchN_continuation_pc);
4087 #ifdef COMPILER2
4088     if (UseMultiplyToLenIntrinsic) {
4089       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4090     }
4091     if (UseSquareToLenIntrinsic) {
4092       StubRoutines::_squareToLen = generate_squareToLen();
4093     }
4094     if (UseMulAddIntrinsic) {
4095       StubRoutines::_mulAdd = generate_mulAdd();
4096     }
4097 




3622     __ aesdec(xmm_result, xmm_key11);
3623     load_key(xmm_temp, key, 0xc0);
3624     __ aesdec(xmm_result, xmm_temp);
3625     load_key(xmm_temp, key, 0xd0);
3626     __ aesdec(xmm_result, xmm_temp);
3627     load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
3628     __ aesdec(xmm_result, xmm_temp);
3629     __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
3630     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3631     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
3632     // no need to store r to memory until we exit
3633     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
3634     __ addptr(pos, AESBlockSize);
3635     __ subptr(len_reg, AESBlockSize);
3636     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
3637     __ jmp(L_exit);
3638 
3639     return start;
3640   }
3641 
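
The loop tail above implements the chaining step of CBC decryption for 256-bit keys: the aesdec rounds consume round keys at offsets 0x00 through 0xe0 (15 round keys of 16 bytes), the decrypted block is XORed with the previous ciphertext block (the "r vector"), the plaintext is stored, and the current ciphertext becomes the r vector for the next block. A minimal C++ sketch of that chaining logic, with a hypothetical block_decrypt() standing in for the AES round sequence (it is not part of these sources), is:

    #include <stdint.h>
    #include <string.h>

    enum { AESBlockSize = 16 };

    // Sketch of the CBC-decrypt chaining performed by the stub above.
    static void cbc_decrypt(uint8_t* to, const uint8_t* from, size_t len,
                            uint8_t rvec[AESBlockSize],
                            void (*block_decrypt)(uint8_t out[16], const uint8_t in[16])) {
      uint8_t plain[AESBlockSize];
      uint8_t saved[AESBlockSize];
      for (size_t pos = 0; pos < len; pos += AESBlockSize) {
        memcpy(saved, from + pos, AESBlockSize);   // xmm_prev_block_cipher_save
        block_decrypt(plain, from + pos);          // aesdec rounds + aesdeclast
        for (int i = 0; i < AESBlockSize; i++) {
          plain[i] ^= rvec[i];                     // pxor with previous ciphertext
        }
        memcpy(to + pos, plain, AESBlockSize);     // store plaintext
        memcpy(rvec, saved, AESBlockSize);         // next r vector = this ciphertext
      }
      // like the stub, the caller writes the final r vector back only on exit
    }
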
3642 
3643   // byte swap x86 long
3644   address generate_ghash_long_swap_mask() {
3645     __ align(CodeEntryAlignment);
3646     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3647     address start = __ pc();
3648     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
3649     __ emit_data64(0x0706050403020100, relocInfo::none );
3650     return start;
3651   }
3652 
3653   // byte swap x86 byte array
3654   address generate_ghash_byte_swap_mask() {
3655     __ align(CodeEntryAlignment);
3656     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3657     address start = __ pc();
3658     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
3659     __ emit_data64(0x0001020304050607, relocInfo::none );
3660     return start;
3661   }
3662 
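Both stubs above emit 16-byte pshufb control masks: with pshufb, destination byte i becomes source byte mask[i] (or zero when the mask byte's high bit is set). Reading the emit_data64 constants as they land in little-endian memory, ghash_long_swap_mask is the byte sequence 8,9,...,15,0,1,...,7, which swaps the two 64-bit halves of a 128-bit value, while ghash_byte_swap_mask is 15,14,...,1,0, which reverses all 16 bytes of a data block. A small scalar model of the shuffle, useful only for checking these constants (it is not part of these sources), is:

    #include <stdint.h>

    // Scalar model of PSHUFB: dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f].
    static void pshufb_model(uint8_t dst[16], const uint8_t src[16], const uint8_t mask[16]) {
      for (int i = 0; i < 16; i++) {
        dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
      }
    }

    // ghash_long_swap_mask as laid out by the two emit_data64 calls above
    // (0x0f0e0d0c0b0a0908 then 0x0706050403020100, stored little-endian):
    static const uint8_t long_swap_mask[16] =
      { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };

    // ghash_byte_swap_mask (0x08090a0b0c0d0e0f then 0x0001020304050607):
    static const uint8_t byte_swap_mask[16] =
      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
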
3663   /* Single and multi-block ghash operations */
3664   address generate_ghash_processBlocks() {
3665     __ align(CodeEntryAlignment);
3666     Label L_ghash_loop, L_exit;
3667     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3668     address start = __ pc();
3669 
3670     const Register state        = c_rarg0;
3671     const Register subkeyH      = c_rarg1;
3672     const Register data         = c_rarg2;
3673     const Register blocks       = c_rarg3;
3674 
3675 #ifdef _WIN64
3676     const int XMM_REG_LAST  = 10;
3677 #endif
3678 
3679     const XMMRegister xmm_temp0 = xmm0;
3680     const XMMRegister xmm_temp1 = xmm1;
3681     const XMMRegister xmm_temp2 = xmm2;
3682     const XMMRegister xmm_temp3 = xmm3;
3683     const XMMRegister xmm_temp4 = xmm4;
3684     const XMMRegister xmm_temp5 = xmm5;
3685     const XMMRegister xmm_temp6 = xmm6;
3686     const XMMRegister xmm_temp7 = xmm7;
3687     const XMMRegister xmm_temp8 = xmm8;
3688     const XMMRegister xmm_temp9 = xmm9;
3689     const XMMRegister xmm_temp10 = xmm10;
3690 
3691     __ enter();
3692 
3693 #ifdef _WIN64
3694     // save xmm registers 6-10, which must be preserved (callee-saved on Win64)
3695     __ subptr(rsp, -rsp_after_call_off * wordSize);
3696     for (int i = 6; i <= XMM_REG_LAST; i++) {
3697       __ movdqu(xmm_save(i), as_XMMRegister(i));
3698     }
3699 #endif
3700 
3701     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3702 
3703     __ movdqu(xmm_temp0, Address(state, 0));
3704     __ pshufb(xmm_temp0, xmm_temp10);
3705 
3706 
3707     __ BIND(L_ghash_loop);
3708     __ movdqu(xmm_temp2, Address(data, 0));
3709     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3710 
3711     __ movdqu(xmm_temp1, Address(subkeyH, 0));
3712     __ pshufb(xmm_temp1, xmm_temp10);
3713 
3714     __ pxor(xmm_temp0, xmm_temp2);
3715 
3716     //
3717     // Multiply with the hash key
3718     //
3719     __ movdqu(xmm_temp3, xmm_temp0);
3720     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
3721     __ movdqu(xmm_temp4, xmm_temp0);
3722     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
3723 
3724     __ movdqu(xmm_temp5, xmm_temp0);
3725     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
3726     __ movdqu(xmm_temp6, xmm_temp0);
3727     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
3728 
3729     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
3730 
3731     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
3732     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
3733     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
3734     __ pxor(xmm_temp3, xmm_temp5);
3735     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
3736                                         // of the carry-less multiplication of
3737                                         // xmm0 by xmm1.
3738 
3739     // We shift the result of the multiplication by one bit position
3740     // to the left to compensate for the fact that the bits are reversed.
3741     __ movdqu(xmm_temp7, xmm_temp3);
3742     __ movdqu(xmm_temp8, xmm_temp6);
3743     __ pslld(xmm_temp3, 1);
3744     __ pslld(xmm_temp6, 1);
3745     __ psrld(xmm_temp7, 31);
3746     __ psrld(xmm_temp8, 31);
3747     __ movdqu(xmm_temp9, xmm_temp7);
3748     __ pslldq(xmm_temp8, 4);
3749     __ pslldq(xmm_temp7, 4);
3750     __ psrldq(xmm_temp9, 12);
3751     __ por(xmm_temp3, xmm_temp7);
3752     __ por(xmm_temp6, xmm_temp8);
3753     __ por(xmm_temp6, xmm_temp9);
3754 
3755     //
3756     // First phase of the reduction
3757     //
3758     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
3759     // independently.
3760     __ movdqu(xmm_temp7, xmm_temp3);
3761     __ movdqu(xmm_temp8, xmm_temp3);
3762     __ movdqu(xmm_temp9, xmm_temp3);
3763     __ pslld(xmm_temp7, 31);    // packed left shift by 31
3764     __ pslld(xmm_temp8, 30);    // packed left shift by 30
3765     __ pslld(xmm_temp9, 25);    // packed left shift by 25
3766     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
3767     __ pxor(xmm_temp7, xmm_temp9);
3768     __ movdqu(xmm_temp8, xmm_temp7);
3769     __ pslldq(xmm_temp7, 12);
3770     __ psrldq(xmm_temp8, 4);
3771     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
3772 
3773     //
3774     // Second phase of the reduction
3775     //
3776     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
3777     // shift operations.
3778     __ movdqu(xmm_temp2, xmm_temp3);
3779     __ movdqu(xmm_temp4, xmm_temp3);
3780     __ movdqu(xmm_temp5, xmm_temp3);
3781     __ psrld(xmm_temp2, 1);     // packed right shift by 1
3782     __ psrld(xmm_temp4, 2);     // packed right shift by 2
3783     __ psrld(xmm_temp5, 7);     // packed right shift by 7
3784     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
3785     __ pxor(xmm_temp2, xmm_temp5);
3786     __ pxor(xmm_temp2, xmm_temp8);
3787     __ pxor(xmm_temp3, xmm_temp2);
3788     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
3789 
3790     __ decrement(blocks);
3791     __ jcc(Assembler::zero, L_exit);
3792     __ movdqu(xmm_temp0, xmm_temp6);
3793     __ addptr(data, 16);
3794     __ jmp(L_ghash_loop);
3795 
3796     __ BIND(L_exit);
3797     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
3798     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
3799 
3800 #ifdef _WIN64
3801     // restore xmm regs belonging to calling function
3802     for (int i = 6; i <= XMM_REG_LAST; i++) {
3803       __ movdqu(as_XMMRegister(i), xmm_save(i));
3804     }
3805 #endif
3806     __ leave();
3807     __ ret(0);
3808     return start;
3809   }
3810 
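Taken together, the loop above computes, for each 16-byte block, state = (state XOR block) * H in GF(2^128): the four pclmulqdq instructions form the 256-bit carry-less product (imm8 bit 0 selects the low or high qword of the destination operand and bit 4 that of the source, hence 0, 16, 1, 17 for a0*b0, a0*b1, a1*b0 and a1*b1), the one-bit left shift compensates for GHASH's reflected bit order, and the two reduction phases fold the product back to 128 bits modulo x^128 + x^7 + x^2 + x + 1. A bit-serial reference model of the same operation (the right-shift algorithm from NIST SP 800-38D, offered only as a sketch to check the stub against, not part of these sources):

    #include <stdint.h>

    // 128-bit GHASH element: hi holds bytes 0..7 of the block, lo bytes 8..15.
    struct be128 { uint64_t hi, lo; };

    static be128 load_be128(const uint8_t b[16]) {
      be128 v = { 0, 0 };
      for (int i = 0; i < 8;  i++) v.hi = (v.hi << 8) | b[i];
      for (int i = 8; i < 16; i++) v.lo = (v.lo << 8) | b[i];
      return v;
    }

    // Carry-less multiply reduced by x^128 + x^7 + x^2 + x + 1 (NIST SP 800-38D).
    static be128 gf128_mul(be128 x, be128 y) {
      be128 z = { 0, 0 };
      be128 v = y;
      for (int i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1 : (x.lo >> (127 - i)) & 1;
        if (bit) { z.hi ^= v.hi; z.lo ^= v.lo; }
        uint64_t lsb = v.lo & 1;
        v.lo = (v.lo >> 1) | (v.hi << 63);        // v = v >> 1
        v.hi >>= 1;
        if (lsb) v.hi ^= 0xe100000000000000ULL;   // fold the dropped bit back in
      }
      return z;
    }

    // Per-block loop matching ghash_processBlocks(state, subkeyH, data, blocks).
    static void ghash_blocks(be128* state, be128 h, const uint8_t* data, int blocks) {
      while (blocks-- > 0) {
        be128 d = load_be128(data);
        state->hi ^= d.hi;                        // pxor(state, data block)
        state->lo ^= d.lo;
        *state = gf128_mul(*state, h);            // carry-less multiply + reduce
        data += 16;
      }
    }
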
3811   /**
3812    *  Arguments:
3813    *
3814    * Inputs:
3815    *   c_rarg0   - int crc
3816    *   c_rarg1   - byte* buf
3817    *   c_rarg2   - int length
3818    *
3819    * Output:
3820    *       rax   - int crc result
3821    */
3822   address generate_updateBytesCRC32() {
3823     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3824 
3825     __ align(CodeEntryAlignment);
3826     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3827 
3828     address start = __ pc();
3829     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
3830     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)


4227     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
4228     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
4229     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
4230 
4231     // support for verify_oop (must happen after universe_init)
4232     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4233 
4234     // arraycopy stubs used by compilers
4235     generate_arraycopy_stubs();
4236 
4237     generate_math_stubs();
4238 
4239     // don't bother generating these AES intrinsic stubs unless global flag is set
4240     if (UseAESIntrinsics) {
4241       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
4242 
4243       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4244       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4245       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4246       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4247     }
4248 
4249     // Generate GHASH intrinsics code
4250     if (UseGHASHIntrinsics) {
4251       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
4252       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
4253       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4254     }
4255 
4256     // Safefetch stubs.
4257     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4258                                                        &StubRoutines::_safefetch32_fault_pc,
4259                                                        &StubRoutines::_safefetch32_continuation_pc);
4260     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4261                                                        &StubRoutines::_safefetchN_fault_pc,
4262                                                        &StubRoutines::_safefetchN_continuation_pc);
4263 #ifdef COMPILER2
4264     if (UseMultiplyToLenIntrinsic) {
4265       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4266     }
4267     if (UseSquareToLenIntrinsic) {
4268       StubRoutines::_squareToLen = generate_squareToLen();
4269     }
4270     if (UseMulAddIntrinsic) {
4271       StubRoutines::_mulAdd = generate_mulAdd();
4272     }
4273 


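With the three GHASH stubs registered above, acceleration is gated by the UseGHASHIntrinsics flag introduced by this change (on x86 it also requires carry-less multiply, CLMUL, support detected at VM startup). For illustration only, a build containing this change should list the flag via:

    java -XX:+PrintFlagsFinal -version | grep GHASH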