    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#else
    __ push(len_reg); // Save
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
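    // At this point xmm_key0..xmm_key10 hold the 11 round keys of the AES-128
    // schedule (expanded-key offsets 0x00-0xa0). The Java session key is an
    // int[] of big-endian words, so load_key runs each 128-bit round key
    // through key_shuffle_mask to byte-swap it into the order aesenc expects.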
    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);
3261
3262 // 128 bit code follows here
3263 __ movptr(pos, 0);
3264 __ align(OptoLoopAlignment);
3265
3266 __ BIND(L_loopTop_128);
3267 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3268 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3269 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3270 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3271 __ aesenc(xmm_result, as_XMMRegister(rnum));
3272 }
3273 __ aesenclast(xmm_result, xmm_key10);
3274 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3275 // no need to store r to memory until we exit
3276 __ addptr(pos, AESBlockSize);
3277 __ subptr(len_reg, AESBlockSize);
3278 __ jcc(Assembler::notEqual, L_loopTop_128);
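    // Reference model of one iteration of the loop above, with C[-1] = rvec:
    //   C[i] = AES_Encrypt(P[i] ^ C[i-1], K)
    // xmm_result carries C[i-1] across iterations, so the chain value only has
    // to be written back to memory once, at L_exit. The loop assumes the caller
    // passes a length that is a multiple of AESBlockSize, so the subptr above
    // hits exactly zero on the last block.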

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input

    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#else
    __ push(len_reg); // Save
#endif
    __ push(rbx);
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
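    // CBC decryption applies the round keys in the reverse of the encryption
    // order, and the Java-side expanded key is rotated so that the key needed
    // for aesdeclast sits at offset 0x00; hence the preload above starts at
    // 0x10 and wraps around to 0x00 for xmm_key_last.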

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
        __ aesdec(xmm_result, key_tmp);
      }

      __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 always came from key+0
      __ pxor(xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
      if (k != 2) {
        __ jmp(L_exit);
      }
    } //for 128/192/256
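    // Reference model of the single-block tail loop above:
    //   P[i] = AES_Decrypt(C[i], K) ^ C[i-1]
    // The incoming ciphertext block is captured in xmm_prev_block_cipher_save
    // before the rounds run, so it can still serve as the chain value for the
    // next block once xmm_result has been overwritten.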

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
    __ pop(rbx);
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

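  // 128-bit constant with all-ones in the most significant 32-bit word and
  // zeros elsewhere; the SHA-1 code uses it to isolate the upper word of a
  // 128-bit value.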
  address generate_upper_word_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
    address start = __ pc();
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
    return start;
  }

  address generate_shuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister abcd = xmm0;
    const XMMRegister e0 = xmm1;
    const XMMRegister e1 = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
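    // The Windows x64 ABI treats xmm6-xmm15 as callee-saved, so xmm6/xmm7 must
    // be spilled before fast_sha1 clobbers them; the System V ABI has no
    // callee-saved xmm registers, hence no save/restore on the other path.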

    __ subptr(rsp, 4 * wordSize);

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
                 buf, state, ofs, limit, rsp, multi_block);

    __ addptr(rsp, 4 * wordSize);
#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
#endif

    __ leave();
    __ ret(0);
    return start;
  }

  address generate_pshuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x0405060700010203, relocInfo::none);
    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
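    // pshufb computes result byte i = src byte mask[i]. Read low to high, the
    // mask bytes are 03 02 01 00 07 06 05 04 ..., so every aligned 4-byte group
    // is reversed: a big-endian/little-endian flip of each 32-bit word, as
    // needed when loading SHA-256 message words.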

    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
      __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
      // _SHUF_00BA
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();
#ifdef _WIN64
    // save the xmm registers which must be preserved 6-8
    __ subptr(rsp, 6 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
    __ movdqu(Address(rsp, 4 * wordSize), xmm8);

    if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) {
      __ subptr(rsp, 10 * wordSize);
      __ movdqu(Address(rsp, 0), xmm9);
      __ movdqu(Address(rsp, 2 * wordSize), xmm10);
      __ movdqu(Address(rsp, 4 * wordSize), xmm11);
      __ movdqu(Address(rsp, 6 * wordSize), xmm12);
      __ movdqu(Address(rsp, 8 * wordSize), xmm13);
    }
#endif

    __ subptr(rsp, 4 * wordSize);

    if (VM_Version::supports_sha()) {
      __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    } else if (VM_Version::supports_avx2()) {
      __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    }
    __ addptr(rsp, 4 * wordSize);
#ifdef _WIN64
    // restore xmm regs belonging to calling function
    if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) {
      __ movdqu(xmm9, Address(rsp, 0));
      __ movdqu(xmm10, Address(rsp, 2 * wordSize));
      __ movdqu(xmm11, Address(rsp, 4 * wordSize));
      __ movdqu(xmm12, Address(rsp, 6 * wordSize));
      __ movdqu(xmm13, Address(rsp, 8 * wordSize));
      __ addptr(rsp, 10 * wordSize);
    }
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ movdqu(xmm8, Address(rsp, 4 * wordSize));
    __ addptr(rsp, 6 * wordSize);
#endif
    __ leave();
    __ ret(0);
    return start;
  }

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   Linux
  //     c_rarg4   -          input length
  //     c_rarg5   -          saved encryptedCounter start
  //     rbp + 6 * wordSize - saved used length
  //   Windows
    Label L_multiBlock_loopTop[3];
    Label L_singleBlockLoopTop[3];
    Label L__incCounter[3][6]; //for 6 blocks
    Label L__incCounter_single[3]; //for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
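    // Index k in the label arrays selects the key size (0: AES-128, 1: AES-192,
    // 2: AES-256); the processTail labels handle the final partial block, which
    // is moved in 8-, 4-, 2- and 1-byte pieces with pinsr/pextr instructions.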

    Label L_exit;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-14
    const int XMM_REG_NUM_KEY_LAST = 14;
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }

    const Address r13_save(rbp, rdi_off * wordSize);
    const Address r14_save(rbp, rsi_off * wordSize);

    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);

    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg); // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif

    __ push(rbx); // Save RBX
    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
    __ movptr(pos, 0);

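    // The counter is kept byte-swapped in xmm_curr_counter so it can be
    // incremented with ordinary integer SIMD arithmetic; it is shuffled back to
    // its original byte order before being stored at L_exit.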
    // Use the partially used encrypted counter from last invocation
    __ BIND(L_preLoop_start);
      __ jcc(Assembler::zero, L_processTail_1_extr[k]);
      __ pextrw(Address(to, pos), xmm_result0, 0);
      __ psrldq(xmm_result0, 2);
      __ addptr(pos, 2);
      __ BIND(L_processTail_1_extr[k]);
      __ testptr(len_reg, 1);
      __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
      __ pextrb(Address(to, pos), xmm_result0, 0);

      __ BIND(L_processTail_exit_extr[k]);
      __ movl(Address(used_addr, 0), len_reg);
      __ jmp(L_exit);

    }

    __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
    __ pop(rbx); // pop the saved RBX.
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
    __ movl(rax, len_mem);
    __ movptr(r13, r13_save);
    __ movptr(r14, r14_save);
#else
    __ pop(rax); // return 'len'
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // byte swap x86 long
  address generate_ghash_long_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
    address start = __ pc();
    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
    __ emit_data64(0x0706050403020100, relocInfo::none );
    return start;
  }

  // byte swap x86 byte array
  address generate_ghash_byte_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
    __ emit_data64(0x0001020304050607, relocInfo::none );
    return start;
  }

  /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    const Register state = c_rarg0;
    const Register subkeyH = c_rarg1;
    const Register data = c_rarg2;
    const Register blocks = c_rarg3;

#ifdef _WIN64
    const int XMM_REG_LAST = 10;
#endif

    const XMMRegister xmm_temp0  = xmm0;
    const XMMRegister xmm_temp1  = xmm1;
    const XMMRegister xmm_temp2  = xmm2;
    const XMMRegister xmm_temp3  = xmm3;
    const XMMRegister xmm_temp4  = xmm4;
    const XMMRegister xmm_temp5  = xmm5;
    const XMMRegister xmm_temp6  = xmm6;
    const XMMRegister xmm_temp7  = xmm7;
    const XMMRegister xmm_temp8  = xmm8;
    const XMMRegister xmm_temp9  = xmm9;
    const XMMRegister xmm_temp10 = xmm10;

    __ enter();

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-10
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, xmm_temp10);


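    // GHASH folds each 16-byte block X into the running state:
    //   state = (state ^ X) * H   in GF(2^128), reduction polynomial
    //   x^128 + x^7 + x^2 + x + 1.
    // The swap masks byte-reflect the operands first; the 128x128-bit
    // carry-less product is then assembled from 64x64-bit pclmulqdq partial
    // products and reduced back to 128 bits.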
    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, xmm_temp10);

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key
    //
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);      // packed right shift >> 1
    __ psrld(xmm_temp4, 2);      // packed right shift >> 2
    __ psrld(xmm_temp5, 7);      // packed right shift >> 7
    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp8);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
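    // The psrld/pxor sequence above is the tail of a shift-and-xor modular
    // reduction by x^128 + x^7 + x^2 + x + 1, folding the high half of the
    // 256-bit product back into the low 128 bits.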

    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
    __ pshufb(xmm_temp6, xmm_temp10);      // Byte swap 16-byte result
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ leave();
    __ ret(0);
    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *       rax   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

    __ align(CodeEntryAlignment);
  }

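  // The generate_libm* stubs below wrap the assembler math routines
  // (fast_exp, fast_log, ...) behind the java.lang.Math intrinsics. The
  // floating-point arguments arrive in the leading xmm registers, the result
  // is returned in xmm0, and the remaining registers named in each stub are
  // scratch.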
  address generate_libmExp() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmLog() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r11;
    const Register tmp2 = r8;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmLog10() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmPow() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmSin() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmCos() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmTan() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
    // save the xmm registers which must be preserved 6-7
    __ subptr(rsp, 4 * wordSize);
    __ movdqu(Address(rsp, 0), xmm6);
    __ movdqu(Address(rsp, 2 * wordSize), xmm7);
#endif
    __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    __ movdqu(xmm6, Address(rsp, 0));
    __ movdqu(xmm7, Address(rsp, 2 * wordSize));
    __ addptr(rsp, 4 * wordSize);
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
|
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input


    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
#else
    __ push(len_reg); // Save
#endif
    __ push(rbx);
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
        __ aesdec(xmm_result, key_tmp);
      }

      __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 always came from key+0
      __ pxor(xmm_result, xmm_prev_block_cipher);       // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
      if (k != 2) {
        __ jmp(L_exit);
      }
    } //for 128/192/256

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
    __ pop(rbx);
#ifdef _WIN64
    __ movl(rax, len_mem);
#else
    __ pop(rax); // return length
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_upper_word_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
    address start = __ pc();
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
    return start;
  }

  address generate_shuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister abcd = xmm0;
    const XMMRegister e0 = xmm1;
    const XMMRegister e1 = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
                 buf, state, ofs, limit, rsp, multi_block);

    __ addptr(rsp, 4 * wordSize);

    __ leave();
    __ ret(0);
    return start;
  }

  address generate_pshuffle_byte_flip_mask() {
    __ align(64);
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data64(0x0405060700010203, relocInfo::none);
    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);

    if (VM_Version::supports_avx2()) {
      __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
      __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
      // _SHUF_00BA
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
      __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
      __ emit_data64(0x0b0a090803020100, relocInfo::none);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    const XMMRegister shuf_mask = xmm8;

    __ enter();

    __ subptr(rsp, 4 * wordSize);

    if (VM_Version::supports_sha()) {
      __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    } else if (VM_Version::supports_avx2()) {
      __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
                     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
    }
    __ addptr(rsp, 4 * wordSize);

    __ leave();
    __ ret(0);
    return start;
  }

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   Linux
  //     c_rarg4   -          input length
  //     c_rarg5   -          saved encryptedCounter start
  //     rbp + 6 * wordSize - saved used length
  //   Windows
    Label L_multiBlock_loopTop[3];
    Label L_singleBlockLoopTop[3];
    Label L__incCounter[3][6]; //for 6 blocks
    Label L__incCounter_single[3]; //for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

    Label L_exit;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // allocate spill slots for r13, r14
    enum {
      saved_r13_offset,
      saved_r14_offset
    };
    __ subptr(rsp, 2 * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
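    // r13 and r14 are callee-saved in both the Windows and System V ABIs; the
    // Windows path presumably needs them as extra scratch registers, so they
    // are spilled to the two freshly allocated stack slots first.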

    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg); // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif

    __ push(rbx); // Save RBX
    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
    __ movptr(pos, 0);

    // Use the partially used encrypted counter from last invocation
    __ BIND(L_preLoop_start);
      __ jcc(Assembler::zero, L_processTail_1_extr[k]);
      __ pextrw(Address(to, pos), xmm_result0, 0);
      __ psrldq(xmm_result0, 2);
      __ addptr(pos, 2);
      __ BIND(L_processTail_1_extr[k]);
      __ testptr(len_reg, 1);
      __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
      __ pextrb(Address(to, pos), xmm_result0, 0);

      __ BIND(L_processTail_exit_extr[k]);
      __ movl(Address(used_addr, 0), len_reg);
      __ jmp(L_exit);

    }

    __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
    __ pop(rbx); // pop the saved RBX.
#ifdef _WIN64
    __ movl(rax, len_mem);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    __ addptr(rsp, 2 * wordSize);
#else
    __ pop(rax); // return 'len'
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // byte swap x86 long
  address generate_ghash_long_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
    address start = __ pc();
    __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
    __ emit_data64(0x0706050403020100, relocInfo::none );
    return start;
  }

  // byte swap x86 byte array
  address generate_ghash_byte_swap_mask() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
    __ emit_data64(0x0001020304050607, relocInfo::none );
    return start;
  }

  /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    const Register state = c_rarg0;
    const Register subkeyH = c_rarg1;
    const Register data = c_rarg2;
    const Register blocks = c_rarg3;

    const XMMRegister xmm_temp0  = xmm0;
    const XMMRegister xmm_temp1  = xmm1;
    const XMMRegister xmm_temp2  = xmm2;
    const XMMRegister xmm_temp3  = xmm3;
    const XMMRegister xmm_temp4  = xmm4;
    const XMMRegister xmm_temp5  = xmm5;
    const XMMRegister xmm_temp6  = xmm6;
    const XMMRegister xmm_temp7  = xmm7;
    const XMMRegister xmm_temp8  = xmm8;
    const XMMRegister xmm_temp9  = xmm9;
    const XMMRegister xmm_temp10 = xmm10;

    __ enter();

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      __ movl(rax, 0xffff);
      __ kmovql(k1, rax);
    }

    __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, xmm_temp10);


    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, xmm_temp10);

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key
    //
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);      // packed right shift >> 1
    __ psrld(xmm_temp4, 2);      // packed right shift >> 2
    __ psrld(xmm_temp5, 7);      // packed right shift >> 7
    __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp8);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6

    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
    __ pshufb(xmm_temp6, xmm_temp10);      // Byte swap 16-byte result
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result

    __ leave();
    __ ret(0);
    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *       rax   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

    __ align(CodeEntryAlignment);
  }

  address generate_libmExp() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmLog() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r11;
    const Register tmp2 = r8;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmLog10() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmPow() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmSin() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmCos() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

  address generate_libmTan() {
    address start = __ pc();

    const XMMRegister x0 = xmm0;
    const XMMRegister x1 = xmm1;
    const XMMRegister x2 = xmm2;
    const XMMRegister x3 = xmm3;

    const XMMRegister x4 = xmm4;
    const XMMRegister x5 = xmm5;
    const XMMRegister x6 = xmm6;
    const XMMRegister x7 = xmm7;

    const Register tmp1 = r8;
    const Register tmp2 = r9;
    const Register tmp3 = r10;
    const Register tmp4 = r11;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    __ push(rsi);
    __ push(rdi);
#endif
    __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);

#ifdef _WIN64
    __ pop(rdi);
    __ pop(rsi);
#endif

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;

  }

#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
|