
src/cpu/x86/vm/stubGenerator_x86_64.cpp



--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp

3219     const int XMM_REG_NUM_KEY_LAST  = 15;
3220     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3221     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3222     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3223     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3224     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3225 
3226     __ enter(); // required for proper stackwalking of RuntimeStub frame
3227 
3228     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3229     // merge context for the registers used, since all instructions below operate in
3230     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3231     if (VM_Version::supports_avx512vlbw()) {
3232       __ movl(rax, 0xffff);
3233       __ kmovql(k1, rax);
3234     }
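
For context on the k1 setup above: with AVX-512VL and AVX-512BW, EVEX-encoded byte operations on an xmm register take a 16-bit lane mask, so 0xffff enables all 16 byte lanes and makes the masked forms behave like their legacy 128-bit counterparts. A minimal standalone illustration using C++ intrinsics (the function name is mine, not part of the stub):

    #include <immintrin.h>

    // With AVX-512VL+BW, a __mmask16 covers the 16 byte lanes of an xmm
    // register; k = 0xffff selects every lane, so this zero-masking load is
    // equivalent to an ordinary unmasked 128-bit load.
    static __m128i load_all_lanes(const void* p) {
      const __mmask16 k = 0xffff;          // one bit per byte lane (VL = 128)
      return _mm_maskz_loadu_epi8(k, p);
    }
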
3235 
3236 #ifdef _WIN64
3237     // on win64, fill len_reg from stack position
3238     __ movl(len_reg, len_mem);
3239     // save the xmm registers which must be preserved 6-15
3240     __ subptr(rsp, -rsp_after_call_off * wordSize);
3241     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3242       __ movdqu(xmm_save(i), as_XMMRegister(i));
3243     }
3244 #else
3245     __ push(len_reg); // Save
3246 #endif
3247 
3248     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3249     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3250     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3251     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3252       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3253       offset += 0x10;
3254     }
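
The load_key helper above reads one 16-byte round key and runs it through the key shuffle mask. Judging by the mask generator elsewhere in this file, that mask byte-reverses each 4-byte word, converting Java's big-endian int[] key schedule into the byte order AES-NI expects; a hedged standalone sketch (the helper name and the mask value here are assumptions):

    #include <tmmintrin.h>   // SSSE3 _mm_shuffle_epi8

    // Sketch of load_key: reverse the bytes of each 32-bit word so Java's
    // big-endian int[] expanded key matches AES-NI's little-endian lanes.
    static __m128i load_round_key(const void* kle_base, int byte_offset) {
      const __m128i shuf =   // per-dword byte reverse (assumed shuffle mask)
          _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
      __m128i k = _mm_loadu_si128(
          (const __m128i*)((const char*)kle_base + byte_offset));
      return _mm_shuffle_epi8(k, shuf);
    }
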
3255     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3256 
3257     // now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 44=128, 52=192, 60=256)
3258     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3259     __ cmpl(rax, 44);
3260     __ jcc(Assembler::notEqual, L_key_192_256);
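
The dispatch above keys off the expanded-key length: AES expands a key into 4 * (rounds + 1) 32-bit words, and 128/192/256-bit keys use 10/12/14 rounds. A quick check of the 44/52/60 comparisons:

    // Expanded AES key sizes, in ints, matching the comparisons above.
    static_assert(4 * (10 + 1) == 44, "AES-128: 10 rounds, 44 ints");
    static_assert(4 * (12 + 1) == 52, "AES-192: 12 rounds, 52 ints");
    static_assert(4 * (14 + 1) == 60, "AES-256: 14 rounds, 60 ints");
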
3261 
3262     // 128-bit code follows here
3263     __ movptr(pos, 0);
3264     __ align(OptoLoopAlignment);
3265 
3266     __ BIND(L_loopTop_128);
3267     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3268     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3269     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3270     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3271       __ aesenc(xmm_result, as_XMMRegister(rnum));
3272     }
3273     __ aesenclast(xmm_result, xmm_key10);
3274     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3275     // no need to store r to memory until we exit
3276     __ addptr(pos, AESBlockSize);
3277     __ subptr(len_reg, AESBlockSize);
3278     __ jcc(Assembler::notEqual, L_loopTop_128);
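
The loop above is one-block-at-a-time CBC encryption; the chaining dependency makes CBC encryption inherently serial, which is why only the decrypt stub later in this file is parallelized. A minimal C++ sketch of the same 128-bit path using AES-NI intrinsics (function and parameter names are mine; rk[] is assumed to hold the already-shuffled round keys):

    #include <cstddef>
    #include <wmmintrin.h>   // AES-NI intrinsics

    // AES-128 CBC encrypt; len assumed a positive multiple of 16 bytes.
    static void cbc_encrypt128(const __m128i rk[11], __m128i* rvec,
                               const char* from, char* to, size_t len) {
      __m128i r = *rvec;                                // current r vector
      for (size_t pos = 0; pos < len; pos += 16) {
        __m128i blk = _mm_loadu_si128((const __m128i*)(from + pos));
        r = _mm_xor_si128(r, blk);                      // xor with r vector
        r = _mm_xor_si128(r, rk[0]);                    // round 0 key
        for (int i = 1; i <= 9; i++)
          r = _mm_aesenc_si128(r, rk[i]);               // rounds 1-9
        r = _mm_aesenclast_si128(r, rk[10]);            // final round
        _mm_storeu_si128((__m128i*)(to + pos), r);      // next 16 output bytes
      }
      *rvec = r;    // final r is written back, as at L_exit above
    }
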
3279 
3280     __ BIND(L_exit);
3281     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3282 
3283 #ifdef _WIN64
3284     // restore xmm regs belonging to calling function
3285     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3286       __ movdqu(as_XMMRegister(i), xmm_save(i));
3287     }
3288     __ movl(rax, len_mem);
3289 #else
3290     __ pop(rax); // return length
3291 #endif
3292     __ leave(); // required for proper stackwalking of RuntimeStub frame
3293     __ ret(0);
3294 
3295     __ BIND(L_key_192_256);
3296     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3297     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3298     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3299     __ cmpl(rax, 52);
3300     __ jcc(Assembler::notEqual, L_key_256);
3301 
3302     // 192-bit code follows here (could be changed to use more xmm registers)
3303     __ movptr(pos, 0);
3304     __ align(OptoLoopAlignment);
3305 
3306     __ BIND(L_loopTop_192);
3307     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input


3429 
3430     // keys 0-10 preloaded into xmm5-xmm15
3431     const int XMM_REG_NUM_KEY_FIRST = 5;
3432     const int XMM_REG_NUM_KEY_LAST  = 15;
3433     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3434     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3435 
3436     __ enter(); // required for proper stackwalking of RuntimeStub frame
3437 
3438     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3439     // merge context for the registers used, since all instructions below operate in
3440     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3441     if (VM_Version::supports_avx512vlbw()) {
3442       __ movl(rax, 0xffff);
3443       __ kmovql(k1, rax);
3444     }
3445 
3446 #ifdef _WIN64
3447     // on win64, fill len_reg from stack position
3448     __ movl(len_reg, len_mem);
3449     // save the xmm registers which must be preserved 6-15
3450     __ subptr(rsp, -rsp_after_call_off * wordSize);
3451     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3452       __ movdqu(xmm_save(i), as_XMMRegister(i));
3453     }
3454 #else
3455     __ push(len_reg); // Save
3456 #endif
3457     __ push(rbx);
3458     // the java expanded key ordering is rotated one position from what we want
3459     // so we start from 0x10 here and hit 0x00 last
3460     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3461     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3462     // load up xmm regs 5 thru 15 with the keys at offsets 0x10 - 0xa0, then 0x00 last
3463     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3464       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3465       offset += 0x10;
3466     }
3467     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
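
Because the Java expanded decryption key schedule is rotated one slot (per the comment above), the stub consumes the key at offset 0x10 first and saves the key at offset 0x00 for aesdeclast. A hedged sketch of the resulting AES-128 round structure for one block (names are mine; kle[] is assumed to be Java's 11-entry decryption schedule):

    #include <wmmintrin.h>

    // One block of CBC decrypt with the rotated key ordering above: the
    // first xor uses kle[1] (offset 0x10), aesdeclast uses kle[0] (0x00).
    static __m128i cbc_decrypt_block128(const __m128i kle[11], __m128i cipher,
                                        __m128i prev_cipher) {
      __m128i r = _mm_xor_si128(cipher, kle[1]);   // key from offset 0x10
      for (int i = 2; i <= 10; i++)
        r = _mm_aesdec_si128(r, kle[i]);           // keys 0x20 .. 0xa0
      r = _mm_aesdeclast_si128(r, kle[0]);         // key from offset 0x00
      return _mm_xor_si128(r, prev_cipher);        // xor previous ciphertext
    }
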
3468 
3469     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3470 
3471     // registers holding the four results in the parallelized loop
3472     const XMMRegister xmm_result0 = xmm0;
3473     const XMMRegister xmm_result1 = xmm2;


3627         __ aesdec(xmm_result, key_tmp);
3628       }
3629 
3630       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3631       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3632       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3633       // no need to store r to memory until we exit
3634       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3635       __ addptr(pos, AESBlockSize);
3636       __ subptr(len_reg, AESBlockSize);
3637       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3638       if (k != 2) {
3639         __ jmp(L_exit);
3640       }
3641     } //for 128/192/256
3642 
3643     __ BIND(L_exit);
3644     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3645     __ pop(rbx);
3646 #ifdef _WIN64
3647     // restore regs belonging to calling function
3648     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3649       __ movdqu(as_XMMRegister(i), xmm_save(i));
3650     }
3651     __ movl(rax, len_mem);
3652 #else
3653     __ pop(rax); // return length
3654 #endif
3655     __ leave(); // required for proper stackwalking of RuntimeStub frame
3656     __ ret(0);
3657     return start;
3658   }
3659 
3660   address generate_upper_word_mask() {
3661     __ align(64);
3662     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3663     address start = __ pc();
3664     __ emit_data64(0x0000000000000000, relocInfo::none);
3665     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3666     return start;
3667   }
3668 
3669   address generate_shuffle_byte_flip_mask() {
3670     __ align(64);


3682     StubCodeMark mark(this, "StubRoutines", name);
3683     address start = __ pc();
3684 
3685     Register buf = c_rarg0;
3686     Register state = c_rarg1;
3687     Register ofs = c_rarg2;
3688     Register limit = c_rarg3;
3689 
3690     const XMMRegister abcd = xmm0;
3691     const XMMRegister e0 = xmm1;
3692     const XMMRegister e1 = xmm2;
3693     const XMMRegister msg0 = xmm3;
3694 
3695     const XMMRegister msg1 = xmm4;
3696     const XMMRegister msg2 = xmm5;
3697     const XMMRegister msg3 = xmm6;
3698     const XMMRegister shuf_mask = xmm7;
3699 
3700     __ enter();
3701 
3702 #ifdef _WIN64
3703     // save the xmm registers which must be preserved 6-7
3704     __ subptr(rsp, 4 * wordSize);
3705     __ movdqu(Address(rsp, 0), xmm6);
3706     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
3707 #endif
3708 
3709     __ subptr(rsp, 4 * wordSize);
3710 
3711     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3712       buf, state, ofs, limit, rsp, multi_block);
3713 
3714     __ addptr(rsp, 4 * wordSize);
3715 #ifdef _WIN64
3716     // restore xmm regs belonging to calling function
3717     __ movdqu(xmm6, Address(rsp, 0));
3718     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
3719     __ addptr(rsp, 4 * wordSize);
3720 #endif
3721 
3722     __ leave();
3723     __ ret(0);
3724     return start;
3725   }
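
Note on the stack handling in this stub: the 4 * wordSize block reserved just before the call is handed to fast_sha1 through its rsp argument as scratch space, and released immediately after the call returns.
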
3726 
3727   address generate_pshuffle_byte_flip_mask() {
3728     __ align(64);
3729     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3730     address start = __ pc();
3731     __ emit_data64(0x0405060700010203, relocInfo::none);
3732     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3733 
3734     if (VM_Version::supports_avx2()) {
3735       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3736       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3737       // _SHUF_00BA
3738       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3739       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3740       __ emit_data64(0x0b0a090803020100, relocInfo::none);


3758     address start = __ pc();
3759 
3760     Register buf = c_rarg0;
3761     Register state = c_rarg1;
3762     Register ofs = c_rarg2;
3763     Register limit = c_rarg3;
3764 
3765     const XMMRegister msg = xmm0;
3766     const XMMRegister state0 = xmm1;
3767     const XMMRegister state1 = xmm2;
3768     const XMMRegister msgtmp0 = xmm3;
3769 
3770     const XMMRegister msgtmp1 = xmm4;
3771     const XMMRegister msgtmp2 = xmm5;
3772     const XMMRegister msgtmp3 = xmm6;
3773     const XMMRegister msgtmp4 = xmm7;
3774 
3775     const XMMRegister shuf_mask = xmm8;
3776 
3777     __ enter();
3778 #ifdef _WIN64
3779     // save the xmm registers which must be preserved 6-7
3780     __ subptr(rsp, 6 * wordSize);
3781     __ movdqu(Address(rsp, 0), xmm6);
3782     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
3783     __ movdqu(Address(rsp, 4 * wordSize), xmm8);
3784 
3785     if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) {
3786       __ subptr(rsp, 10 * wordSize);
3787       __ movdqu(Address(rsp, 0), xmm9);
3788       __ movdqu(Address(rsp, 2 * wordSize), xmm10);
3789       __ movdqu(Address(rsp, 4 * wordSize), xmm11);
3790       __ movdqu(Address(rsp, 6 * wordSize), xmm12);
3791       __ movdqu(Address(rsp, 8 * wordSize), xmm13);
3792     }
3793 #endif
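
The spill sizes above follow from wordSize being 8 on x86_64 while each xmm register occupies 16 bytes:

    // Spill-area arithmetic for the Win64 saves above (wordSize == 8).
    static_assert(3 * 16 == 6 * 8,  "xmm6-xmm8 fit in 6 * wordSize bytes");
    static_assert(5 * 16 == 10 * 8, "xmm9-xmm13 fit in 10 * wordSize bytes");
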
3794 
3795     __ subptr(rsp, 4 * wordSize);
3796 
3797     if (VM_Version::supports_sha()) {
3798       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3799         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3800     } else if (VM_Version::supports_avx2()) {
3801       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3802         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3803     }
3804     __ addptr(rsp, 4 * wordSize);
3805 #ifdef _WIN64
3806     // restore xmm regs belonging to calling function
3807     if (!VM_Version::supports_sha() && VM_Version::supports_avx2()) {
3808       __ movdqu(xmm9, Address(rsp, 0));
3809       __ movdqu(xmm10, Address(rsp, 2 * wordSize));
3810       __ movdqu(xmm11, Address(rsp, 4 * wordSize));
3811       __ movdqu(xmm12, Address(rsp, 6 * wordSize));
3812       __ movdqu(xmm13, Address(rsp, 8 * wordSize));
3813       __ addptr(rsp, 10 * wordSize);
3814     }
3815     __ movdqu(xmm6, Address(rsp, 0));
3816     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
3817     __ movdqu(xmm8, Address(rsp, 4 * wordSize));
3818     __ addptr(rsp, 6 * wordSize);
3819 #endif
3820     __ leave();
3821     __ ret(0);
3822     return start;
3823   }
3824 
3825   // This is a version of CTR/AES crypt which does 6 blocks at a time in a loop
3826   // to hide instruction latency
3827   //
3828   // Arguments:
3829   //
3830   // Inputs:
3831   //   c_rarg0   - source byte array address
3832   //   c_rarg1   - destination byte array address
3833   //   c_rarg2   - K (key) in little endian int array
3834   //   c_rarg3   - counter vector byte array address
3835   //   Linux
3836   //     c_rarg4   -          input length
3837   //     c_rarg5   -          saved encryptedCounter start
3838   //     rbp + 6 * wordSize - saved used length
3839   //   Windows


3900     Label L_multiBlock_loopTop[3];
3901     Label L_singleBlockLoopTop[3];
3902     Label L__incCounter[3][6]; //for 6 blocks
3903     Label L__incCounter_single[3]; //for single block, key128, key192, key256
3904     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3905     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3906 
3907     Label L_exit;
3908 
3909     __ enter(); // required for proper stackwalking of RuntimeStub frame
3910 
3911     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3912     // merge context for the registers used, since all instructions below operate in
3913     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3914     if (VM_Version::supports_avx512vlbw()) {
3915       __ movl(rax, 0xffff);
3916       __ kmovql(k1, rax);
3917     }
3918 
3919 #ifdef _WIN64
3920     // save the xmm registers which must be preserved 6-14
3921     const int XMM_REG_NUM_KEY_LAST = 14;
3922     __ subptr(rsp, -rsp_after_call_off * wordSize);
3923     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3924       __ movdqu(xmm_save(i), as_XMMRegister(i));
3925     }
3926 
3927     const Address r13_save(rbp, rdi_off * wordSize);
3928     const Address r14_save(rbp, rsi_off * wordSize);
3929 
3930     __ movptr(r13_save, r13);
3931     __ movptr(r14_save, r14);
3932 
3933     // on win64, fill len_reg from stack position
3934     __ movl(len_reg, len_mem);
3935     __ movptr(saved_encCounter_start, saved_encCounter_mem);
3936     __ movptr(used_addr, used_mem);
3937     __ movl(used, Address(used_addr, 0));
3938 #else
3939     __ push(len_reg); // Save
3940     __ movptr(used_addr, used_mem);
3941     __ movl(used, Address(used_addr, 0));
3942 #endif
3943 
3944     __ push(rbx); // Save RBX
3945     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3946     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3947     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
3948     __ movptr(pos, 0);
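
The counter shuffle above exists because the AES-CTR counter block is big-endian while the increment logic uses ordinary little-endian integer adds: the counter is byte-reversed once on entry and reversed back at L_exit before being stored. A small standalone illustration, assuming the shuffle mask is the full 16-byte reverse emitted as the byte-swap mask later in this file (the function name is mine):

    #include <tmmintrin.h>

    // Reverse all 16 bytes of the counter block so it can be incremented
    // with normal integer arithmetic; applying it twice is the identity.
    static __m128i byteswap128(__m128i v) {
      const __m128i m = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
      return _mm_shuffle_epi8(v, m);
    }
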
3949 
3950     // Use the partially used encrypted counter from the last invocation
3951     __ BIND(L_preLoop_start);


4113         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4114           __ pextrw(Address(to, pos), xmm_result0, 0);
4115           __ psrldq(xmm_result0, 2);
4116           __ addptr(pos, 2);
4117         __ BIND(L_processTail_1_extr[k]);
4118         __ testptr(len_reg, 1);
4119         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4120           __ pextrb(Address(to, pos), xmm_result0, 0);
4121 
4122         __ BIND(L_processTail_exit_extr[k]);
4123         __ movl(Address(used_addr, 0), len_reg);
4124         __ jmp(L_exit);
4125 
4126     }
4127 
4128     __ BIND(L_exit);
4129     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4130     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4131     __ pop(rbx); // pop the saved RBX.
4132 #ifdef _WIN64
4133     // restore regs belonging to calling function
4134     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
4135       __ movdqu(as_XMMRegister(i), xmm_save(i));
4136     }
4137     __ movl(rax, len_mem);
4138     __ movptr(r13, r13_save);
4139     __ movptr(r14, r14_save);

4140 #else
4141     __ pop(rax); // return 'len'
4142 #endif
4143     __ leave(); // required for proper stackwalking of RuntimeStub frame
4144     __ ret(0);
4145     return start;
4146   }
4147 
4148   // byte swap x86 long
4149   address generate_ghash_long_swap_mask() {
4150     __ align(CodeEntryAlignment);
4151     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4152     address start = __ pc();
4153     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4154     __ emit_data64(0x0706050403020100, relocInfo::none );
4155     return start;
4156   }
4157 
4158   // byte swap x86 byte array
4159   address generate_ghash_byte_swap_mask() {
4160     __ align(CodeEntryAlignment);
4161     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4162     address start = __ pc();
4163     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4164     __ emit_data64(0x0001020304050607, relocInfo::none );
4165     return start;
4166   }
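
A worked check of the two masks just emitted: emit_data64 stores little-endian, and pshufb fills destination lane i from source byte mask[i], so the long-swap mask exchanges the two 8-byte halves while the byte-swap mask reverses all 16 bytes. Standalone demo (names are mine):

    #include <cstdio>
    #include <tmmintrin.h>

    int main() {
      unsigned char src[16], out[16];
      for (int i = 0; i < 16; i++) src[i] = (unsigned char)i;
      const __m128i v = _mm_loadu_si128((const __m128i*)src);
      const __m128i long_swap =   // memory bytes 8..15,0..7, as emitted above
          _mm_set_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8);
      const __m128i byte_swap =   // memory bytes 15..0
          _mm_set_epi8(0,1,2,3,4,5,6,7, 8,9,10,11,12,13,14,15);
      _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(v, long_swap));
      std::printf("long_swap[0] = %d\n", out[0]);   // prints 8
      _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(v, byte_swap));
      std::printf("byte_swap[0] = %d\n", out[0]);   // prints 15
      return 0;
    }
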
4167 
4168   /* Single and multi-block ghash operations */
4169   address generate_ghash_processBlocks() {
4170     __ align(CodeEntryAlignment);
4171     Label L_ghash_loop, L_exit;
4172     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4173     address start = __ pc();
4174 
4175     const Register state        = c_rarg0;
4176     const Register subkeyH      = c_rarg1;
4177     const Register data         = c_rarg2;
4178     const Register blocks       = c_rarg3;
4179 
4180 #ifdef _WIN64
4181     const int XMM_REG_LAST  = 10;
4182 #endif
4183 
4184     const XMMRegister xmm_temp0 = xmm0;
4185     const XMMRegister xmm_temp1 = xmm1;
4186     const XMMRegister xmm_temp2 = xmm2;
4187     const XMMRegister xmm_temp3 = xmm3;
4188     const XMMRegister xmm_temp4 = xmm4;
4189     const XMMRegister xmm_temp5 = xmm5;
4190     const XMMRegister xmm_temp6 = xmm6;
4191     const XMMRegister xmm_temp7 = xmm7;
4192     const XMMRegister xmm_temp8 = xmm8;
4193     const XMMRegister xmm_temp9 = xmm9;
4194     const XMMRegister xmm_temp10 = xmm10;
4195 
4196     __ enter();
4197 
4198     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
4199     // merge context for the registers used, since all instructions below operate in
4200     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
4201     if (VM_Version::supports_avx512vlbw()) {
4202       __ movl(rax, 0xffff);
4203       __ kmovql(k1, rax);
4204     }
4205 
4206 #ifdef _WIN64
4207     // save the xmm registers which must be preserved 6-10
4208     __ subptr(rsp, -rsp_after_call_off * wordSize);
4209     for (int i = 6; i <= XMM_REG_LAST; i++) {
4210       __ movdqu(xmm_save(i), as_XMMRegister(i));
4211     }
4212 #endif
4213 
4214     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4215 
4216     __ movdqu(xmm_temp0, Address(state, 0));
4217     __ pshufb(xmm_temp0, xmm_temp10);
4218 
4219 
4220     __ BIND(L_ghash_loop);
4221     __ movdqu(xmm_temp2, Address(data, 0));
4222     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4223 
4224     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4225     __ pshufb(xmm_temp1, xmm_temp10);
4226 
4227     __ pxor(xmm_temp0, xmm_temp2);
4228 
4229     //
4230     // Multiply with the hash key
4231     //
4232     __ movdqu(xmm_temp3, xmm_temp0);
4233     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0


4293     __ movdqu(xmm_temp5, xmm_temp3);
4294     __ psrld(xmm_temp2, 1);     // packed right shift >> 1
4295     __ psrld(xmm_temp4, 2);     // packed right shift >> 2
4296     __ psrld(xmm_temp5, 7);     // packed right shift >> 7
4297     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4298     __ pxor(xmm_temp2, xmm_temp5);
4299     __ pxor(xmm_temp2, xmm_temp8);
4300     __ pxor(xmm_temp3, xmm_temp2);
4301     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4302 
4303     __ decrement(blocks);
4304     __ jcc(Assembler::zero, L_exit);
4305     __ movdqu(xmm_temp0, xmm_temp6);
4306     __ addptr(data, 16);
4307     __ jmp(L_ghash_loop);
4308 
4309     __ BIND(L_exit);
4310     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4311     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4312 
4313 #ifdef _WIN64
4314     // restore xmm regs belonging to calling function
4315     for (int i = 6; i <= XMM_REG_LAST; i++) {
4316       __ movdqu(as_XMMRegister(i), xmm_save(i));
4317     }
4318 #endif
4319     __ leave();
4320     __ ret(0);
4321     return start;
4322   }
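
For reference, each iteration of the loop above computes state = (state ^ block) * H in GF(2^128), the GHASH step of GCM: the four pclmulqdq products form the 256-bit schoolbook product, and the shift/xor sequence before L_exit reduces it modulo x^128 + x^7 + x^2 + x + 1. A hedged sketch of just the schoolbook step using the same carry-less multiply (the helper name is mine):

    #include <wmmintrin.h>   // _mm_clmulepi64_si128 (PCLMULQDQ)

    // 128 x 128 -> 256-bit carry-less multiply: the four-product schoolbook
    // step performed above, prior to the polynomial reduction.
    static void clmul256(__m128i a, __m128i b, __m128i* lo, __m128i* hi) {
      __m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00);
      __m128i a0b1 = _mm_clmulepi64_si128(a, b, 0x10);
      __m128i a1b0 = _mm_clmulepi64_si128(a, b, 0x01);
      __m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11);
      __m128i mid  = _mm_xor_si128(a0b1, a1b0);           // middle terms
      *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));  // low 128 bits
      *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));  // high 128 bits
    }
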
4323 
4324   /**
4325    *  Arguments:
4326    *
4327    * Inputs:
4328    *   c_rarg0   - int crc
4329    *   c_rarg1   - byte* buf
4330    *   c_rarg2   - int length
4331    *
4332    * Output:
4333    *       rax   - int crc result
4334    */
4335   address generate_updateBytesCRC32() {
4336     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4337 
4338     __ align(CodeEntryAlignment);


4635   }
4636 
4637   address generate_libmExp() {
4638     address start = __ pc();
4639 
4640     const XMMRegister x0  = xmm0;
4641     const XMMRegister x1  = xmm1;
4642     const XMMRegister x2  = xmm2;
4643     const XMMRegister x3  = xmm3;
4644 
4645     const XMMRegister x4  = xmm4;
4646     const XMMRegister x5  = xmm5;
4647     const XMMRegister x6  = xmm6;
4648     const XMMRegister x7  = xmm7;
4649 
4650     const Register tmp   = r11;
4651 
4652     BLOCK_COMMENT("Entry:");
4653     __ enter(); // required for proper stackwalking of RuntimeStub frame
4654 
4655 #ifdef _WIN64
4656     // save the xmm registers which must be preserved 6-7
4657     __ subptr(rsp, 4 * wordSize);
4658     __ movdqu(Address(rsp, 0), xmm6);
4659     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4660 #endif
4661     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4662 
4663 #ifdef _WIN64
4664     // restore xmm regs belonging to calling function
4665     __ movdqu(xmm6, Address(rsp, 0));
4666     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4667     __ addptr(rsp, 4 * wordSize);
4668 #endif
4669 
4670     __ leave(); // required for proper stackwalking of RuntimeStub frame
4671     __ ret(0);
4672 
4673     return start;
4674 
4675   }
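
The #ifdef _WIN64 bookends here and in the libm stubs that follow exist because the Windows x64 calling convention treats xmm6-xmm15 as callee-saved, whereas the System V AMD64 ABI used on Linux has no callee-saved xmm registers; only the Windows path must spill and restore xmm6/xmm7 around the fast_* math kernels.
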
4676 
4677   address generate_libmLog() {
4678     address start = __ pc();
4679 
4680     const XMMRegister x0 = xmm0;
4681     const XMMRegister x1 = xmm1;
4682     const XMMRegister x2 = xmm2;
4683     const XMMRegister x3 = xmm3;
4684 
4685     const XMMRegister x4 = xmm4;
4686     const XMMRegister x5 = xmm5;
4687     const XMMRegister x6 = xmm6;
4688     const XMMRegister x7 = xmm7;
4689 
4690     const Register tmp1 = r11;
4691     const Register tmp2 = r8;
4692 
4693     BLOCK_COMMENT("Entry:");
4694     __ enter(); // required for proper stackwalking of RuntimeStub frame
4695 
4696 #ifdef _WIN64
4697     // save the xmm registers which must be preserved 6-7
4698     __ subptr(rsp, 4 * wordSize);
4699     __ movdqu(Address(rsp, 0), xmm6);
4700     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4701 #endif
4702     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4703 
4704 #ifdef _WIN64
4705     // restore xmm regs belonging to calling function
4706     __ movdqu(xmm6, Address(rsp, 0));
4707     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4708     __ addptr(rsp, 4 * wordSize);
4709 #endif
4710 
4711     __ leave(); // required for proper stackwalking of RuntimeStub frame
4712     __ ret(0);
4713 
4714     return start;
4715 
4716   }
4717 
4718   address generate_libmLog10() {
4719     address start = __ pc();
4720 
4721     const XMMRegister x0 = xmm0;
4722     const XMMRegister x1 = xmm1;
4723     const XMMRegister x2 = xmm2;
4724     const XMMRegister x3 = xmm3;
4725 
4726     const XMMRegister x4 = xmm4;
4727     const XMMRegister x5 = xmm5;
4728     const XMMRegister x6 = xmm6;
4729     const XMMRegister x7 = xmm7;
4730 
4731     const Register tmp = r11;
4732 
4733     BLOCK_COMMENT("Entry:");
4734     __ enter(); // required for proper stackwalking of RuntimeStub frame
4735 
4736 #ifdef _WIN64
4737     // save the xmm registers which must be preserved 6-7
4738     __ subptr(rsp, 4 * wordSize);
4739     __ movdqu(Address(rsp, 0), xmm6);
4740     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4741 #endif
4742     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4743 
4744 #ifdef _WIN64
4745     // restore xmm regs belonging to calling function
4746     __ movdqu(xmm6, Address(rsp, 0));
4747     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4748     __ addptr(rsp, 4 * wordSize);
4749 #endif
4750 
4751     __ leave(); // required for proper stackwalking of RuntimeStub frame
4752     __ ret(0);
4753 
4754     return start;
4755 
4756   }
4757 
4758   address generate_libmPow() {
4759     address start = __ pc();
4760 
4761     const XMMRegister x0 = xmm0;
4762     const XMMRegister x1 = xmm1;
4763     const XMMRegister x2 = xmm2;
4764     const XMMRegister x3 = xmm3;
4765 
4766     const XMMRegister x4 = xmm4;
4767     const XMMRegister x5 = xmm5;
4768     const XMMRegister x6 = xmm6;
4769     const XMMRegister x7 = xmm7;
4770 
4771     const Register tmp1 = r8;
4772     const Register tmp2 = r9;
4773     const Register tmp3 = r10;
4774     const Register tmp4 = r11;
4775 
4776     BLOCK_COMMENT("Entry:");
4777     __ enter(); // required for proper stackwalking of RuntimeStub frame
4778 
4779 #ifdef _WIN64
4780     // save the xmm registers which must be preserved 6-7
4781     __ subptr(rsp, 4 * wordSize);
4782     __ movdqu(Address(rsp, 0), xmm6);
4783     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4784 #endif
4785     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4786 
4787 #ifdef _WIN64
4788     // restore xmm regs belonging to calling function
4789     __ movdqu(xmm6, Address(rsp, 0));
4790     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4791     __ addptr(rsp, 4 * wordSize);
4792 #endif
4793 
4794     __ leave(); // required for proper stackwalking of RuntimeStub frame
4795     __ ret(0);
4796 
4797     return start;
4798 
4799   }
4800 
4801   address generate_libmSin() {
4802     address start = __ pc();
4803 
4804     const XMMRegister x0 = xmm0;
4805     const XMMRegister x1 = xmm1;
4806     const XMMRegister x2 = xmm2;
4807     const XMMRegister x3 = xmm3;
4808 
4809     const XMMRegister x4 = xmm4;
4810     const XMMRegister x5 = xmm5;
4811     const XMMRegister x6 = xmm6;
4812     const XMMRegister x7 = xmm7;
4813 
4814     const Register tmp1 = r8;
4815     const Register tmp2 = r9;
4816     const Register tmp3 = r10;
4817     const Register tmp4 = r11;
4818 
4819     BLOCK_COMMENT("Entry:");
4820     __ enter(); // required for proper stackwalking of RuntimeStub frame
4821 
4822 #ifdef _WIN64
4823     __ push(rsi);
4824     __ push(rdi);
4825     // save the xmm registers which must be preserved 6-7
4826     __ subptr(rsp, 4 * wordSize);
4827     __ movdqu(Address(rsp, 0), xmm6);
4828     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4829 #endif
4830     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4831 
4832 #ifdef _WIN64
4833     // restore xmm regs belonging to calling function
4834     __ movdqu(xmm6, Address(rsp, 0));
4835     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4836     __ addptr(rsp, 4 * wordSize);
4837     __ pop(rdi);
4838     __ pop(rsi);
4839 #endif
4840 
4841     __ leave(); // required for proper stackwalking of RuntimeStub frame
4842     __ ret(0);
4843 
4844     return start;
4845 
4846   }
4847 
4848   address generate_libmCos() {
4849     address start = __ pc();
4850 
4851     const XMMRegister x0 = xmm0;
4852     const XMMRegister x1 = xmm1;
4853     const XMMRegister x2 = xmm2;
4854     const XMMRegister x3 = xmm3;
4855 
4856     const XMMRegister x4 = xmm4;
4857     const XMMRegister x5 = xmm5;
4858     const XMMRegister x6 = xmm6;
4859     const XMMRegister x7 = xmm7;
4860 
4861     const Register tmp1 = r8;
4862     const Register tmp2 = r9;
4863     const Register tmp3 = r10;
4864     const Register tmp4 = r11;
4865 
4866     BLOCK_COMMENT("Entry:");
4867     __ enter(); // required for proper stackwalking of RuntimeStub frame
4868 
4869 #ifdef _WIN64
4870     __ push(rsi);
4871     __ push(rdi);
4872     // save the xmm registers which must be preserved 6-7
4873     __ subptr(rsp, 4 * wordSize);
4874     __ movdqu(Address(rsp, 0), xmm6);
4875     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4876 #endif
4877     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4878 
4879 #ifdef _WIN64
4880     // restore xmm regs belonging to calling function
4881     __ movdqu(xmm6, Address(rsp, 0));
4882     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4883     __ addptr(rsp, 4 * wordSize);
4884     __ pop(rdi);
4885     __ pop(rsi);
4886 #endif
4887 
4888     __ leave(); // required for proper stackwalking of RuntimeStub frame
4889     __ ret(0);
4890 
4891     return start;
4892 
4893   }
4894 
4895   address generate_libmTan() {
4896     address start = __ pc();
4897 
4898     const XMMRegister x0 = xmm0;
4899     const XMMRegister x1 = xmm1;
4900     const XMMRegister x2 = xmm2;
4901     const XMMRegister x3 = xmm3;
4902 
4903     const XMMRegister x4 = xmm4;
4904     const XMMRegister x5 = xmm5;
4905     const XMMRegister x6 = xmm6;
4906     const XMMRegister x7 = xmm7;
4907 
4908     const Register tmp1 = r8;
4909     const Register tmp2 = r9;
4910     const Register tmp3 = r10;
4911     const Register tmp4 = r11;
4912 
4913     BLOCK_COMMENT("Entry:");
4914     __ enter(); // required for proper stackwalking of RuntimeStub frame
4915 
4916 #ifdef _WIN64
4917     __ push(rsi);
4918     __ push(rdi);
4919     // save the xmm registers which must be preserved 6-7
4920     __ subptr(rsp, 4 * wordSize);
4921     __ movdqu(Address(rsp, 0), xmm6);
4922     __ movdqu(Address(rsp, 2 * wordSize), xmm7);
4923 #endif
4924     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4925 
4926 #ifdef _WIN64
4927     // restore xmm regs belonging to calling function
4928     __ movdqu(xmm6, Address(rsp, 0));
4929     __ movdqu(xmm7, Address(rsp, 2 * wordSize));
4930     __ addptr(rsp, 4 * wordSize);
4931     __ pop(rdi);
4932     __ pop(rsi);
4933 #endif
4934 
4935     __ leave(); // required for proper stackwalking of RuntimeStub frame
4936     __ ret(0);
4937 
4938     return start;
4939 
4940   }
4941 
4942 #undef __
4943 #define __ masm->
4944 
4945   // Continuation point for throwing of implicit exceptions that are
4946   // not handled in the current activation. Fabricates an exception
4947   // oop and initiates normal exception dispatching in this
4948   // frame. Since we need to preserve callee-saved values (currently
4949   // only for C2, but done for C1 as well) we need a callee-saved oop
4950   // map and therefore have to make these stubs into RuntimeStubs


+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp

3219     const int XMM_REG_NUM_KEY_LAST  = 15;
3220     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3221     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3222     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3223     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3224     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3225 
3226     __ enter(); // required for proper stackwalking of RuntimeStub frame
3227 
3228     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3229     // merge context for the registers used, since all instructions below operate in
3230     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3231     if (VM_Version::supports_avx512vlbw()) {
3232       __ movl(rax, 0xffff);
3233       __ kmovql(k1, rax);
3234     }
3235 
3236 #ifdef _WIN64
3237     // on win64, fill len_reg from stack position
3238     __ movl(len_reg, len_mem);





3239 #else
3240     __ push(len_reg); // Save
3241 #endif
3242 
3243     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3244     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3245     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3246     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3247       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3248       offset += 0x10;
3249     }
3250     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3251 
3252     // now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 44=128, 52=192, 60=256)
3253     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3254     __ cmpl(rax, 44);
3255     __ jcc(Assembler::notEqual, L_key_192_256);
3256 
3257     // 128-bit code follows here
3258     __ movptr(pos, 0);
3259     __ align(OptoLoopAlignment);
3260 
3261     __ BIND(L_loopTop_128);
3262     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3263     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3264     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3265     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3266       __ aesenc(xmm_result, as_XMMRegister(rnum));
3267     }
3268     __ aesenclast(xmm_result, xmm_key10);
3269     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3270     // no need to store r to memory until we exit
3271     __ addptr(pos, AESBlockSize);
3272     __ subptr(len_reg, AESBlockSize);
3273     __ jcc(Assembler::notEqual, L_loopTop_128);
3274 
3275     __ BIND(L_exit);
3276     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3277 
3278 #ifdef _WIN64




3279     __ movl(rax, len_mem);
3280 #else
3281     __ pop(rax); // return length
3282 #endif
3283     __ leave(); // required for proper stackwalking of RuntimeStub frame
3284     __ ret(0);
3285 
3286     __ BIND(L_key_192_256);
3287     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3288     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3289     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3290     __ cmpl(rax, 52);
3291     __ jcc(Assembler::notEqual, L_key_256);
3292 
3293     // 192-bit code follows here (could be changed to use more xmm registers)
3294     __ movptr(pos, 0);
3295     __ align(OptoLoopAlignment);
3296 
3297     __ BIND(L_loopTop_192);
3298     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input


3420 
3421     // keys 0-10 preloaded into xmm5-xmm15
3422     const int XMM_REG_NUM_KEY_FIRST = 5;
3423     const int XMM_REG_NUM_KEY_LAST  = 15;
3424     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3425     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3426 
3427     __ enter(); // required for proper stackwalking of RuntimeStub frame
3428 
3429     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3430     // merge context for the registers used, since all instructions below operate in
3431     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3432     if (VM_Version::supports_avx512vlbw()) {
3433       __ movl(rax, 0xffff);
3434       __ kmovql(k1, rax);
3435     }
3436 
3437 #ifdef _WIN64
3438     // on win64, fill len_reg from stack position
3439     __ movl(len_reg, len_mem);





3440 #else
3441     __ push(len_reg); // Save
3442 #endif
3443     __ push(rbx);
3444     // the java expanded key ordering is rotated one position from what we want
3445     // so we start from 0x10 here and hit 0x00 last
3446     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3447     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3448     // load up xmm regs 5 thru 15 with the keys at offsets 0x10 - 0xa0, then 0x00 last
3449     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3450       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3451       offset += 0x10;
3452     }
3453     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3454 
3455     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3456 
3457     // registers holding the four results in the parallelized loop
3458     const XMMRegister xmm_result0 = xmm0;
3459     const XMMRegister xmm_result1 = xmm2;


3613         __ aesdec(xmm_result, key_tmp);
3614       }
3615 
3616       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3617       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3618       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3619       // no need to store r to memory until we exit
3620       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3621       __ addptr(pos, AESBlockSize);
3622       __ subptr(len_reg, AESBlockSize);
3623       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3624       if (k != 2) {
3625         __ jmp(L_exit);
3626       }
3627     } //for 128/192/256
3628 
3629     __ BIND(L_exit);
3630     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3631     __ pop(rbx);
3632 #ifdef _WIN64




3633     __ movl(rax, len_mem);
3634 #else
3635     __ pop(rax); // return length
3636 #endif
3637     __ leave(); // required for proper stackwalking of RuntimeStub frame
3638     __ ret(0);
3639     return start;
3640   }
3641 
3642   address generate_upper_word_mask() {
3643     __ align(64);
3644     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3645     address start = __ pc();
3646     __ emit_data64(0x0000000000000000, relocInfo::none);
3647     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3648     return start;
3649   }
3650 
3651   address generate_shuffle_byte_flip_mask() {
3652     __ align(64);


3664     StubCodeMark mark(this, "StubRoutines", name);
3665     address start = __ pc();
3666 
3667     Register buf = c_rarg0;
3668     Register state = c_rarg1;
3669     Register ofs = c_rarg2;
3670     Register limit = c_rarg3;
3671 
3672     const XMMRegister abcd = xmm0;
3673     const XMMRegister e0 = xmm1;
3674     const XMMRegister e1 = xmm2;
3675     const XMMRegister msg0 = xmm3;
3676 
3677     const XMMRegister msg1 = xmm4;
3678     const XMMRegister msg2 = xmm5;
3679     const XMMRegister msg3 = xmm6;
3680     const XMMRegister shuf_mask = xmm7;
3681 
3682     __ enter();
3683 







3684     __ subptr(rsp, 4 * wordSize);
3685 
3686     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3687       buf, state, ofs, limit, rsp, multi_block);
3688 
3689     __ addptr(rsp, 4 * wordSize);






3690 
3691     __ leave();
3692     __ ret(0);
3693     return start;
3694   }
3695 
3696   address generate_pshuffle_byte_flip_mask() {
3697     __ align(64);
3698     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3699     address start = __ pc();
3700     __ emit_data64(0x0405060700010203, relocInfo::none);
3701     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3702 
3703     if (VM_Version::supports_avx2()) {
3704       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3705       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3706       // _SHUF_00BA
3707       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3708       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3709       __ emit_data64(0x0b0a090803020100, relocInfo::none);


3727     address start = __ pc();
3728 
3729     Register buf = c_rarg0;
3730     Register state = c_rarg1;
3731     Register ofs = c_rarg2;
3732     Register limit = c_rarg3;
3733 
3734     const XMMRegister msg = xmm0;
3735     const XMMRegister state0 = xmm1;
3736     const XMMRegister state1 = xmm2;
3737     const XMMRegister msgtmp0 = xmm3;
3738 
3739     const XMMRegister msgtmp1 = xmm4;
3740     const XMMRegister msgtmp2 = xmm5;
3741     const XMMRegister msgtmp3 = xmm6;
3742     const XMMRegister msgtmp4 = xmm7;
3743 
3744     const XMMRegister shuf_mask = xmm8;
3745 
3746     __ enter();
















3747 
3748     __ subptr(rsp, 4 * wordSize);
3749 
3750     if (VM_Version::supports_sha()) {
3751       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3752         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3753     } else if (VM_Version::supports_avx2()) {
3754       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3755         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3756     }
3757     __ addptr(rsp, 4 * wordSize);
3758 














3759     __ leave();
3760     __ ret(0);
3761     return start;
3762   }
3763 
3764   // This is a version of CTR/AES crypt which does 6 blocks at a time in a loop
3765   // to hide instruction latency
3766   //
3767   // Arguments:
3768   //
3769   // Inputs:
3770   //   c_rarg0   - source byte array address
3771   //   c_rarg1   - destination byte array address
3772   //   c_rarg2   - K (key) in little endian int array
3773   //   c_rarg3   - counter vector byte array address
3774   //   Linux
3775   //     c_rarg4   -          input length
3776   //     c_rarg5   -          saved encryptedCounter start
3777   //     rbp + 6 * wordSize - saved used length
3778   //   Windows


3839     Label L_multiBlock_loopTop[3];
3840     Label L_singleBlockLoopTop[3];
3841     Label L__incCounter[3][6]; //for 6 blocks
3842     Label L__incCounter_single[3]; //for single block, key128, key192, key256
3843     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3844     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3845 
3846     Label L_exit;
3847 
3848     __ enter(); // required for proper stackwalking of RuntimeStub frame
3849 
3850     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
3851     // merge context for the registers used, since all instructions below operate in
3852     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
3853     if (VM_Version::supports_avx512vlbw()) {
3854       __ movl(rax, 0xffff);
3855       __ kmovql(k1, rax);
3856     }
3857 
3858 #ifdef _WIN64
3859     // allocate spill slots for r13, r14
3860     enum {
3861         saved_r13_offset,
3862         saved_r14_offset
3863     };
3864     __ subptr(rsp, 2 * wordSize);
3865     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3866     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);




3867 
3868     // on win64, fill len_reg from stack position
3869     __ movl(len_reg, len_mem);
3870     __ movptr(saved_encCounter_start, saved_encCounter_mem);
3871     __ movptr(used_addr, used_mem);
3872     __ movl(used, Address(used_addr, 0));
3873 #else
3874     __ push(len_reg); // Save
3875     __ movptr(used_addr, used_mem);
3876     __ movl(used, Address(used_addr, 0));
3877 #endif
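
In this updated version the stub no longer spills the whole xmm6-xmm15 range; r13 and r14 are parked in two explicit word-sized slots whose offsets come from consecutive enum values. A quick check of the layout (wordSize == 8 on x86_64):

    enum { saved_r13_offset, saved_r14_offset };   // 0 and 1
    static_assert(saved_r13_offset * 8 == 0, "r13 spills to [rsp + 0]");
    static_assert(saved_r14_offset * 8 == 8, "r14 spills to [rsp + 8]");
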
3878 
3879     __ push(rbx); // Save RBX
3880     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3881     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3882     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
3883     __ movptr(pos, 0);
3884 
3885     // Use the partially used encrypted counter from the last invocation
3886     __ BIND(L_preLoop_start);


4048         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4049           __ pextrw(Address(to, pos), xmm_result0, 0);
4050           __ psrldq(xmm_result0, 2);
4051           __ addptr(pos, 2);
4052         __ BIND(L_processTail_1_extr[k]);
4053         __ testptr(len_reg, 1);
4054         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4055           __ pextrb(Address(to, pos), xmm_result0, 0);
4056 
4057         __ BIND(L_processTail_exit_extr[k]);
4058         __ movl(Address(used_addr, 0), len_reg);
4059         __ jmp(L_exit);
4060 
4061     }
4062 
4063     __ BIND(L_exit);
4064     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4065     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4066     __ pop(rbx); // pop the saved RBX.
4067 #ifdef _WIN64




4068     __ movl(rax, len_mem);
4069     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4070     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4071     __ addptr(rsp, 2 * wordSize);
4072 #else
4073     __ pop(rax); // return 'len'
4074 #endif
4075     __ leave(); // required for proper stackwalking of RuntimeStub frame
4076     __ ret(0);
4077     return start;
4078   }
4079 
4080   // byte swap x86 long
4081   address generate_ghash_long_swap_mask() {
4082     __ align(CodeEntryAlignment);
4083     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4084     address start = __ pc();
4085     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4086     __ emit_data64(0x0706050403020100, relocInfo::none );
4087     return start;
4088   }
4089 
4090   // byte swap x86 byte array
4091   address generate_ghash_byte_swap_mask() {
4092     __ align(CodeEntryAlignment);
4093     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4094     address start = __ pc();
4095     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4096     __ emit_data64(0x0001020304050607, relocInfo::none );
4097     return start;
4098   }
4099 
4100   /* Single and multi-block ghash operations */
4101   address generate_ghash_processBlocks() {
4102     __ align(CodeEntryAlignment);
4103     Label L_ghash_loop, L_exit;
4104     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4105     address start = __ pc();
4106 
4107     const Register state        = c_rarg0;
4108     const Register subkeyH      = c_rarg1;
4109     const Register data         = c_rarg2;
4110     const Register blocks       = c_rarg3;
4111 




4112     const XMMRegister xmm_temp0 = xmm0;
4113     const XMMRegister xmm_temp1 = xmm1;
4114     const XMMRegister xmm_temp2 = xmm2;
4115     const XMMRegister xmm_temp3 = xmm3;
4116     const XMMRegister xmm_temp4 = xmm4;
4117     const XMMRegister xmm_temp5 = xmm5;
4118     const XMMRegister xmm_temp6 = xmm6;
4119     const XMMRegister xmm_temp7 = xmm7;
4120     const XMMRegister xmm_temp8 = xmm8;
4121     const XMMRegister xmm_temp9 = xmm9;
4122     const XMMRegister xmm_temp10 = xmm10;
4123 
4124     __ enter();
4125 
4126     // For EVEX with VL and BW, provide a standard all-ones mask; VL = 128 guides the
4127     // merge context for the registers used, since all instructions below operate in
4128     // 128-bit mode. On EVEX without VL and BW, these instructions will all be AVX.
4129     if (VM_Version::supports_avx512vlbw()) {
4130       __ movl(rax, 0xffff);
4131       __ kmovql(k1, rax);
4132     }
4133 








4134     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4135 
4136     __ movdqu(xmm_temp0, Address(state, 0));
4137     __ pshufb(xmm_temp0, xmm_temp10);
4138 
4139 
4140     __ BIND(L_ghash_loop);
4141     __ movdqu(xmm_temp2, Address(data, 0));
4142     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4143 
4144     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4145     __ pshufb(xmm_temp1, xmm_temp10);
4146 
4147     __ pxor(xmm_temp0, xmm_temp2);
4148 
4149     //
4150     // Multiply with the hash key
4151     //
4152     __ movdqu(xmm_temp3, xmm_temp0);
4153     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0


4213     __ movdqu(xmm_temp5, xmm_temp3);
4214     __ psrld(xmm_temp2, 1);     // packed right shift >> 1
4215     __ psrld(xmm_temp4, 2);     // packed right shift >> 2
4216     __ psrld(xmm_temp5, 7);     // packed right shift >> 7
4217     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4218     __ pxor(xmm_temp2, xmm_temp5);
4219     __ pxor(xmm_temp2, xmm_temp8);
4220     __ pxor(xmm_temp3, xmm_temp2);
4221     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4222 
4223     __ decrement(blocks);
4224     __ jcc(Assembler::zero, L_exit);
4225     __ movdqu(xmm_temp0, xmm_temp6);
4226     __ addptr(data, 16);
4227     __ jmp(L_ghash_loop);
4228 
4229     __ BIND(L_exit);
4230     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4231     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4232 






4233     __ leave();
4234     __ ret(0);
4235     return start;
4236   }
4237 
4238   /**
4239    *  Arguments:
4240    *
4241    * Inputs:
4242    *   c_rarg0   - int crc
4243    *   c_rarg1   - byte* buf
4244    *   c_rarg2   - int length
4245    *
4246    * Output:
4247    *       rax   - int crc result
4248    */
4249   address generate_updateBytesCRC32() {
4250     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4251 
4252     __ align(CodeEntryAlignment);


4549   }
4550 
4551   address generate_libmExp() {
4552     address start = __ pc();
4553 
4554     const XMMRegister x0  = xmm0;
4555     const XMMRegister x1  = xmm1;
4556     const XMMRegister x2  = xmm2;
4557     const XMMRegister x3  = xmm3;
4558 
4559     const XMMRegister x4  = xmm4;
4560     const XMMRegister x5  = xmm5;
4561     const XMMRegister x6  = xmm6;
4562     const XMMRegister x7  = xmm7;
4563 
4564     const Register tmp   = r11;
4565 
4566     BLOCK_COMMENT("Entry:");
4567     __ enter(); // required for proper stackwalking of RuntimeStub frame
4568 






4569     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4570 







4571     __ leave(); // required for proper stackwalking of RuntimeStub frame
4572     __ ret(0);
4573 
4574     return start;
4575 
4576   }
4577 
4578   address generate_libmLog() {
4579     address start = __ pc();
4580 
4581     const XMMRegister x0 = xmm0;
4582     const XMMRegister x1 = xmm1;
4583     const XMMRegister x2 = xmm2;
4584     const XMMRegister x3 = xmm3;
4585 
4586     const XMMRegister x4 = xmm4;
4587     const XMMRegister x5 = xmm5;
4588     const XMMRegister x6 = xmm6;
4589     const XMMRegister x7 = xmm7;
4590 
4591     const Register tmp1 = r11;
4592     const Register tmp2 = r8;
4593 
4594     BLOCK_COMMENT("Entry:");
4595     __ enter(); // required for proper stackwalking of RuntimeStub frame
4596 






4597     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4598 







4599     __ leave(); // required for proper stackwalking of RuntimeStub frame
4600     __ ret(0);
4601 
4602     return start;
4603 
4604   }
4605 
4606   address generate_libmLog10() {
4607     address start = __ pc();
4608 
4609     const XMMRegister x0 = xmm0;
4610     const XMMRegister x1 = xmm1;
4611     const XMMRegister x2 = xmm2;
4612     const XMMRegister x3 = xmm3;
4613 
4614     const XMMRegister x4 = xmm4;
4615     const XMMRegister x5 = xmm5;
4616     const XMMRegister x6 = xmm6;
4617     const XMMRegister x7 = xmm7;
4618 
4619     const Register tmp = r11;
4620 
4621     BLOCK_COMMENT("Entry:");
4622     __ enter(); // required for proper stackwalking of RuntimeStub frame
4623 






4624     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4625 







4626     __ leave(); // required for proper stackwalking of RuntimeStub frame
4627     __ ret(0);
4628 
4629     return start;
4630 
4631   }
4632 
4633   address generate_libmPow() {
4634     address start = __ pc();
4635 
4636     const XMMRegister x0 = xmm0;
4637     const XMMRegister x1 = xmm1;
4638     const XMMRegister x2 = xmm2;
4639     const XMMRegister x3 = xmm3;
4640 
4641     const XMMRegister x4 = xmm4;
4642     const XMMRegister x5 = xmm5;
4643     const XMMRegister x6 = xmm6;
4644     const XMMRegister x7 = xmm7;
4645 
4646     const Register tmp1 = r8;
4647     const Register tmp2 = r9;
4648     const Register tmp3 = r10;
4649     const Register tmp4 = r11;
4650 
4651     BLOCK_COMMENT("Entry:");
4652     __ enter(); // required for proper stackwalking of RuntimeStub frame
4653 






4654     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4655 







4656     __ leave(); // required for proper stackwalking of RuntimeStub frame
4657     __ ret(0);
4658 
4659     return start;
4660 
4661   }
4662 
4663   address generate_libmSin() {
4664     address start = __ pc();
4665 
4666     const XMMRegister x0 = xmm0;
4667     const XMMRegister x1 = xmm1;
4668     const XMMRegister x2 = xmm2;
4669     const XMMRegister x3 = xmm3;
4670 
4671     const XMMRegister x4 = xmm4;
4672     const XMMRegister x5 = xmm5;
4673     const XMMRegister x6 = xmm6;
4674     const XMMRegister x7 = xmm7;
4675 
4676     const Register tmp1 = r8;
4677     const Register tmp2 = r9;
4678     const Register tmp3 = r10;
4679     const Register tmp4 = r11;
4680 
4681     BLOCK_COMMENT("Entry:");
4682     __ enter(); // required for proper stackwalking of RuntimeStub frame
4683 
4684 #ifdef _WIN64
4685     __ push(rsi);
4686     __ push(rdi);




4687 #endif
4688     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4689 
4690 #ifdef _WIN64




4691     __ pop(rdi);
4692     __ pop(rsi);
4693 #endif
4694 
4695     __ leave(); // required for proper stackwalking of RuntimeStub frame
4696     __ ret(0);
4697 
4698     return start;
4699 
4700   }
4701 
4702   address generate_libmCos() {
4703     address start = __ pc();
4704 
4705     const XMMRegister x0 = xmm0;
4706     const XMMRegister x1 = xmm1;
4707     const XMMRegister x2 = xmm2;
4708     const XMMRegister x3 = xmm3;
4709 
4710     const XMMRegister x4 = xmm4;
4711     const XMMRegister x5 = xmm5;
4712     const XMMRegister x6 = xmm6;
4713     const XMMRegister x7 = xmm7;
4714 
4715     const Register tmp1 = r8;
4716     const Register tmp2 = r9;
4717     const Register tmp3 = r10;
4718     const Register tmp4 = r11;
4719 
4720     BLOCK_COMMENT("Entry:");
4721     __ enter(); // required for proper stackwalking of RuntimeStub frame
4722 
4723 #ifdef _WIN64
4724     __ push(rsi);
4725     __ push(rdi);




4726 #endif
4727     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4728 
4729 #ifdef _WIN64




4730     __ pop(rdi);
4731     __ pop(rsi);
4732 #endif
4733 
4734     __ leave(); // required for proper stackwalking of RuntimeStub frame
4735     __ ret(0);
4736 
4737     return start;
4738 
4739   }
4740 
4741   address generate_libmTan() {
4742     address start = __ pc();
4743 
4744     const XMMRegister x0 = xmm0;
4745     const XMMRegister x1 = xmm1;
4746     const XMMRegister x2 = xmm2;
4747     const XMMRegister x3 = xmm3;
4748 
4749     const XMMRegister x4 = xmm4;
4750     const XMMRegister x5 = xmm5;
4751     const XMMRegister x6 = xmm6;
4752     const XMMRegister x7 = xmm7;
4753 
4754     const Register tmp1 = r8;
4755     const Register tmp2 = r9;
4756     const Register tmp3 = r10;
4757     const Register tmp4 = r11;
4758 
4759     BLOCK_COMMENT("Entry:");
4760     __ enter(); // required for proper stackwalking of RuntimeStub frame
4761 
4762 #ifdef _WIN64
4763     __ push(rsi);
4764     __ push(rdi);




4765 #endif
4766     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4767 
4768 #ifdef _WIN64




4769     __ pop(rdi);
4770     __ pop(rsi);
4771 #endif
4772 
4773     __ leave(); // required for proper stackwalking of RuntimeStub frame
4774     __ ret(0);
4775 
4776     return start;
4777 
4778   }
4779 
4780 #undef __
4781 #define __ masm->
4782 
4783   // Continuation point for throwing of implicit exceptions that are
4784   // not handled in the current activation. Fabricates an exception
4785   // oop and initiates normal exception dispatching in this
4786   // frame. Since we need to preserve callee-saved values (currently
4787   // only for C2, but done for C1 as well) we need a callee-saved oop
4788   // map and therefore have to make these stubs into RuntimeStubs

