3106 }
3107
3108 void MacroAssembler::load_double(Address src) {
3109 if (UseSSE >= 2) {
3110 movdbl(xmm0, src);
3111 } else {
3112 LP64_ONLY(ShouldNotReachHere());
3113 NOT_LP64(fld_d(src));
3114 }
3115 }
3116
3117 void MacroAssembler::store_double(Address dst) {
3118 if (UseSSE >= 2) {
3119 movdbl(dst, xmm0);
3120 } else {
3121 LP64_ONLY(ShouldNotReachHere());
3122 NOT_LP64(fstp_d(dst));
3123 }
3124 }
3125
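// push_zmm/pop_zmm spill and reload a full 512-bit register through the
// stack. The workarounds in this file use them to borrow xmm0/xmm1 as
// temporaries without clobbering the caller's values, e.g. (sketch):
//   push_zmm(xmm0);
//   ... use xmm0 as a temporary ...
//   pop_zmm(xmm0);
// lea is used for the stack adjustment so the arithmetic flags survive.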
3126 void MacroAssembler::push_zmm(XMMRegister reg) {
3127 lea(rsp, Address(rsp, -64)); // Use lea to not affect flags
3128 evmovdqul(Address(rsp, 0), reg, Assembler::AVX_512bit);
3129 }
3130
3131 void MacroAssembler::pop_zmm(XMMRegister reg) {
3132 evmovdqul(reg, Address(rsp, 0), Assembler::AVX_512bit);
3133 lea(rsp, Address(rsp, 64)); // Use lea to not affect flags
3134 }
3135
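// fremr reduces ST0 modulo ST1 by looping on fprem, which lowers the
// exponent by at most 63 bits per step and reports an incomplete reduction
// in the C2 bit of the FPU status word. fnstsw_ax copies that word into ax:
// the 64-bit path tests C2 directly (bit 10, mask 0x400), while the 32-bit
// path uses sahf, which maps C2 into the parity flag, hence jcc(parity).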
3136 void MacroAssembler::fremr(Register tmp) {
3137 save_rax(tmp);
3138 { Label L;
3139 bind(L);
3140 fprem();
3141 fwait(); fnstsw_ax();
3142 #ifdef _LP64
3143 testl(rax, 0x400);
3144 jcc(Assembler::notEqual, L);
3145 #else
3146 sahf();
3147 jcc(Assembler::parity, L);
3148 #endif // _LP64
3149 }
3150 restore_rax(tmp);
3151 // Result is in ST0.
3152 // Note: fxch & fpop to get rid of ST1
3153 // (otherwise FPU stack could overflow eventually)
3154 fxch(1);
3155 fpop();
3496 }
3497
3498 void MacroAssembler::movptr(Register dst, Register src) {
3499 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3500 }
3501
3502 void MacroAssembler::movptr(Register dst, Address src) {
3503 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3504 }
3505
3506 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3507 void MacroAssembler::movptr(Register dst, intptr_t src) {
3508 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3509 }
3510
3511 void MacroAssembler::movptr(Address dst, Register src) {
3512 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3513 }
3514
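// The movdqu/vmovdqu wrappers below exist because legacy SSE and VEX
// encodings can only name xmm0-xmm15. On AVX-512 hardware without AVX512VL
// there is no 128/256-bit EVEX encoding either, so a move that touches
// xmm16-xmm31 is rewritten with 512-bit EVEX instructions that are legal
// with plain AVX512F: a 128-bit lane insert/extract for the memory forms
// and a full-width evmovdqul for register-to-register copies.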
3515 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3516 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3517 Assembler::vextractf32x4(dst, src, 0);
3518 } else {
3519 Assembler::movdqu(dst, src);
3520 }
3521 }
3522
3523 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3524 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3525 Assembler::vinsertf32x4(dst, dst, src, 0);
3526 } else {
3527 Assembler::movdqu(dst, src);
3528 }
3529 }
3530
3531 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3532 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3533 Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3534 } else {
3535 Assembler::movdqu(dst, src);
3536 }
3537 }
3538
3539 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3540 if (reachable(src)) {
3541 movdqu(dst, as_Address(src));
3542 } else {
3543 lea(scratchReg, src);
3544 movdqu(dst, Address(scratchReg, 0));
3545 }
3546 }
3547
3548 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3549 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3550 vextractf64x4_low(dst, src);
3551 } else {
3552 Assembler::vmovdqu(dst, src);
3553 }
3554 }
3555
3556 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3557 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3558 vinsertf64x4_low(dst, src);
3559 } else {
3560 Assembler::vmovdqu(dst, src);
3561 }
3562 }
3563
3564 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3565 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3566 Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3567 }
3568 else {
3569 Assembler::vmovdqu(dst, src);
3570 }
3571 }
3572
3573 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3574 if (reachable(src)) {
3575 vmovdqu(dst, as_Address(src));
3576 }
3577 else {
3578 lea(rscratch1, src);
3579 vmovdqu(dst, Address(rscratch1, 0));
3580 }
3581 }
3582
3583 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3584 if (reachable(src)) {
3585 Assembler::evmovdquq(dst, as_Address(src), vector_len);
3586 } else {
3587 lea(rscratch, src);
3588 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3589 }
3590 }
3834 shll(reg, 24);
3835 sarl(reg, 24);
3836 }
3837 }
3838
3839 void MacroAssembler::sign_extend_short(Register reg) {
3840 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3841 movswl(reg, reg); // movsxw
3842 } else {
3843 shll(reg, 16);
3844 sarl(reg, 16);
3845 }
3846 }
3847
3848 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3849 assert(reachable(src), "Address should be reachable");
3850 testl(dst, as_Address(src));
3851 }
3852
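// The helpers below share one dispatch ladder. On pre-AVX-512 hardware
// (supports_avxonly) the upper register bank does not exist, and with
// AVX512BW the EVEX byte/word forms reach every register, so the
// instruction is emitted directly. Otherwise only the legacy encoding is
// available and it cannot address xmm16-xmm31, so any upper-bank operand
// is staged through xmm0/xmm1 (preserved with push_zmm/pop_zmm) and the
// result is copied back into dst.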
3853 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3854 int dst_enc = dst->encoding();
3855 int src_enc = src->encoding();
3856 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3857 Assembler::pcmpeqb(dst, src);
3858 } else if ((dst_enc < 16) && (src_enc < 16)) {
3859 Assembler::pcmpeqb(dst, src);
3860 } else if (src_enc < 16) {
3861 push_zmm(xmm0);
3862 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3863 Assembler::pcmpeqb(xmm0, src);
3864 movdqu(dst, xmm0);
3865 pop_zmm(xmm0);
3866 } else if (dst_enc < 16) {
3867 push_zmm(xmm0);
3868 evmovdqul(xmm0, src, Assembler::AVX_512bit);
3869 Assembler::pcmpeqb(dst, xmm0);
3870 pop_zmm(xmm0);
3871 } else {
3872 push_zmm(xmm0);
3873 push_zmm(xmm1);
3874 movdqu(xmm0, src);
3875 movdqu(xmm1, dst);
3876 Assembler::pcmpeqb(xmm1, xmm0);
3877 movdqu(dst, xmm1);
3878 pop_zmm(xmm1);
3879 pop_zmm(xmm0);
3880 }
3881 }
3882
3883 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3884 int dst_enc = dst->encoding();
3885 int src_enc = src->encoding();
3886 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3887 Assembler::pcmpeqw(dst, src);
3888 } else if ((dst_enc < 16) && (src_enc < 16)) {
3889 Assembler::pcmpeqw(dst, src);
3890 } else if (src_enc < 16) {
3891 push_zmm(xmm0);
3892 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3893 Assembler::pcmpeqw(xmm0, src);
3894 movdqu(dst, xmm0);
3895 pop_zmm(xmm0);
3896 } else if (dst_enc < 16) {
3897 push_zmm(xmm0);
3898 evmovdqul(xmm0, src, Assembler::AVX_512bit);
3899 Assembler::pcmpeqw(dst, xmm0);
3900 pop_zmm(xmm0);
3901 } else {
3902 push_zmm(xmm0);
3903 push_zmm(xmm1);
3904 movdqu(xmm0, src);
3905 movdqu(xmm1, dst);
3906 Assembler::pcmpeqw(xmm1, xmm0);
3907 movdqu(dst, xmm1);
3908 pop_zmm(xmm1);
3909 pop_zmm(xmm0);
3910 }
3911 }
3912
3913 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3914 int dst_enc = dst->encoding();
3915 if (dst_enc < 16) {
3916 Assembler::pcmpestri(dst, src, imm8);
3917 } else {
3918 push_zmm(xmm0);
3919 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3920 Assembler::pcmpestri(xmm0, src, imm8);
3921 movdqu(dst, xmm0);
3922 pop_zmm(xmm0);
3923 }
3924 }
3925
3926 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3927 int dst_enc = dst->encoding();
3928 int src_enc = src->encoding();
3929 if ((dst_enc < 16) && (src_enc < 16)) {
3930 Assembler::pcmpestri(dst, src, imm8);
3931 } else if (src_enc < 16) {
3932 push_zmm(xmm0);
3933 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3934 Assembler::pcmpestri(xmm0, src, imm8);
3935 movdqu(dst, xmm0);
3936 pop_zmm(xmm0);
3937 } else if (dst_enc < 16) {
3938 push_zmm(xmm0);
3939 evmovdqul(xmm0, src, Assembler::AVX_512bit);
3940 Assembler::pcmpestri(dst, xmm0, imm8);
3941 pop_zmm(xmm0);
3942 } else {
3943 push_zmm(xmm0);
3944 push_zmm(xmm1);
3945 movdqu(xmm0, src);
3946 movdqu(xmm1, dst);
3947 Assembler::pcmpestri(xmm1, xmm0, imm8);
3948 movdqu(dst, xmm1);
3949 pop_zmm(xmm1);
3950 pop_zmm(xmm0);
3951 }
3952 }
3953
3954 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3955 int dst_enc = dst->encoding();
3956 int src_enc = src->encoding();
3957 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3958 Assembler::pmovzxbw(dst, src);
3959 } else if ((dst_enc < 16) && (src_enc < 16)) {
3960 Assembler::pmovzxbw(dst, src);
3961 } else if (src_enc < 16) {
3962 push_zmm(xmm0);
3963 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3964 Assembler::pmovzxbw(xmm0, src);
3965 movdqu(dst, xmm0);
3966 pop_zmm(xmm0);
3967 } else if (dst_enc < 16) {
3968 push_zmm(xmm0);
3969 evmovdqul(xmm0, src, Assembler::AVX_512bit);
3970 Assembler::pmovzxbw(dst, xmm0);
3971 pop_zmm(xmm0);
3972 } else {
3973 push_zmm(xmm0);
3974 push_zmm(xmm1);
3975 movdqu(xmm0, src);
3976 movdqu(xmm1, dst);
3977 Assembler::pmovzxbw(xmm1, xmm0);
3978 movdqu(dst, xmm1);
3979 pop_zmm(xmm1);
3980 pop_zmm(xmm0);
3981 }
3982 }
3983
3984 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3985 int dst_enc = dst->encoding();
3986 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3987 Assembler::pmovzxbw(dst, src);
3988 } else if (dst_enc < 16) {
3989 Assembler::pmovzxbw(dst, src);
3990 } else {
3991 push_zmm(xmm0);
3992 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3993 Assembler::pmovzxbw(xmm0, src);
3994 movdqu(dst, xmm0);
3995 pop_zmm(xmm0);
3996 }
3997 }
3998
3999 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
4000 int src_enc = src->encoding();
4001 if (src_enc < 16) {
4002 Assembler::pmovmskb(dst, src);
4003 } else {
4004 push_zmm(xmm0);
4005 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4006 Assembler::pmovmskb(dst, xmm0);
4007 pop_zmm(xmm0);
4008 }
4009 }
4010
4011 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
4012 int dst_enc = dst->encoding();
4013 int src_enc = src->encoding();
4014 if ((dst_enc < 16) && (src_enc < 16)) {
4015 Assembler::ptest(dst, src);
4016 } else if (src_enc < 16) {
4017 push_zmm(xmm0);
4018 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4019 Assembler::ptest(xmm0, src);
4020 pop_zmm(xmm0);
4021 } else if (dst_enc < 16) {
4022 push_zmm(xmm0);
4023 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4024 Assembler::ptest(dst, xmm0);
4025 pop_zmm(xmm0);
4026 } else {
4027 push_zmm(xmm0);
4028 push_zmm(xmm1);
4029 movdqu(xmm0, src);
4030 movdqu(xmm1, dst);
4031 Assembler::ptest(xmm1, xmm0);
4032 pop_zmm(xmm1);
4033 pop_zmm(xmm0);
4034 }
4035 }
4036
4037 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
4038 if (reachable(src)) {
4039 Assembler::sqrtsd(dst, as_Address(src));
4040 } else {
4041 lea(rscratch1, src);
4042 Assembler::sqrtsd(dst, Address(rscratch1, 0));
4043 }
4044 }
4045
4046 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
4047 if (reachable(src)) {
4048 Assembler::sqrtss(dst, as_Address(src));
4049 } else {
4050 lea(rscratch1, src);
4051 Assembler::sqrtss(dst, Address(rscratch1, 0));
4052 }
4053 }
4054
4143
4144 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4145 if (reachable(src)) {
4146 vaddsd(dst, nds, as_Address(src));
4147 } else {
4148 lea(rscratch1, src);
4149 vaddsd(dst, nds, Address(rscratch1, 0));
4150 }
4151 }
4152
4153 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4154 if (reachable(src)) {
4155 vaddss(dst, nds, as_Address(src));
4156 } else {
4157 lea(rscratch1, src);
4158 vaddss(dst, nds, Address(rscratch1, 0));
4159 }
4160 }
4161
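// vabsss/vabssd compute |x| by ANDing with the constant at negate_field,
// which is assumed to be a mask that clears the sign bit (0x7fffffff for
// float, 0x7fffffffffffffff for double). The branches below are the usual
// upper-bank workaround, additionally pressing src into service as a
// scratch register where possible.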
4162 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4163 int dst_enc = dst->encoding();
4164 int nds_enc = nds->encoding();
4165 int src_enc = src->encoding();
4166 if ((dst_enc < 16) && (nds_enc < 16)) {
4167 vandps(dst, nds, negate_field, vector_len);
4168 } else if ((src_enc < 16) && (dst_enc < 16)) {
4169 // Use src scratch register
4170 evmovdqul(src, nds, Assembler::AVX_512bit);
4171 vandps(dst, src, negate_field, vector_len);
4172 } else if (dst_enc < 16) {
4173 evmovdqul(dst, nds, Assembler::AVX_512bit);
4174 vandps(dst, dst, negate_field, vector_len);
4175 } else if (nds_enc < 16) {
4176 vandps(nds, nds, negate_field, vector_len);
4177 evmovdqul(dst, nds, Assembler::AVX_512bit);
4178 } else if (src_enc < 16) {
4179 evmovdqul(src, nds, Assembler::AVX_512bit);
4180 vandps(src, src, negate_field, vector_len);
4181 evmovdqul(dst, src, Assembler::AVX_512bit);
4182 } else {
4183 if (src_enc != dst_enc) {
4184 // use src as scratch to save xmm0
4185 evmovdqul(src, xmm0, Assembler::AVX_512bit);
4186 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4187 vandps(xmm0, xmm0, negate_field, vector_len);
4188 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4189 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4190 } else {
4191 push_zmm(xmm0);
4192 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4193 vandps(xmm0, xmm0, negate_field, vector_len);
4194 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4195 pop_zmm(xmm0);
4196 }
4197 }
4198 }
4199
4200 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4201 int dst_enc = dst->encoding();
4202 int nds_enc = nds->encoding();
4203 int src_enc = src->encoding();
4204 if ((dst_enc < 16) && (nds_enc < 16)) {
4205 vandpd(dst, nds, negate_field, vector_len);
4206 } else if ((src_enc < 16) && (dst_enc < 16)) {
4207 // Use src scratch register
4208 evmovdqul(src, nds, Assembler::AVX_512bit);
4209 vandpd(dst, src, negate_field, vector_len);
4210 } else if (dst_enc < 16) {
4211 evmovdqul(dst, nds, Assembler::AVX_512bit);
4212 vandpd(dst, dst, negate_field, vector_len);
4213 } else if (nds_enc < 16) {
4214 vandpd(nds, nds, negate_field, vector_len);
4215 evmovdqul(dst, nds, Assembler::AVX_512bit);
4216 } else if (src_enc < 16) {
4217 evmovdqul(src, nds, Assembler::AVX_512bit);
4218 vandpd(src, src, negate_field, vector_len);
4219 evmovdqul(dst, src, Assembler::AVX_512bit);
4220 } else {
4221 if (src_enc != dst_enc) {
4222 evmovdqul(src, xmm0, Assembler::AVX_512bit);
4223 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4224 vandpd(xmm0, xmm0, negate_field, vector_len);
4225 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4226 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4227 } else {
4228 push_zmm(xmm0);
4229 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4230 vandpd(xmm0, xmm0, negate_field, vector_len);
4231 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4232 pop_zmm(xmm0);
4233 }
4234 }
4235 }
4236
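// For the three-operand AVX helpers below, the fallback without AVX512BW
// can only encode xmm0-xmm15, so the operation degrades to the destructive
// two-operand form (dst = dst op src) and nds, which is evidently treated
// as dead here, serves as scratch for whichever of src, dst or a saved
// xmm0 needs a low-bank home. vpaddw, vpmullw, vpsubb and vpsubw follow
// the identical scheme.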
4237 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4238 int dst_enc = dst->encoding();
4239 int nds_enc = nds->encoding();
4240 int src_enc = src->encoding();
4241 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4242 Assembler::vpaddb(dst, nds, src, vector_len);
4243 } else if ((dst_enc < 16) && (src_enc < 16)) {
4244 Assembler::vpaddb(dst, dst, src, vector_len);
4245 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4246 // use nds as scratch for src
4247 evmovdqul(nds, src, Assembler::AVX_512bit);
4248 Assembler::vpaddb(dst, dst, nds, vector_len);
4249 } else if ((src_enc < 16) && (nds_enc < 16)) {
4250 // use nds as scratch for dst
4251 evmovdqul(nds, dst, Assembler::AVX_512bit);
4252 Assembler::vpaddb(nds, nds, src, vector_len);
4253 evmovdqul(dst, nds, Assembler::AVX_512bit);
4254 } else if (dst_enc < 16) {
4255 // use nds as scratch to save xmm0; xmm0 holds src
4256 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4257 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4258 Assembler::vpaddb(dst, dst, xmm0, vector_len);
4259 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4260 } else {
4261 // worst case scenario, all regs are in the upper bank
4262 push_zmm(xmm1);
4263 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4264 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4265 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4266 Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4267 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4268 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4269 pop_zmm(xmm1);
4270 }
4271 }
4272
4273 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4274 int dst_enc = dst->encoding();
4275 int nds_enc = nds->encoding();
4276 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4277 Assembler::vpaddb(dst, nds, src, vector_len);
4278 } else if (dst_enc < 16) {
4279 Assembler::vpaddb(dst, dst, src, vector_len);
4280 } else if (nds_enc < 16) {
4281 // implies dst_enc in upper bank with nds as scratch
4282 evmovdqul(nds, dst, Assembler::AVX_512bit);
4283 Assembler::vpaddb(nds, nds, src, vector_len);
4284 evmovdqul(dst, nds, Assembler::AVX_512bit);
4285 } else {
4286 // worst case scenario, all regs in upper bank
4287 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4288 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4289 Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4290 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4291 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4292 }
4293 }
4294
4295 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4296 int dst_enc = dst->encoding();
4297 int nds_enc = nds->encoding();
4298 int src_enc = src->encoding();
4299 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4300 Assembler::vpaddw(dst, nds, src, vector_len);
4301 } else if ((dst_enc < 16) && (src_enc < 16)) {
4302 Assembler::vpaddw(dst, dst, src, vector_len);
4303 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4304 // use nds as scratch for src
4305 evmovdqul(nds, src, Assembler::AVX_512bit);
4306 Assembler::vpaddw(dst, dst, nds, vector_len);
4307 } else if ((src_enc < 16) && (nds_enc < 16)) {
4308 // use nds as scratch for dst
4309 evmovdqul(nds, dst, Assembler::AVX_512bit);
4310 Assembler::vpaddw(nds, nds, src, vector_len);
4311 evmovdqul(dst, nds, Assembler::AVX_512bit);
4312 } else if (dst_enc < 16) {
4313 // use nds as scratch to save xmm0; xmm0 holds src
4314 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4315 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4316 Assembler::vpaddw(dst, dst, xmm0, vector_len);
4317 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4318 } else {
4319 // worst case scenario, all regs are in the upper bank
4320 push_zmm(xmm1);
4321 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4322 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4323 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4324 Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4325 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4326 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4327 pop_zmm(xmm1);
4328 }
4329 }
4330
4331 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4332 int dst_enc = dst->encoding();
4333 int nds_enc = nds->encoding();
4334 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4335 Assembler::vpaddw(dst, nds, src, vector_len);
4336 } else if (dst_enc < 16) {
4337 Assembler::vpaddw(dst, dst, src, vector_len);
4338 } else if (nds_enc < 16) {
4339 // implies dst_enc in upper bank with nds as scratch
4340 evmovdqul(nds, dst, Assembler::AVX_512bit);
4341 Assembler::vpaddw(nds, nds, src, vector_len);
4342 evmovdqul(dst, nds, Assembler::AVX_512bit);
4343 } else {
4344 // worst case scenario, all regs in upper bank
4345 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4346 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4347 Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4348 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4349 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4350 }
4351 }
4352
4353 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4354 if (reachable(src)) {
4355 Assembler::vpand(dst, nds, as_Address(src), vector_len);
4356 } else {
4357 lea(rscratch1, src);
4358 Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4359 }
4360 }
4361
4362 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4363 int dst_enc = dst->encoding();
4364 int src_enc = src->encoding();
4365 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4366 Assembler::vpbroadcastw(dst, src);
4367 } else if ((dst_enc < 16) && (src_enc < 16)) {
4368 Assembler::vpbroadcastw(dst, src);
4369 } else if (src_enc < 16) {
4370 push_zmm(xmm0);
4371 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4372 Assembler::vpbroadcastw(xmm0, src);
4373 movdqu(dst, xmm0);
4374 pop_zmm(xmm0);
4375 } else if (dst_enc < 16) {
4376 push_zmm(xmm0);
4377 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4378 Assembler::vpbroadcastw(dst, xmm0);
4379 pop_zmm(xmm0);
4380 } else {
4381 push_zmm(xmm0);
4382 push_zmm(xmm1);
4383 movdqu(xmm0, src);
4384 movdqu(xmm1, dst);
4385 Assembler::vpbroadcastw(xmm1, xmm0);
4386 movdqu(dst, xmm1);
4387 pop_zmm(xmm1);
4388 pop_zmm(xmm0);
4389 }
4390 }
4391
4392 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4393 int dst_enc = dst->encoding();
4394 int nds_enc = nds->encoding();
4395 int src_enc = src->encoding();
4396 assert(dst_enc == nds_enc, "");
4397 if ((dst_enc < 16) && (src_enc < 16)) {
4398 Assembler::vpcmpeqb(dst, nds, src, vector_len);
4399 } else if (src_enc < 16) {
4400 push_zmm(xmm0);
4401 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4402 Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4403 movdqu(dst, xmm0);
4404 pop_zmm(xmm0);
4405 } else if (dst_enc < 16) {
4406 push_zmm(xmm0);
4407 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4408 Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4409 pop_zmm(xmm0);
4410 } else {
4411 push_zmm(xmm0);
4412 push_zmm(xmm1);
4413 movdqu(xmm0, src);
4414 movdqu(xmm1, dst);
4415 Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4416 movdqu(dst, xmm1);
4417 pop_zmm(xmm1);
4418 pop_zmm(xmm0);
4419 }
4420 }
4421
4422 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4423 int dst_enc = dst->encoding();
4424 int nds_enc = nds->encoding();
4425 int src_enc = src->encoding();
4426 assert(dst_enc == nds_enc, "");
4427 if ((dst_enc < 16) && (src_enc < 16)) {
4428 Assembler::vpcmpeqw(dst, nds, src, vector_len);
4429 } else if (src_enc < 16) {
4430 push_zmm(xmm0);
4431 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4432 Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4433 movdqu(dst, xmm0);
4434 pop_zmm(xmm0);
4435 } else if (dst_enc < 16) {
4436 push_zmm(xmm0);
4437 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4438 Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4439 pop_zmm(xmm0);
4440 } else {
4441 push_zmm(xmm0);
4442 push_zmm(xmm1);
4443 movdqu(xmm0, src);
4444 movdqu(xmm1, dst);
4445 Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4446 movdqu(dst, xmm1);
4447 pop_zmm(xmm1);
4448 pop_zmm(xmm0);
4449 }
4450 }
4451
4452 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4453 int dst_enc = dst->encoding();
4454 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4455 Assembler::vpmovzxbw(dst, src, vector_len);
4456 } else if (dst_enc < 16) {
4457 Assembler::vpmovzxbw(dst, src, vector_len);
4458 } else {
4459 push_zmm(xmm0);
4460 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4461 Assembler::vpmovzxbw(xmm0, src, vector_len);
4462 movdqu(dst, xmm0);
4463 pop_zmm(xmm0);
4464 }
4465 }
4466
4467 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4468 int src_enc = src->encoding();
4469 if (src_enc < 16) {
4470 Assembler::vpmovmskb(dst, src);
4471 } else {
4472 push_zmm(xmm0);
4473 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4474 Assembler::vpmovmskb(dst, xmm0);
4475 pop_zmm(xmm0);
4476 }
4477 }
4478
4479 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4480 int dst_enc = dst->encoding();
4481 int nds_enc = nds->encoding();
4482 int src_enc = src->encoding();
4483 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4484 Assembler::vpmullw(dst, nds, src, vector_len);
4485 } else if ((dst_enc < 16) && (src_enc < 16)) {
4486 Assembler::vpmullw(dst, dst, src, vector_len);
4487 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4488 // use nds as scratch for src
4489 evmovdqul(nds, src, Assembler::AVX_512bit);
4490 Assembler::vpmullw(dst, dst, nds, vector_len);
4491 } else if ((src_enc < 16) && (nds_enc < 16)) {
4492 // use nds as scratch for dst
4493 evmovdqul(nds, dst, Assembler::AVX_512bit);
4494 Assembler::vpmullw(nds, nds, src, vector_len);
4495 evmovdqul(dst, nds, Assembler::AVX_512bit);
4496 } else if (dst_enc < 16) {
4497 // use nds as scratch to save xmm0; xmm0 holds src
4498 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4499 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4500 Assembler::vpmullw(dst, dst, xmm0, vector_len);
4501 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4502 } else {
4503 // worst case scenario, all regs are in the upper bank
4504 push_zmm(xmm1);
4505 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4506 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4507 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4508 Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4509 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4510 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4511 pop_zmm(xmm1);
4512 }
4513 }
4514
4515 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4516 int dst_enc = dst->encoding();
4517 int nds_enc = nds->encoding();
4518 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4519 Assembler::vpmullw(dst, nds, src, vector_len);
4520 } else if (dst_enc < 16) {
4521 Assembler::vpmullw(dst, dst, src, vector_len);
4522 } else if (nds_enc < 16) {
4523 // implies dst_enc in upper bank with nds as scratch
4524 evmovdqul(nds, dst, Assembler::AVX_512bit);
4525 Assembler::vpmullw(nds, nds, src, vector_len);
4526 evmovdqul(dst, nds, Assembler::AVX_512bit);
4527 } else {
4528 // worst case scenario, all regs in upper bank
4529 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4530 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4531 Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4532 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4533 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4534 }
4535 }
4536
4537 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4538 int dst_enc = dst->encoding();
4539 int nds_enc = nds->encoding();
4540 int src_enc = src->encoding();
4541 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4542 Assembler::vpsubb(dst, nds, src, vector_len);
4543 } else if ((dst_enc < 16) && (src_enc < 16)) {
4544 Assembler::vpsubb(dst, dst, src, vector_len);
4545 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4546 // use nds as scratch for src
4547 evmovdqul(nds, src, Assembler::AVX_512bit);
4548 Assembler::vpsubb(dst, dst, nds, vector_len);
4549 } else if ((src_enc < 16) && (nds_enc < 16)) {
4550 // use nds as scratch for dst
4551 evmovdqul(nds, dst, Assembler::AVX_512bit);
4552 Assembler::vpsubb(nds, nds, src, vector_len);
4553 evmovdqul(dst, nds, Assembler::AVX_512bit);
4554 } else if (dst_enc < 16) {
4555 // use nds as scratch to save xmm0; xmm0 holds src
4556 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4557 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4558 Assembler::vpsubb(dst, dst, xmm0, vector_len);
4559 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4560 } else {
4561 // worst case scenario, all regs are in the upper bank
4562 push_zmm(xmm1);
4563 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4564 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4565 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4566 Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4567 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4568 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4569 pop_zmm(xmm1);
4570 }
4571 }
4572
4573 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4574 int dst_enc = dst->encoding();
4575 int nds_enc = nds->encoding();
4576 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4577 Assembler::vpsubb(dst, nds, src, vector_len);
4578 } else if (dst_enc < 16) {
4579 Assembler::vpsubb(dst, dst, src, vector_len);
4580 } else if (nds_enc < 16) {
4581 // implies dst_enc in upper bank with nds as scratch
4582 evmovdqul(nds, dst, Assembler::AVX_512bit);
4583 Assembler::vpsubb(nds, nds, src, vector_len);
4584 evmovdqul(dst, nds, Assembler::AVX_512bit);
4585 } else {
4586 // worst case scenario, all regs in upper bank
4587 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4588 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4589 Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4590 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4591 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4592 }
4593 }
4594
4595 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4596 int dst_enc = dst->encoding();
4597 int nds_enc = nds->encoding();
4598 int src_enc = src->encoding();
4599 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4600 Assembler::vpsubw(dst, nds, src, vector_len);
4601 } else if ((dst_enc < 16) && (src_enc < 16)) {
4602 Assembler::vpsubw(dst, dst, src, vector_len);
4603 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4604 // use nds as scratch for src
4605 evmovdqul(nds, src, Assembler::AVX_512bit);
4606 Assembler::vpsubw(dst, dst, nds, vector_len);
4607 } else if ((src_enc < 16) && (nds_enc < 16)) {
4608 // use nds as scratch for dst
4609 evmovdqul(nds, dst, Assembler::AVX_512bit);
4610 Assembler::vpsubw(nds, nds, src, vector_len);
4611 evmovdqul(dst, nds, Assembler::AVX_512bit);
4612 } else if (dst_enc < 16) {
4613 // use nds as scratch to save xmm0; xmm0 holds src
4614 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4615 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4616 Assembler::vpsubw(dst, dst, xmm0, vector_len);
4617 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4618 } else {
4619 // worst case scenario, all regs are in the upper bank
4620 push_zmm(xmm1);
4621 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4622 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4623 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4624 Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4625 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4626 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4627 pop_zmm(xmm1);
4628 }
4629 }
4630
4631 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4632 int dst_enc = dst->encoding();
4633 int nds_enc = nds->encoding();
4634 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4635 Assembler::vpsubw(dst, nds, src, vector_len);
4636 } else if (dst_enc < 16) {
4637 Assembler::vpsubw(dst, dst, src, vector_len);
4638 } else if (nds_enc < 16) {
4639 // implies dst_enc in upper bank with nds as scratch
4640 evmovdqul(nds, dst, Assembler::AVX_512bit);
4641 Assembler::vpsubw(nds, nds, src, vector_len);
4642 evmovdqul(dst, nds, Assembler::AVX_512bit);
4643 } else {
4644 // worst case scenario, all regs in upper bank
4645 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4646 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4647 Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4648 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4649 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4650 }
4651 }
4652
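// The variable-shift helpers add one more rung to the ladder: when only
// nds is in the low bank, nds stages dst while dst itself temporarily
// preserves xmm0, so no stack spill is needed.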
4653 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4654 int dst_enc = dst->encoding();
4655 int nds_enc = nds->encoding();
4656 int shift_enc = shift->encoding();
4657 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4658 Assembler::vpsraw(dst, nds, shift, vector_len);
4659 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4660 Assembler::vpsraw(dst, dst, shift, vector_len);
4661 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4662 // use nds as scratch for shift
4663 evmovdqul(nds, shift, Assembler::AVX_512bit);
4664 Assembler::vpsraw(dst, dst, nds, vector_len);
4665 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4666 // use nds as scratch for dst
4667 evmovdqul(nds, dst, Assembler::AVX_512bit);
4668 Assembler::vpsraw(nds, nds, shift, vector_len);
4669 evmovdqul(dst, nds, Assembler::AVX_512bit);
4670 } else if (dst_enc < 16) {
4671 // use nds to save a copy of xmm0; xmm0 holds shift
4672 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4673 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4674 Assembler::vpsraw(dst, dst, xmm0, vector_len);
4675 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4676 } else if (nds_enc < 16) {
4677 // use nds and dst as temps
4678 evmovdqul(nds, dst, Assembler::AVX_512bit);
4679 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4680 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4681 Assembler::vpsraw(nds, nds, xmm0, vector_len);
4682 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4683 evmovdqul(dst, nds, Assembler::AVX_512bit);
4684 } else {
4685 // worst case scenario, all regs are in the upper bank
4686 push_zmm(xmm1);
4687 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4688 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4689 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4690 Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4691 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4692 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4693 pop_zmm(xmm1);
4694 }
4695 }
4696
4697 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4698 int dst_enc = dst->encoding();
4699 int nds_enc = nds->encoding();
4700 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4701 Assembler::vpsraw(dst, nds, shift, vector_len);
4702 } else if (dst_enc < 16) {
4703 Assembler::vpsraw(dst, dst, shift, vector_len);
4704 } else if (nds_enc < 16) {
4705 // use nds as scratch
4706 evmovdqul(nds, dst, Assembler::AVX_512bit);
4707 Assembler::vpsraw(nds, nds, shift, vector_len);
4708 evmovdqul(dst, nds, Assembler::AVX_512bit);
4709 } else {
4710 // use nds as scratch for xmm0
4711 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4712 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4713 Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4714 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4715 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4716 }
4717 }
4718
4719 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4720 int dst_enc = dst->encoding();
4721 int nds_enc = nds->encoding();
4722 int shift_enc = shift->encoding();
4723 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4724 Assembler::vpsrlw(dst, nds, shift, vector_len);
4725 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4726 Assembler::vpsrlw(dst, dst, shift, vector_len);
4727 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4728 // use nds as scratch for shift
4729 evmovdqul(nds, shift, Assembler::AVX_512bit);
4730 Assembler::vpsrlw(dst, dst, nds, vector_len);
4731 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4732 // use nds as scratch for dst
4733 evmovdqul(nds, dst, Assembler::AVX_512bit);
4734 Assembler::vpsrlw(nds, nds, shift, vector_len);
4735 evmovdqul(dst, nds, Assembler::AVX_512bit);
4736 } else if (dst_enc < 16) {
4737 // use nds to save a copy of xmm0; xmm0 holds shift
4738 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4739 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4740 Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4741 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4742 } else if (nds_enc < 16) {
4743 // use nds and dst as temps
4744 evmovdqul(nds, dst, Assembler::AVX_512bit);
4745 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4746 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4747 Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4748 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4749 evmovdqul(dst, nds, Assembler::AVX_512bit);
4750 } else {
4751 // worst case scenario, all regs are in the upper bank
4752 push_zmm(xmm1);
4753 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4754 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4755 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4756 Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4757 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4758 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4759 pop_zmm(xmm1);
4760 }
4761 }
4762
4763 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4764 int dst_enc = dst->encoding();
4765 int nds_enc = nds->encoding();
4766 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4767 Assembler::vpsrlw(dst, nds, shift, vector_len);
4768 } else if (dst_enc < 16) {
4769 Assembler::vpsrlw(dst, dst, shift, vector_len);
4770 } else if (nds_enc < 16) {
4771 // use nds as scratch
4772 evmovdqul(nds, dst, Assembler::AVX_512bit);
4773 Assembler::vpsrlw(nds, nds, shift, vector_len);
4774 evmovdqul(dst, nds, Assembler::AVX_512bit);
4775 } else {
4776 // use nds as scratch for xmm0
4777 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4778 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4779 Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4780 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4781 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4782 }
4783 }
4784
4785 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4786 int dst_enc = dst->encoding();
4787 int nds_enc = nds->encoding();
4788 int shift_enc = shift->encoding();
4789 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4790 Assembler::vpsllw(dst, nds, shift, vector_len);
4791 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4792 Assembler::vpsllw(dst, dst, shift, vector_len);
4793 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4794 // use nds as scratch for shift
4795 evmovdqul(nds, shift, Assembler::AVX_512bit);
4796 Assembler::vpsllw(dst, dst, nds, vector_len);
4797 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4798 // use nds as scratch for dst
4799 evmovdqul(nds, dst, Assembler::AVX_512bit);
4800 Assembler::vpsllw(nds, nds, shift, vector_len);
4801 evmovdqul(dst, nds, Assembler::AVX_512bit);
4802 } else if (dst_enc < 16) {
4803 // use nds to save a copy of xmm0; xmm0 holds shift
4804 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4805 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4806 Assembler::vpsllw(dst, dst, xmm0, vector_len);
4807 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4808 } else if (nds_enc < 16) {
4809 // use nds and dst as temps
4810 evmovdqul(nds, dst, Assembler::AVX_512bit);
4811 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4812 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4813 Assembler::vpsllw(nds, nds, xmm0, vector_len);
4814 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4815 evmovdqul(dst, nds, Assembler::AVX_512bit);
4816 } else {
4817 // worst case scenario, all regs are in the upper bank
4818 push_zmm(xmm1);
4819 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4820 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4821 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4822 Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4823 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4824 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4825 pop_zmm(xmm1);
4826 }
4827 }
4828
4829 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4830 int dst_enc = dst->encoding();
4831 int nds_enc = nds->encoding();
4832 if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4833 Assembler::vpsllw(dst, nds, shift, vector_len);
4834 } else if (dst_enc < 16) {
4835 Assembler::vpsllw(dst, dst, shift, vector_len);
4836 } else if (nds_enc < 16) {
4837 // use nds as scratch
4838 evmovdqul(nds, dst, Assembler::AVX_512bit);
4839 Assembler::vpsllw(nds, nds, shift, vector_len);
4840 evmovdqul(dst, nds, Assembler::AVX_512bit);
4841 } else {
4842 // use nds as scratch for xmm0
4843 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4844 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4845 Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4846 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4847 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4848 }
4849 }
4850
4851 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4852 int dst_enc = dst->encoding();
4853 int src_enc = src->encoding();
4854 if ((dst_enc < 16) && (src_enc < 16)) {
4855 Assembler::vptest(dst, src);
4856 } else if (src_enc < 16) {
4857 push_zmm(xmm0);
4858 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4859 Assembler::vptest(xmm0, src);
4860 pop_zmm(xmm0);
4861 } else if (dst_enc < 16) {
4862 push_zmm(xmm0);
4863 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4864 Assembler::vptest(dst, xmm0);
4865 pop_zmm(xmm0);
4866 } else {
4867 push_zmm(xmm0);
4868 push_zmm(xmm1);
4869 movdqu(xmm0, src);
4870 movdqu(xmm1, dst);
4871 Assembler::vptest(xmm1, xmm0);
4872 pop_zmm(xmm1);
4873 pop_zmm(xmm0);
4874 }
4875 }
4876
4877 // This instruction is emitted from within macros, so we cannot control
4878 // its register inputs when it comes through those patterns.
4879 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4880 if (VM_Version::supports_avx512nobw()) {
4881 int dst_enc = dst->encoding();
4882 int src_enc = src->encoding();
4883 if (dst_enc == src_enc) {
4884 if (dst_enc < 16) {
4885 Assembler::punpcklbw(dst, src);
4886 } else {
4887 push_zmm(xmm0);
4888 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4889 Assembler::punpcklbw(xmm0, xmm0);
4890 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4891 pop_zmm(xmm0);
4892 }
4893 } else {
4894 if ((src_enc < 16) && (dst_enc < 16)) {
4895 Assembler::punpcklbw(dst, src);
4896 } else if (src_enc < 16) {
4897 push_zmm(xmm0);
4898 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4899 Assembler::punpcklbw(xmm0, src);
4900 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4901 pop_zmm(xmm0);
4902 } else if (dst_enc < 16) {
4903 push_zmm(xmm0);
4904 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4905 Assembler::punpcklbw(dst, xmm0);
4906 pop_zmm(xmm0);
4907 } else {
4908 push_zmm(xmm0);
4909 push_zmm(xmm1);
4910 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4911 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4912 Assembler::punpcklbw(xmm0, xmm1);
4913 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4914 pop_zmm(xmm1);
4915 pop_zmm(xmm0);
4916 }
4917 }
4918 } else {
4919 Assembler::punpcklbw(dst, src);
4920 }
4921 }
4922
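// pshufd is a doubleword shuffle, so its EVEX form needs only AVX512VL
// (not BW); with VL every register encoding is legal. Without VL, an
// upper-bank destination is produced in xmm0 first and copied out with a
// 512-bit move.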
4923 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4924 if (VM_Version::supports_avx512vl()) {
4925 Assembler::pshufd(dst, src, mode);
4926 } else {
4927 int dst_enc = dst->encoding();
4928 if (dst_enc < 16) {
4929 Assembler::pshufd(dst, src, mode);
4930 } else {
4931 push_zmm(xmm0);
4932 Assembler::pshufd(xmm0, src, mode);
4933 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4934 pop_zmm(xmm0);
4935 }
4936 }
4937 }
4938
4939 // This instruction is emitted from within macros, so we cannot control
4940 // its register inputs when it comes through those patterns.
4941 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4942 if (VM_Version::supports_avx512nobw()) {
4943 int dst_enc = dst->encoding();
4944 int src_enc = src->encoding();
4945 if (dst_enc == src_enc) {
4946 if (dst_enc < 16) {
4947 Assembler::pshuflw(dst, src, mode);
4948 } else {
4949 push_zmm(xmm0);
4950 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4951 Assembler::pshuflw(xmm0, xmm0, mode);
4952 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4953 pop_zmm(xmm0);
4954 }
4955 } else {
4956 if ((src_enc < 16) && (dst_enc < 16)) {
4957 Assembler::pshuflw(dst, src, mode);
4958 } else if (src_enc < 16) {
4959 push_zmm(xmm0);
4960 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4961 Assembler::pshuflw(xmm0, src, mode);
4962 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4963 pop_zmm(xmm0);
4964 } else if (dst_enc < 16) {
4965 push_zmm(xmm0);
4966 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4967 Assembler::pshuflw(dst, xmm0, mode);
4968 pop_zmm(xmm0);
4969 } else {
4970 push_zmm(xmm0);
4971 push_zmm(xmm1);
4972 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4973 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4974 Assembler::pshuflw(xmm0, xmm1, mode);
4975 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4976 pop_zmm(xmm1);
4977 pop_zmm(xmm0);
4978 }
4979 }
4980 } else {
4981 Assembler::pshuflw(dst, src, mode);
4982 }
4983 }
4984
4985 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4986 if (reachable(src)) {
4987 vandpd(dst, nds, as_Address(src), vector_len);
4988 } else {
4989 lea(rscratch1, src);
4990 vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4991 }
4992 }
4993
4994 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4995 if (reachable(src)) {
4996 vandps(dst, nds, as_Address(src), vector_len);
4997 } else {
4998 lea(rscratch1, src);
4999 vandps(dst, nds, Address(rscratch1, 0), vector_len);
5000 }
5001 }
5002
5038
5039 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5040 if (reachable(src)) {
5041 vsubsd(dst, nds, as_Address(src));
5042 } else {
5043 lea(rscratch1, src);
5044 vsubsd(dst, nds, Address(rscratch1, 0));
5045 }
5046 }
5047
5048 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5049 if (reachable(src)) {
5050 vsubss(dst, nds, as_Address(src));
5051 } else {
5052 lea(rscratch1, src);
5053 vsubss(dst, nds, Address(rscratch1, 0));
5054 }
5055 }
5056
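// vnegatess/vnegatesd flip the sign of a scalar by XORing with the
// constant at src, assumed to be a sign-bit mask (0x80000000 resp.
// 0x8000000000000000). On AVX-512 parts without VL the 128-bit
// vxorps/vxorpd cannot reach xmm16-xmm31, so the value is routed through
// xmm0 or dst first.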
5057 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5058 int nds_enc = nds->encoding();
5059 int dst_enc = dst->encoding();
5060 bool dst_upper_bank = (dst_enc > 15);
5061 bool nds_upper_bank = (nds_enc > 15);
5062 if (VM_Version::supports_avx512novl() &&
5063 (nds_upper_bank || dst_upper_bank)) {
5064 if (dst_upper_bank) {
5065 push_zmm(xmm0);
5066 movflt(xmm0, nds);
5067 vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
5068 movflt(dst, xmm0);
5069 pop_zmm(xmm0);
5070 } else {
5071 movflt(dst, nds);
5072 vxorps(dst, dst, src, Assembler::AVX_128bit);
5073 }
5074 } else {
5075 vxorps(dst, nds, src, Assembler::AVX_128bit);
5076 }
5077 }
5078
5079 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5080 int nds_enc = nds->encoding();
5081 int dst_enc = dst->encoding();
5082 bool dst_upper_bank = (dst_enc > 15);
5083 bool nds_upper_bank = (nds_enc > 15);
5084 if (VM_Version::supports_avx512novl() &&
5085 (nds_upper_bank || dst_upper_bank)) {
5086 if (dst_upper_bank) {
5087 push_zmm(xmm0);
5088 movdbl(xmm0, nds);
5089 vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
5090 movdbl(dst, xmm0);
5091 pop_zmm(xmm0);
5092 } else {
5093 movdbl(dst, nds);
5094 vxorpd(dst, dst, src, Assembler::AVX_128bit);
5095 }
5096 } else {
5097 vxorpd(dst, nds, src, Assembler::AVX_128bit);
5098 }
5099 }
5100
5101 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5102 if (reachable(src)) {
5103 vxorpd(dst, nds, as_Address(src), vector_len);
5104 } else {
5105 lea(rscratch1, src);
5106 vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
5107 }
5108 }
5109
5110 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5111 if (reachable(src)) {
5112 vxorps(dst, nds, as_Address(src), vector_len);
5113 } else {
5114 lea(rscratch1, src);
5115 vxorps(dst, nds, Address(rscratch1, 0), vector_len);
5116 }
5117 }
5118
7228
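// string_indexof_char scans a UTF-16 char[] for a single code unit.
// stride is 8 chars (one 128-bit vector); the AVX2 path below broadcasts
// ch across a 256-bit register and examines 2*stride chars per iteration,
// falling back to 8-char and then scalar scans for the tail.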
7229 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7230 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7231 ShortBranchVerifier sbv(this);
7232 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7233
7234 int stride = 8;
7235
7236 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7237 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7238 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7239 FOUND_SEQ_CHAR, DONE_LABEL;
7240
7241 movptr(result, str1);
7242 if (UseAVX >= 2) {
7243 cmpl(cnt1, stride);
7244 jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7245 cmpl(cnt1, 2*stride);
7246 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
7247 movdl(vec1, ch);
7248 vpbroadcastw(vec1, vec1);
7249 vpxor(vec2, vec2);
7250 movl(tmp, cnt1);
7251 andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
7252 andl(cnt1, 0x0000000F); // tail count (in chars)
7253
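// Main loop: 16 chars (32 bytes) per iteration. vpcmpeqw leaves 0xFFFF in
// every matching lane, and vptest(vec2, vec3) with vec2 == 0 sets the
// carry flag only when vec3 is all zeroes, so carryClear means at least
// one lane matched.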
7254 bind(SCAN_TO_16_CHAR_LOOP);
7255 vmovdqu(vec3, Address(result, 0));
7256 vpcmpeqw(vec3, vec3, vec1, 1);
7257 vptest(vec2, vec3);
7258 jcc(Assembler::carryClear, FOUND_CHAR);
7259 addptr(result, 32);
7260 subl(tmp, 2*stride);
7261 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7262 jmp(SCAN_TO_8_CHAR);
7263 bind(SCAN_TO_8_CHAR_INIT);
7264 movdl(vec1, ch);
7265 pshuflw(vec1, vec1, 0x00);
7266 pshufd(vec1, vec1, 0);
7267 pxor(vec2, vec2);
7268 }
7823 jmp(FALSE_LABEL);
7824
7825 clear_vector_masking(); // closing of the stub context for programming mask registers
7826 } else {
7827 movl(result, len); // copy
7828
7829 if (UseAVX == 2 && UseSSE >= 2) {
7830 // With AVX2, use 32-byte vector compare
7831 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7832
7833 // Compare 32-byte vectors
7834 andl(result, 0x0000001f); // tail count (in bytes)
7835 andl(len, 0xffffffe0); // vector count (in bytes)
7836 jccb(Assembler::zero, COMPARE_TAIL);
7837
7838 lea(ary1, Address(ary1, len, Address::times_1));
7839 negptr(len);
7840
7841 movl(tmp1, 0x80808080); // create mask to test for negative (non-ASCII) bytes in the vector
7842 movdl(vec2, tmp1);
7843 vpbroadcastd(vec2, vec2);
7844
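// vptest computes ZF = ((vec1 & vec2) == 0). With 0x80 replicated in
// every byte of vec2, notZero means some loaded byte has its high bit
// set, i.e. a negative byte was seen and the routine answers true.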
7845 bind(COMPARE_WIDE_VECTORS);
7846 vmovdqu(vec1, Address(ary1, len, Address::times_1));
7847 vptest(vec1, vec2);
7848 jccb(Assembler::notZero, TRUE_LABEL);
7849 addptr(len, 32);
7850 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7851
7852 testl(result, result);
7853 jccb(Assembler::zero, FALSE_LABEL);
7854
7855 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7856 vptest(vec1, vec2);
7857 jccb(Assembler::notZero, TRUE_LABEL);
7858 jmpb(FALSE_LABEL);
7859
7860 bind(COMPARE_TAIL); // len is zero
7861 movl(len, result);
7862 // Fallthru to tail compare
7863 } else if (UseSSE42Intrinsics) {
8253 if (!UseUnalignedLoadStores) {
8254 // align to 8 bytes; we know we are 4-byte aligned to start
8255 testptr(to, 4);
8256 jccb(Assembler::zero, L_fill_32_bytes);
8257 movl(Address(to, 0), value);
8258 addptr(to, 4);
8259 subl(count, 1<<shift);
8260 }
8261 BIND(L_fill_32_bytes);
8262 {
8263 assert( UseSSE >= 2, "supported cpu only" );
8264 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
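// In this vintage of the code base EVEX instructions are emitted with k1
// as their opmask register, so before the 512-bit stores below k1 is
// loaded with 0xffff; enabling all sixteen dword lanes makes the masking
// a no-op.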
8265 if (UseAVX > 2) {
8266 movl(rtmp, 0xffff);
8267 kmovwl(k1, rtmp);
8268 }
8269 movdl(xtmp, value);
8270 if (UseAVX > 2 && UseUnalignedLoadStores) {
8271 // Fill 64-byte chunks
8272 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8273 evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8274
8275 subl(count, 16 << shift);
8276 jcc(Assembler::less, L_check_fill_32_bytes);
8277 align(16);
8278
8279 BIND(L_fill_64_bytes_loop);
8280 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8281 addptr(to, 64);
8282 subl(count, 16 << shift);
8283 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8284
8285 BIND(L_check_fill_32_bytes);
8286 addl(count, 8 << shift);
8287 jccb(Assembler::less, L_check_fill_8_bytes);
8288 vmovdqu(Address(to, 0), xtmp);
8289 addptr(to, 32);
8290 subl(count, 8 << shift);
8291
8292 BIND(L_check_fill_8_bytes);
8293 } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8294 // Fill 64-byte chunks
8295 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8296 vpbroadcastd(xtmp, xtmp);
8297
8298 subl(count, 16 << shift);
8299 jcc(Assembler::less, L_check_fill_32_bytes);
8300 align(16);
8301
8302 BIND(L_fill_64_bytes_loop);
8303 vmovdqu(Address(to, 0), xtmp);
8304 vmovdqu(Address(to, 32), xtmp);
8305 addptr(to, 64);
8306 subl(count, 16 << shift);
8307 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8308
8309 BIND(L_check_fill_32_bytes);
8310 addl(count, 8 << shift);
8311 jccb(Assembler::less, L_check_fill_8_bytes);
8312 vmovdqu(Address(to, 0), xtmp);
8313 addptr(to, 32);
8314 subl(count, 8 << shift);
8315
8316 BIND(L_check_fill_8_bytes);
8417 xorl(result, result);
8418 // check for zero length
8419 testl(len, len);
8420 jcc(Assembler::zero, L_done);
8421
8422 movl(result, len);
8423
8424 // Setup pointers
8425 lea(src, Address(src, len, Address::times_2)); // char[]
8426 lea(dst, Address(dst, len, Address::times_1)); // byte[]
8427 negptr(len);
8428
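// Classic end-pointer idiom: src and dst now point one past the last
// element and len is negative, so the loops below address
// Address(src, len, ...) and count len up toward zero; the sign of len
// doubles as the termination test.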
8429 if (UseSSE42Intrinsics || UseAVX >= 2) {
8430 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8431 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8432
8433 if (UseAVX >= 2) {
8434 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8435 movl(tmp5, 0xff00ff00); // create mask to test for chars above 0xFF (non-Latin-1) in the vector
8436 movdl(tmp1Reg, tmp5);
8437 vpbroadcastd(tmp1Reg, tmp1Reg);
8438 jmp(L_chars_32_check);
8439
8440 bind(L_copy_32_chars);
8441 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8442 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8443 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8444 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8445 jccb(Assembler::notZero, L_copy_32_chars_exit);
8446 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8447 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8448 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8449
8450 bind(L_chars_32_check);
8451 addptr(len, 32);
8452 jcc(Assembler::lessEqual, L_copy_32_chars);
8453
8454 bind(L_copy_32_chars_exit);
8455 subptr(len, 16);
8456 jccb(Assembler::greater, L_copy_16_chars_exit);
8457
// ===================== Revised version of the same code =====================
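// In the revision, the register-shuffling workarounds above are replaced
// by hard asserts: operands must stay in xmm0-xmm15 unless the CPU offers
// the matching AVX-512 subset (VL for the plain moves, VL+BW for the
// byte/word operations, VL+DQ for the vandps/vandpd behind
// vabsss/vabssd). pcmpestri, pmovmskb and ptest have no EVEX form, so
// they always require the low bank.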
3106 }
3107
3108 void MacroAssembler::load_double(Address src) {
3109 if (UseSSE >= 2) {
3110 movdbl(xmm0, src);
3111 } else {
3112 LP64_ONLY(ShouldNotReachHere());
3113 NOT_LP64(fld_d(src));
3114 }
3115 }
3116
3117 void MacroAssembler::store_double(Address dst) {
3118 if (UseSSE >= 2) {
3119 movdbl(dst, xmm0);
3120 } else {
3121 LP64_ONLY(ShouldNotReachHere());
3122 NOT_LP64(fstp_d(dst));
3123 }
3124 }
3125
3126 void MacroAssembler::fremr(Register tmp) {
3127 save_rax(tmp);
3128 { Label L;
3129 bind(L);
3130 fprem();
3131 fwait(); fnstsw_ax();
3132 #ifdef _LP64
3133 testl(rax, 0x400);
3134 jcc(Assembler::notEqual, L);
3135 #else
3136 sahf();
3137 jcc(Assembler::parity, L);
3138 #endif // _LP64
3139 }
3140 restore_rax(tmp);
3141 // Result is in ST0.
3142 // Note: fxch & fpop to get rid of ST1
3143 // (otherwise FPU stack could overflow eventually)
3144 fxch(1);
3145 fpop();
3486 }
3487
3488 void MacroAssembler::movptr(Register dst, Register src) {
3489 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3490 }
3491
3492 void MacroAssembler::movptr(Register dst, Address src) {
3493 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3494 }
3495
3496 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3497 void MacroAssembler::movptr(Register dst, intptr_t src) {
3498 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3499 }
3500
3501 void MacroAssembler::movptr(Address dst, Register src) {
3502 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3503 }
3504
3505 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3506 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3507 Assembler::movdqu(dst, src);
3508 }
3509
3510 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3511 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3512 Assembler::movdqu(dst, src);
3513 }
3514
3515 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3516 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3517 Assembler::movdqu(dst, src);
3518 }
3519
3520 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3521 if (reachable(src)) {
3522 movdqu(dst, as_Address(src));
3523 } else {
3524 lea(scratchReg, src);
3525 movdqu(dst, Address(scratchReg, 0));
3526 }
3527 }
3528
3529 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3530 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3531 Assembler::vmovdqu(dst, src);
3532 }
3533
3534 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3535 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3536 Assembler::vmovdqu(dst, src);
3537 }
3538
3539 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3540 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3541 Assembler::vmovdqu(dst, src);
3542 }
3543
3544 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3545 if (reachable(src)) {
3546 vmovdqu(dst, as_Address(src));
3547 }
3548 else {
3549 lea(rscratch1, src);
3550 vmovdqu(dst, Address(rscratch1, 0));
3551 }
3552 }
3553
3554 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3555 if (reachable(src)) {
3556 Assembler::evmovdquq(dst, as_Address(src), vector_len);
3557 } else {
3558 lea(rscratch, src);
3559 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3560 }
3561 }
3805 shll(reg, 24);
3806 sarl(reg, 24);
3807 }
3808 }
3809
3810 void MacroAssembler::sign_extend_short(Register reg) {
3811 if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3812 movswl(reg, reg); // movsxw
3813 } else {
3814 shll(reg, 16);
3815 sarl(reg, 16);
3816 }
3817 }
3818
3819 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3820 assert(reachable(src), "Address should be reachable");
3821 testl(dst, as_Address(src));
3822 }
3823
3824 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3825 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3826 Assembler::pcmpeqb(dst, src);
3827 }
3828
3829 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3830 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3831 Assembler::pcmpeqw(dst, src);
3832 }
3833
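// pcmpestri, pmovmskb and (v)ptest have no EVEX form at all, so the upper
// register bank is unusable with them no matter which AVX-512 features are
// present; their asserts therefore offer no feature-check escape.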
3834 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3835 assert((dst->encoding() < 16),"XMM register should be 0-15");
3836 Assembler::pcmpestri(dst, src, imm8);
3837 }
3838
3839 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3840 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3841 Assembler::pcmpestri(dst, src, imm8);
3842 }
3843
3844 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3845 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3846 Assembler::pmovzxbw(dst, src);
3847 }
3848
3849 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3850 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3851 Assembler::pmovzxbw(dst, src);
3852 }
3853
3854 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3855 assert((src->encoding() < 16),"XMM register should be 0-15");
3856 Assembler::pmovmskb(dst, src);
3857 }
3858
3859 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3860 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3861 Assembler::ptest(dst, src);
3862 }
3863
3864 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3865 if (reachable(src)) {
3866 Assembler::sqrtsd(dst, as_Address(src));
3867 } else {
3868 lea(rscratch1, src);
3869 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3870 }
3871 }
3872
3873 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3874 if (reachable(src)) {
3875 Assembler::sqrtss(dst, as_Address(src));
3876 } else {
3877 lea(rscratch1, src);
3878 Assembler::sqrtss(dst, Address(rscratch1, 0));
3879 }
3880 }
3881
3970
3971 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3972 if (reachable(src)) {
3973 vaddsd(dst, nds, as_Address(src));
3974 } else {
3975 lea(rscratch1, src);
3976 vaddsd(dst, nds, Address(rscratch1, 0));
3977 }
3978 }
3979
3980 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3981 if (reachable(src)) {
3982 vaddss(dst, nds, as_Address(src));
3983 } else {
3984 lea(rscratch1, src);
3985 vaddss(dst, nds, Address(rscratch1, 0));
3986 }
3987 }
3988
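// Float absolute value is a bitwise trick: AND with a constant whose sign bit
// is clear (and all other bits set) forces the sign positive. negate_field is
// expected to point at such a mask; the src parameter is unused here.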
3989 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3990 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3991 vandps(dst, nds, negate_field, vector_len);
3992 }
3993
3994 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3995 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3996 vandpd(dst, nds, negate_field, vector_len);
3997 }
3998
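// Byte- and word-granular vector instructions belong to AVX-512BW, so the
// helpers below require avx512vlbw (VL + BW) rather than plain VL before
// xmm16-xmm31 may be named.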
3999 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4000 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4001 Assembler::vpaddb(dst, nds, src, vector_len);
4002 }
4003
4004 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4005 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4006 Assembler::vpaddb(dst, nds, src, vector_len);
4007 }
4008
4009 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4010 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4011 Assembler::vpaddw(dst, nds, src, vector_len);
4012 }
4013
4014 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4015 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4016 Assembler::vpaddw(dst, nds, src, vector_len);
4017 }
4018
4019 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4020 if (reachable(src)) {
4021 Assembler::vpand(dst, nds, as_Address(src), vector_len);
4022 } else {
4023 lea(rscratch1, src);
4024 Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4025 }
4026 }
4027
4028 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
4029 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4030 Assembler::vpbroadcastw(dst, src, vector_len);
4031 }
4032
4033 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4034 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4035 Assembler::vpcmpeqb(dst, nds, src, vector_len);
4036 }
4037
4038 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4039 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4040 Assembler::vpcmpeqw(dst, nds, src, vector_len);
4041 }
4042
4043 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4044 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4045 Assembler::vpmovzxbw(dst, src, vector_len);
4046 }
4047
4048 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4049 assert((src->encoding() < 16),"XMM register should be 0-15");
4050 Assembler::vpmovmskb(dst, src);
4051 }
4052
4053 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4054 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4055 Assembler::vpmullw(dst, nds, src, vector_len);
4056 }
4057
4058 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4059 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4060 Assembler::vpmullw(dst, nds, src, vector_len);
4061 }
4062
4063 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4064 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4065 Assembler::vpsubb(dst, nds, src, vector_len);
4066 }
4067
4068 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4069 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4070 Assembler::vpsubb(dst, nds, src, vector_len);
4071 }
4072
4073 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4074 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4075 Assembler::vpsubw(dst, nds, src, vector_len);
4076 }
4077
4078 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4079 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4080 Assembler::vpsubw(dst, nds, src, vector_len);
4081 }
4082
4083 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4084 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4085 Assembler::vpsraw(dst, nds, shift, vector_len);
4086 }
4087
4088 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4089 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4090 Assembler::vpsraw(dst, nds, shift, vector_len);
4091 }
4092
4093 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4094 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4095 Assembler::vpsrlw(dst, nds, shift, vector_len);
4096 }
4097
4098 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4099 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4100 Assembler::vpsrlw(dst, nds, shift, vector_len);
4101 }
4102
4103 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4104 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4105 Assembler::vpsllw(dst, nds, shift, vector_len);
4106 }
4107
4108 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4109 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4110 Assembler::vpsllw(dst, nds, shift, vector_len);
4111 }
4112
4113 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4114 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
4115 Assembler::vptest(dst, src);
4116 }
4117
4118 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4119 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4120 Assembler::punpcklbw(dst, src);
4121 }
4122
4123 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4124 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
4125 Assembler::pshufd(dst, src, mode);
4126 }
4127
4128 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4129 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4130 Assembler::pshuflw(dst, src, mode);
4131 }
4132
4133 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4134 if (reachable(src)) {
4135 vandpd(dst, nds, as_Address(src), vector_len);
4136 } else {
4137 lea(rscratch1, src);
4138 vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4139 }
4140 }
4141
4142 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4143 if (reachable(src)) {
4144 vandps(dst, nds, as_Address(src), vector_len);
4145 } else {
4146 lea(rscratch1, src);
4147 vandps(dst, nds, Address(rscratch1, 0), vector_len);
4148 }
4149 }
4150
4186
4187 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4188 if (reachable(src)) {
4189 vsubsd(dst, nds, as_Address(src));
4190 } else {
4191 lea(rscratch1, src);
4192 vsubsd(dst, nds, Address(rscratch1, 0));
4193 }
4194 }
4195
4196 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4197 if (reachable(src)) {
4198 vsubss(dst, nds, as_Address(src));
4199 } else {
4200 lea(rscratch1, src);
4201 vsubss(dst, nds, Address(rscratch1, 0));
4202 }
4203 }
4204
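// Negation is likewise a sign-bit trick: XOR with a mask that has only the
// sign bit set flips the sign and leaves the magnitude bits untouched. src is
// expected to point at that constant.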
4205 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4206 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4207 vxorps(dst, nds, src, Assembler::AVX_128bit);
4208 }
4209
4210 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4211 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4212 vxorpd(dst, nds, src, Assembler::AVX_128bit);
4213 }
4214
4215 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4216 if (reachable(src)) {
4217 vxorpd(dst, nds, as_Address(src), vector_len);
4218 } else {
4219 lea(rscratch1, src);
4220 vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4221 }
4222 }
4223
4224 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4225 if (reachable(src)) {
4226 vxorps(dst, nds, as_Address(src), vector_len);
4227 } else {
4228 lea(rscratch1, src);
4229 vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4230 }
4231 }
4232
6342
6343 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6344 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6345 ShortBranchVerifier sbv(this);
6346 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6347
6348 int stride = 8;
6349
6350 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6351 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6352 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6353 FOUND_SEQ_CHAR, DONE_LABEL;
6354
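  // Strategy: broadcast the 16-bit search char across a vector, then scan
  // 16 chars per iteration with AVX2, fall back to 8 chars with SSE, and
  // finish the tail one char at a time.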
6355 movptr(result, str1);
6356 if (UseAVX >= 2) {
6357 cmpl(cnt1, stride);
6358 jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6359 cmpl(cnt1, 2*stride);
6360 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6361 movdl(vec1, ch);
6362 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6363 vpxor(vec2, vec2);
6364 movl(tmp, cnt1);
6365       andl(tmp, 0xFFFFFFF0); // vector count (in chars)
6366       andl(cnt1, 0x0000000F); // tail count (in chars)
6367
6368 bind(SCAN_TO_16_CHAR_LOOP);
6369 vmovdqu(vec3, Address(result, 0));
6370 vpcmpeqw(vec3, vec3, vec1, 1);
6371 vptest(vec2, vec3);
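      // vec2 is all-zero, so vptest sets CF iff vec3 is all-zero; a clear
      // carry therefore means at least one word lane compared equal.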
6372 jcc(Assembler::carryClear, FOUND_CHAR);
6373 addptr(result, 32);
6374 subl(tmp, 2*stride);
6375 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6376 jmp(SCAN_TO_8_CHAR);
6377 bind(SCAN_TO_8_CHAR_INIT);
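    // No vpbroadcastw here: pshuflw(0x00) replicates word 0 across the low
    // quadword and pshufd(0) replicates dword 0 across the register, leaving
    // the search char in all eight word lanes.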
6378 movdl(vec1, ch);
6379 pshuflw(vec1, vec1, 0x00);
6380 pshufd(vec1, vec1, 0);
6381 pxor(vec2, vec2);
6382 }
6937 jmp(FALSE_LABEL);
6938
6939     clear_vector_masking(); // close the stub context used for programming mask registers
6940 } else {
6941 movl(result, len); // copy
6942
6943 if (UseAVX == 2 && UseSSE >= 2) {
6944 // With AVX2, use 32-byte vector compare
6945 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6946
6947 // Compare 32-byte vectors
6948 andl(result, 0x0000001f); // tail count (in bytes)
6949 andl(len, 0xffffffe0); // vector count (in bytes)
6950 jccb(Assembler::zero, COMPARE_TAIL);
6951
6952 lea(ary1, Address(ary1, len, Address::times_1));
6953 negptr(len);
6954
6955       movl(tmp1, 0x80808080); // mask selecting the sign bit of every byte
6956 movdl(vec2, tmp1);
6957 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
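      // vec2 now holds 0x80 in every byte; vptest(vec1, vec2) sets ZF only if
      // no loaded byte has its top bit set, i.e. all bytes are non-negative.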
6958
6959 bind(COMPARE_WIDE_VECTORS);
6960 vmovdqu(vec1, Address(ary1, len, Address::times_1));
6961 vptest(vec1, vec2);
6962 jccb(Assembler::notZero, TRUE_LABEL);
6963 addptr(len, 32);
6964 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6965
6966 testl(result, result);
6967 jccb(Assembler::zero, FALSE_LABEL);
6968
6969 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6970 vptest(vec1, vec2);
6971 jccb(Assembler::notZero, TRUE_LABEL);
6972 jmpb(FALSE_LABEL);
6973
6974 bind(COMPARE_TAIL); // len is zero
6975 movl(len, result);
6976 // Fallthru to tail compare
6977 } else if (UseSSE42Intrinsics) {
7367 if (!UseUnalignedLoadStores) {
7368     // align to 8 bytes; we know we are 4-byte aligned to start
7369 testptr(to, 4);
7370 jccb(Assembler::zero, L_fill_32_bytes);
7371 movl(Address(to, 0), value);
7372 addptr(to, 4);
7373 subl(count, 1<<shift);
7374 }
7375 BIND(L_fill_32_bytes);
7376 {
7377       assert(UseSSE >= 2, "supported cpu only");
7378 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7379 if (UseAVX > 2) {
7380 movl(rtmp, 0xffff);
7381 kmovwl(k1, rtmp);
7382 }
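    // k1 appears to serve as the implicit all-ones mask for the EVEX stores
    // below; 0xffff enables all 16 dword lanes of a 512-bit register.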
7383 movdl(xtmp, value);
7384 if (UseAVX > 2 && UseUnalignedLoadStores) {
7385 // Fill 64-byte chunks
7386 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7387 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7388
7389 subl(count, 16 << shift);
7390 jcc(Assembler::less, L_check_fill_32_bytes);
7391 align(16);
7392
7393 BIND(L_fill_64_bytes_loop);
7394 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7395 addptr(to, 64);
7396 subl(count, 16 << shift);
7397 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7398
7399 BIND(L_check_fill_32_bytes);
7400 addl(count, 8 << shift);
7401 jccb(Assembler::less, L_check_fill_8_bytes);
7402 vmovdqu(Address(to, 0), xtmp);
7403 addptr(to, 32);
7404 subl(count, 8 << shift);
7405
7406 BIND(L_check_fill_8_bytes);
7407 } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7408 // Fill 64-byte chunks
7409 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7410 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7411
7412 subl(count, 16 << shift);
7413 jcc(Assembler::less, L_check_fill_32_bytes);
7414 align(16);
7415
7416 BIND(L_fill_64_bytes_loop);
7417 vmovdqu(Address(to, 0), xtmp);
7418 vmovdqu(Address(to, 32), xtmp);
7419 addptr(to, 64);
7420 subl(count, 16 << shift);
7421 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7422
7423 BIND(L_check_fill_32_bytes);
7424 addl(count, 8 << shift);
7425 jccb(Assembler::less, L_check_fill_8_bytes);
7426 vmovdqu(Address(to, 0), xtmp);
7427 addptr(to, 32);
7428 subl(count, 8 << shift);
7429
7430 BIND(L_check_fill_8_bytes);
7531 xorl(result, result);
7532 // check for zero length
7533 testl(len, len);
7534 jcc(Assembler::zero, L_done);
7535
7536 movl(result, len);
7537
7538 // Setup pointers
7539 lea(src, Address(src, len, Address::times_2)); // char[]
7540 lea(dst, Address(dst, len, Address::times_1)); // byte[]
7541 negptr(len);
7542
7543 if (UseSSE42Intrinsics || UseAVX >= 2) {
7544 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7545 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7546
7547 if (UseAVX >= 2) {
7548 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7549 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7550 movdl(tmp1Reg, tmp5);
7551 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7552 jmp(L_chars_32_check);
7553
7554 bind(L_copy_32_chars);
7555 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7556 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7557 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7558 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7559 jccb(Assembler::notZero, L_copy_32_chars_exit);
7560 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7561 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
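      // vpackuswb packs within each 128-bit lane, so the packed qwords come
      // out in 0,2,1,3 order; vpermq with 0xD8 (11 01 10 00b) restores them.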
7562 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7563
7564 bind(L_chars_32_check);
7565 addptr(len, 32);
7566 jcc(Assembler::lessEqual, L_copy_32_chars);
7567
7568 bind(L_copy_32_chars_exit);
7569 subptr(len, 16);
7570 jccb(Assembler::greater, L_copy_16_chars_exit);
7571