src/hotspot/cpu/x86/macroAssembler_x86.cpp


3106 }
3107 
3108 void MacroAssembler::load_double(Address src) {
3109   if (UseSSE >= 2) {
3110     movdbl(xmm0, src);
3111   } else {
3112     LP64_ONLY(ShouldNotReachHere());
3113     NOT_LP64(fld_d(src));
3114   }
3115 }
3116 
3117 void MacroAssembler::store_double(Address dst) {
3118   if (UseSSE >= 2) {
3119     movdbl(dst, xmm0);
3120   } else {
3121     LP64_ONLY(ShouldNotReachHere());
3122     NOT_LP64(fstp_d(dst));
3123   }
3124 }
3125 
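// push_zmm/pop_zmm spill and reload a full 64-byte ZMM register through the
// stack. The helpers below use them to free xmm0 (and xmm1) as scratch
// registers whenever an operand lives in the upper bank (xmm16-xmm31) and the
// instruction being emitted cannot encode it directly.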
3126 void MacroAssembler::push_zmm(XMMRegister reg) {
3127   lea(rsp, Address(rsp, -64)); // Use lea to not affect flags
3128   evmovdqul(Address(rsp, 0), reg, Assembler::AVX_512bit);
3129 }
3130 
3131 void MacroAssembler::pop_zmm(XMMRegister reg) {
3132   evmovdqul(reg, Address(rsp, 0), Assembler::AVX_512bit);
3133   lea(rsp, Address(rsp, 64)); // Use lea to not affect flags
3134 }
3135 
3136 void MacroAssembler::fremr(Register tmp) {
3137   save_rax(tmp);
3138   { Label L;
3139     bind(L);
3140     fprem();
3141     fwait(); fnstsw_ax();
3142 #ifdef _LP64
3143     testl(rax, 0x400);
3144     jcc(Assembler::notEqual, L);
3145 #else
3146     sahf();
3147     jcc(Assembler::parity, L);
3148 #endif // _LP64
3149   }
3150   restore_rax(tmp);
3151   // Result is in ST0.
3152   // Note: fxch & fpop to get rid of ST1
3153   // (otherwise FPU stack could overflow eventually)
3154   fxch(1);
3155   fpop();


3496 }
3497 
3498 void MacroAssembler::movptr(Register dst, Register src) {
3499   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3500 }
3501 
3502 void MacroAssembler::movptr(Register dst, Address src) {
3503   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3504 }
3505 
3506 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3507 void MacroAssembler::movptr(Register dst, intptr_t src) {
3508   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3509 }
3510 
3511 void MacroAssembler::movptr(Address dst, Register src) {
3512   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3513 }
3514 
3515 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3516   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3517     Assembler::vextractf32x4(dst, src, 0);
3518   } else {
3519     Assembler::movdqu(dst, src);
3520   }
3521 }
3522 
3523 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3524   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3525     Assembler::vinsertf32x4(dst, dst, src, 0);
3526   } else {
3527     Assembler::movdqu(dst, src);
3528   }
3529 }
3530 
3531 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3532   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3533     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3534   } else {
3535     Assembler::movdqu(dst, src);
3536   }
3537 }
3538 
3539 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3540   if (reachable(src)) {
3541     movdqu(dst, as_Address(src));
3542   } else {
3543     lea(scratchReg, src);
3544     movdqu(dst, Address(scratchReg, 0));
3545   }
3546 }
3547 
3548 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3549   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3550     vextractf64x4_low(dst, src);
3551   } else {
3552     Assembler::vmovdqu(dst, src);
3553   }
3554 }
3555 
3556 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3557   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3558     vinsertf64x4_low(dst, src);
3559   } else {
3560     Assembler::vmovdqu(dst, src);
3561   }
3562 }
3563 
3564 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3565   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3566     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3567   }
3568   else {
3569     Assembler::vmovdqu(dst, src);
3570   }
3571 }
3572 
3573 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3574   if (reachable(src)) {
3575     vmovdqu(dst, as_Address(src));
3576   }
3577   else {
3578     lea(rscratch1, src);
3579     vmovdqu(dst, Address(rscratch1, 0));
3580   }
3581 }
3582 
3583 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3584   if (reachable(src)) {
3585     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3586   } else {
3587     lea(rscratch, src);
3588     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3589   }
3590 }


3834     shll(reg, 24);
3835     sarl(reg, 24);
3836   }
3837 }
3838 
3839 void MacroAssembler::sign_extend_short(Register reg) {
3840   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3841     movswl(reg, reg); // movsxw
3842   } else {
3843     shll(reg, 16);
3844     sarl(reg, 16);
3845   }
3846 }
3847 
3848 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3849   assert(reachable(src), "Address should be reachable");
3850   testl(dst, as_Address(src));
3851 }
3852 
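// The SSE helpers below share one pattern: on AVX-512 hardware without
// AVX512BW/VL, the 128-bit forms used here cannot address xmm16-xmm31, so any
// upper-bank operand is staged through xmm0/xmm1 (preserved with
// push_zmm/pop_zmm) and the result is copied back into dst when needed.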
3853 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3854   int dst_enc = dst->encoding();
3855   int src_enc = src->encoding();
3856   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3857     Assembler::pcmpeqb(dst, src);
3858   } else if ((dst_enc < 16) && (src_enc < 16)) {
3859     Assembler::pcmpeqb(dst, src);
3860   } else if (src_enc < 16) {
3861     push_zmm(xmm0);
3862     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3863     Assembler::pcmpeqb(xmm0, src);
3864     movdqu(dst, xmm0);
3865     pop_zmm(xmm0);
3866   } else if (dst_enc < 16) {
3867     push_zmm(xmm0);
3868     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3869     Assembler::pcmpeqb(dst, xmm0);
3870     pop_zmm(xmm0);
3871   } else {
3872     push_zmm(xmm0);
3873     push_zmm(xmm1);
3874     movdqu(xmm0, src);
3875     movdqu(xmm1, dst);
3876     Assembler::pcmpeqb(xmm1, xmm0);
3877     movdqu(dst, xmm1);
3878     pop_zmm(xmm1);
3879     pop_zmm(xmm0);
3880   }
3881 }
3882 
3883 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3884   int dst_enc = dst->encoding();
3885   int src_enc = src->encoding();
3886   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3887     Assembler::pcmpeqw(dst, src);
3888   } else if ((dst_enc < 16) && (src_enc < 16)) {
3889     Assembler::pcmpeqw(dst, src);
3890   } else if (src_enc < 16) {
3891     push_zmm(xmm0);
3892     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3893     Assembler::pcmpeqw(xmm0, src);
3894     movdqu(dst, xmm0);
3895     pop_zmm(xmm0);
3896   } else if (dst_enc < 16) {
3897     push_zmm(xmm0);
3898     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3899     Assembler::pcmpeqw(dst, xmm0);
3900     pop_zmm(xmm0);
3901   } else {
3902     push_zmm(xmm0);
3903     push_zmm(xmm1);
3904     movdqu(xmm0, src);
3905     movdqu(xmm1, dst);
3906     Assembler::pcmpeqw(xmm1, xmm0);
3907     movdqu(dst, xmm1);
3908     pop_zmm(xmm1);
3909     pop_zmm(xmm0);
3910   }
3911 }
3912 
3913 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3914   int dst_enc = dst->encoding();
3915   if (dst_enc < 16) {
3916     Assembler::pcmpestri(dst, src, imm8);
3917   } else {
3918     push_zmm(xmm0);
3919     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3920     Assembler::pcmpestri(xmm0, src, imm8);
3921     movdqu(dst, xmm0);
3922     pop_zmm(xmm0);
3923   }
3924 }
3925 
3926 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3927   int dst_enc = dst->encoding();
3928   int src_enc = src->encoding();
3929   if ((dst_enc < 16) && (src_enc < 16)) {
3930     Assembler::pcmpestri(dst, src, imm8);
3931   } else if (src_enc < 16) {
3932     push_zmm(xmm0);
3933     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3934     Assembler::pcmpestri(xmm0, src, imm8);
3935     movdqu(dst, xmm0);
3936     pop_zmm(xmm0);
3937   } else if (dst_enc < 16) {
3938     push_zmm(xmm0);
3939     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3940     Assembler::pcmpestri(dst, xmm0, imm8);
3941     pop_zmm(xmm0);
3942   } else {
3943     push_zmm(xmm0);
3944     push_zmm(xmm1);
3945     movdqu(xmm0, src);
3946     movdqu(xmm1, dst);
3947     Assembler::pcmpestri(xmm1, xmm0, imm8);
3948     movdqu(dst, xmm1);
3949     pop_zmm(xmm1);
3950     pop_zmm(xmm0);
3951   }
3952 }
3953 
3954 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3955   int dst_enc = dst->encoding();
3956   int src_enc = src->encoding();
3957   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3958     Assembler::pmovzxbw(dst, src);
3959   } else if ((dst_enc < 16) && (src_enc < 16)) {
3960     Assembler::pmovzxbw(dst, src);
3961   } else if (src_enc < 16) {
3962     push_zmm(xmm0);
3963     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3964     Assembler::pmovzxbw(xmm0, src);
3965     movdqu(dst, xmm0);
3966     pop_zmm(xmm0);
3967   } else if (dst_enc < 16) {
3968     push_zmm(xmm0);
3969     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3970     Assembler::pmovzxbw(dst, xmm0);
3971     pop_zmm(xmm0);
3972   } else {
3973     push_zmm(xmm0);
3974     push_zmm(xmm1);
3975     movdqu(xmm0, src);
3976     movdqu(xmm1, dst);
3977     Assembler::pmovzxbw(xmm1, xmm0);
3978     movdqu(dst, xmm1);
3979     pop_zmm(xmm1);
3980     pop_zmm(xmm0);
3981   }
3982 }
3983 
3984 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3985   int dst_enc = dst->encoding();
3986   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3987     Assembler::pmovzxbw(dst, src);
3988   } else if (dst_enc < 16) {
3989     Assembler::pmovzxbw(dst, src);
3990   } else {
3991     push_zmm(xmm0);
3992     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3993     Assembler::pmovzxbw(xmm0, src);
3994     movdqu(dst, xmm0);
3995     pop_zmm(xmm0);
3996   }
3997 }
3998 
3999 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
4000   int src_enc = src->encoding();
4001   if (src_enc < 16) {
4002     Assembler::pmovmskb(dst, src);
4003   } else {
4004     push_zmm(xmm0);
4005     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4006     Assembler::pmovmskb(dst, xmm0);
4007     pop_zmm(xmm0);
4008   }
4009 }
4010 
4011 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
4012   int dst_enc = dst->encoding();
4013   int src_enc = src->encoding();
4014   if ((dst_enc < 16) && (src_enc < 16)) {
4015     Assembler::ptest(dst, src);
4016   } else if (src_enc < 16) {
4017     push_zmm(xmm0);
4018     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4019     Assembler::ptest(xmm0, src);
4020     pop_zmm(xmm0);
4021   } else if (dst_enc < 16) {
4022     push_zmm(xmm0);
4023     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4024     Assembler::ptest(dst, xmm0);
4025     pop_zmm(xmm0);
4026   } else {
4027     push_zmm(xmm0);
4028     push_zmm(xmm1);
4029     movdqu(xmm0, src);
4030     movdqu(xmm1, dst);
4031     Assembler::ptest(xmm1, xmm0);
4032     pop_zmm(xmm1);
4033     pop_zmm(xmm0);
4034   }
4035 }
4036 
4037 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
4038   if (reachable(src)) {
4039     Assembler::sqrtsd(dst, as_Address(src));
4040   } else {
4041     lea(rscratch1, src);
4042     Assembler::sqrtsd(dst, Address(rscratch1, 0));
4043   }
4044 }
4045 
4046 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
4047   if (reachable(src)) {
4048     Assembler::sqrtss(dst, as_Address(src));
4049   } else {
4050     lea(rscratch1, src);
4051     Assembler::sqrtss(dst, Address(rscratch1, 0));
4052   }
4053 }
4054 


4143 
4144 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4145   if (reachable(src)) {
4146     vaddsd(dst, nds, as_Address(src));
4147   } else {
4148     lea(rscratch1, src);
4149     vaddsd(dst, nds, Address(rscratch1, 0));
4150   }
4151 }
4152 
4153 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4154   if (reachable(src)) {
4155     vaddss(dst, nds, as_Address(src));
4156   } else {
4157     lea(rscratch1, src);
4158     vaddss(dst, nds, Address(rscratch1, 0));
4159   }
4160 }
4161 
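// vabsss/vabssd AND nds with the mask at negate_field and leave the result in
// dst. The case analysis below only routes operands into registers below xmm16
// so the vandps/vandpd used here can encode them; note that nds or src may be
// clobbered as scratch in the process.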
4162 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4163   int dst_enc = dst->encoding();
4164   int nds_enc = nds->encoding();
4165   int src_enc = src->encoding();
4166   if ((dst_enc < 16) && (nds_enc < 16)) {
4167     vandps(dst, nds, negate_field, vector_len);
4168   } else if ((src_enc < 16) && (dst_enc < 16)) {
4169     // Use src scratch register
4170     evmovdqul(src, nds, Assembler::AVX_512bit);
4171     vandps(dst, src, negate_field, vector_len);
4172   } else if (dst_enc < 16) {
4173     evmovdqul(dst, nds, Assembler::AVX_512bit);
4174     vandps(dst, dst, negate_field, vector_len);
4175   } else if (nds_enc < 16) {
4176     vandps(nds, nds, negate_field, vector_len);
4177     evmovdqul(dst, nds, Assembler::AVX_512bit);
4178   } else if (src_enc < 16) {
4179     evmovdqul(src, nds, Assembler::AVX_512bit);
4180     vandps(src, src, negate_field, vector_len);
4181     evmovdqul(dst, src, Assembler::AVX_512bit);
4182   } else {
4183     if (src_enc != dst_enc) {
4184       // Use src scratch register
4185       evmovdqul(src, xmm0, Assembler::AVX_512bit);
4186       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4187       vandps(xmm0, xmm0, negate_field, vector_len);
4188       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4189       evmovdqul(xmm0, src, Assembler::AVX_512bit);
4190     } else {
4191       push_zmm(xmm0);
4192       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4193       vandps(xmm0, xmm0, negate_field, vector_len);
4194       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4195       pop_zmm(xmm0);
4196     }
4197   }
4198 }
4199 
4200 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4201   int dst_enc = dst->encoding();
4202   int nds_enc = nds->encoding();
4203   int src_enc = src->encoding();
4204   if ((dst_enc < 16) && (nds_enc < 16)) {
4205     vandpd(dst, nds, negate_field, vector_len);
4206   } else if ((src_enc < 16) && (dst_enc < 16)) {
4207     // Use src scratch register
4208     evmovdqul(src, nds, Assembler::AVX_512bit);
4209     vandpd(dst, src, negate_field, vector_len);
4210   } else if (dst_enc < 16) {
4211     evmovdqul(dst, nds, Assembler::AVX_512bit);
4212     vandpd(dst, dst, negate_field, vector_len);
4213   } else if (nds_enc < 16) {
4214     vandpd(nds, nds, negate_field, vector_len);
4215     evmovdqul(dst, nds, Assembler::AVX_512bit);
4216   } else if (src_enc < 16) {
4217     evmovdqul(src, nds, Assembler::AVX_512bit);
4218     vandpd(src, src, negate_field, vector_len);
4219     evmovdqul(dst, src, Assembler::AVX_512bit);
4220   } else {
4221     if (src_enc != dst_enc) {
4222       evmovdqul(src, xmm0, Assembler::AVX_512bit);
4223       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4224       vandpd(xmm0, xmm0, negate_field, vector_len);
4225       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4226       evmovdqul(xmm0, src, Assembler::AVX_512bit);
4227     } else {
4228       push_zmm(xmm0);
4229       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4230       vandpd(xmm0, xmm0, negate_field, vector_len);
4231       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4232       pop_zmm(xmm0);
4233     }
4234   }
4235 }
4236 
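// For the packed add/multiply/subtract helpers below, the three-operand form
// is emitted directly when the CPU is AVX-only (no AVX-512) or has AVX512BW.
// Otherwise the operation is performed destructively as dst = dst OP src, with
// nds (and, in the worst case, xmm0/xmm1) treated as scratch and possibly
// clobbered.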
4237 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4238   int dst_enc = dst->encoding();
4239   int nds_enc = nds->encoding();
4240   int src_enc = src->encoding();
4241   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4242     Assembler::vpaddb(dst, nds, src, vector_len);
4243   } else if ((dst_enc < 16) && (src_enc < 16)) {
4244     Assembler::vpaddb(dst, dst, src, vector_len);
4245   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4246     // use nds as scratch for src
4247     evmovdqul(nds, src, Assembler::AVX_512bit);
4248     Assembler::vpaddb(dst, dst, nds, vector_len);
4249   } else if ((src_enc < 16) && (nds_enc < 16)) {
4250     // use nds as scratch for dst
4251     evmovdqul(nds, dst, Assembler::AVX_512bit);
4252     Assembler::vpaddb(nds, nds, src, vector_len);
4253     evmovdqul(dst, nds, Assembler::AVX_512bit);
4254   } else if (dst_enc < 16) {
4255     // use nds to save xmm0 so xmm0 can hold src
4256     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4257     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4258     Assembler::vpaddb(dst, dst, xmm0, vector_len);
4259     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4260   } else {
4261     // worst case scenario, all regs are in the upper bank
4262     push_zmm(xmm1);
4263     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4264     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4265     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4266     Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4267     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4268     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4269     pop_zmm(xmm1);
4270   }
4271 }
4272 
4273 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4274   int dst_enc = dst->encoding();
4275   int nds_enc = nds->encoding();
4276   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4277     Assembler::vpaddb(dst, nds, src, vector_len);
4278   } else if (dst_enc < 16) {
4279     Assembler::vpaddb(dst, dst, src, vector_len);
4280   } else if (nds_enc < 16) {
4281     // implies dst_enc in upper bank with nds as scratch
4282     evmovdqul(nds, dst, Assembler::AVX_512bit);
4283     Assembler::vpaddb(nds, nds, src, vector_len);
4284     evmovdqul(dst, nds, Assembler::AVX_512bit);
4285   } else {
4286     // worst case scenario, all regs in upper bank
4287     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4288     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4289     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4290     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4291     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4292   }
4293 }
4294 
4295 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4296   int dst_enc = dst->encoding();
4297   int nds_enc = nds->encoding();
4298   int src_enc = src->encoding();
4299   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4300     Assembler::vpaddw(dst, nds, src, vector_len);
4301   } else if ((dst_enc < 16) && (src_enc < 16)) {
4302     Assembler::vpaddw(dst, dst, src, vector_len);
4303   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4304     // use nds as scratch for src
4305     evmovdqul(nds, src, Assembler::AVX_512bit);
4306     Assembler::vpaddw(dst, dst, nds, vector_len);
4307   } else if ((src_enc < 16) && (nds_enc < 16)) {
4308     // use nds as scratch for dst
4309     evmovdqul(nds, dst, Assembler::AVX_512bit);
4310     Assembler::vpaddw(nds, nds, src, vector_len);
4311     evmovdqul(dst, nds, Assembler::AVX_512bit);
4312   } else if (dst_enc < 16) {
4313     // use nds to save xmm0 so xmm0 can hold src
4314     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4315     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4316     Assembler::vpaddw(dst, dst, xmm0, vector_len);
4317     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4318   } else {
4319     // worst case scenario, all regs are in the upper bank
4320     push_zmm(xmm1);
4321     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4322     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4323     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4324     Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4325     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4326     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4327     pop_zmm(xmm1);
4328   }
4329 }
4330 
4331 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4332   int dst_enc = dst->encoding();
4333   int nds_enc = nds->encoding();
4334   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4335     Assembler::vpaddw(dst, nds, src, vector_len);
4336   } else if (dst_enc < 16) {
4337     Assembler::vpaddw(dst, dst, src, vector_len);
4338   } else if (nds_enc < 16) {
4339     // implies dst_enc in upper bank with nds as scratch
4340     evmovdqul(nds, dst, Assembler::AVX_512bit);
4341     Assembler::vpaddw(nds, nds, src, vector_len);
4342     evmovdqul(dst, nds, Assembler::AVX_512bit);
4343   } else {
4344     // worst case scenario, all regs in upper bank
4345     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4346     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4347     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4348     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4349     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4350   }
4351 }
4352 
4353 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4354   if (reachable(src)) {
4355     Assembler::vpand(dst, nds, as_Address(src), vector_len);
4356   } else {
4357     lea(rscratch1, src);
4358     Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4359   }
4360 }
4361 
4362 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4363   int dst_enc = dst->encoding();
4364   int src_enc = src->encoding();
4365   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4366     Assembler::vpbroadcastw(dst, src);
4367   } else if ((dst_enc < 16) && (src_enc < 16)) {
4368     Assembler::vpbroadcastw(dst, src);
4369   } else if (src_enc < 16) {
4370     push_zmm(xmm0);
4371     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4372     Assembler::vpbroadcastw(xmm0, src);
4373     movdqu(dst, xmm0);
4374     pop_zmm(xmm0);
4375   } else if (dst_enc < 16) {
4376     push_zmm(xmm0);
4377     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4378     Assembler::vpbroadcastw(dst, xmm0);
4379     pop_zmm(xmm0);
4380   } else {
4381     push_zmm(xmm0);
4382     push_zmm(xmm1);
4383     movdqu(xmm0, src);
4384     movdqu(xmm1, dst);
4385     Assembler::vpbroadcastw(xmm1, xmm0);
4386     movdqu(dst, xmm1);
4387     pop_zmm(xmm1);
4388     pop_zmm(xmm0);
4389   }
4390 }
4391 
4392 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4393   int dst_enc = dst->encoding();
4394   int nds_enc = nds->encoding();
4395   int src_enc = src->encoding();
4396   assert(dst_enc == nds_enc, "");
4397   if ((dst_enc < 16) && (src_enc < 16)) {
4398     Assembler::vpcmpeqb(dst, nds, src, vector_len);
4399   } else if (src_enc < 16) {
4400     push_zmm(xmm0);
4401     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4402     Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4403     movdqu(dst, xmm0);
4404     pop_zmm(xmm0);
4405   } else if (dst_enc < 16) {
4406     push_zmm(xmm0);
4407     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4408     Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4409     pop_zmm(xmm0);
4410   } else {
4411     push_zmm(xmm0);
4412     push_zmm(xmm1);
4413     movdqu(xmm0, src);
4414     movdqu(xmm1, dst);
4415     Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4416     movdqu(dst, xmm1);
4417     pop_zmm(xmm1);
4418     pop_zmm(xmm0);
4419   }
4420 }
4421 
4422 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4423   int dst_enc = dst->encoding();
4424   int nds_enc = nds->encoding();
4425   int src_enc = src->encoding();
4426   assert(dst_enc == nds_enc, "");
4427   if ((dst_enc < 16) && (src_enc < 16)) {
4428     Assembler::vpcmpeqw(dst, nds, src, vector_len);
4429   } else if (src_enc < 16) {
4430     push_zmm(xmm0);
4431     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4432     Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4433     movdqu(dst, xmm0);
4434     pop_zmm(xmm0);
4435   } else if (dst_enc < 16) {
4436     push_zmm(xmm0);
4437     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4438     Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4439     pop_zmm(xmm0);
4440   } else {
4441     push_zmm(xmm0);
4442     push_zmm(xmm1);
4443     movdqu(xmm0, src);
4444     movdqu(xmm1, dst);
4445     Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4446     movdqu(dst, xmm1);
4447     pop_zmm(xmm1);
4448     pop_zmm(xmm0);
4449   }
4450 }
4451 
4452 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4453   int dst_enc = dst->encoding();
4454   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4455     Assembler::vpmovzxbw(dst, src, vector_len);
4456   } else if (dst_enc < 16) {
4457     Assembler::vpmovzxbw(dst, src, vector_len);
4458   } else {
4459     push_zmm(xmm0);
4460     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4461     Assembler::vpmovzxbw(xmm0, src, vector_len);
4462     movdqu(dst, xmm0);
4463     pop_zmm(xmm0);
4464   }
4465 }
4466 
4467 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4468   int src_enc = src->encoding();
4469   if (src_enc < 16) {
4470     Assembler::vpmovmskb(dst, src);
4471   } else {
4472     push_zmm(xmm0);
4473     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4474     Assembler::vpmovmskb(dst, xmm0);
4475     pop_zmm(xmm0);
4476   }
4477 }
4478 
4479 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4480   int dst_enc = dst->encoding();
4481   int nds_enc = nds->encoding();
4482   int src_enc = src->encoding();
4483   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4484     Assembler::vpmullw(dst, nds, src, vector_len);
4485   } else if ((dst_enc < 16) && (src_enc < 16)) {
4486     Assembler::vpmullw(dst, dst, src, vector_len);
4487   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4488     // use nds as scratch for src
4489     evmovdqul(nds, src, Assembler::AVX_512bit);
4490     Assembler::vpmullw(dst, dst, nds, vector_len);
4491   } else if ((src_enc < 16) && (nds_enc < 16)) {
4492     // use nds as scratch for dst
4493     evmovdqul(nds, dst, Assembler::AVX_512bit);
4494     Assembler::vpmullw(nds, nds, src, vector_len);
4495     evmovdqul(dst, nds, Assembler::AVX_512bit);
4496   } else if (dst_enc < 16) {
4497     // use nds to save xmm0 so xmm0 can hold src
4498     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4499     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4500     Assembler::vpmullw(dst, dst, xmm0, vector_len);
4501     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4502   } else {
4503     // worst case scenario, all regs are in the upper bank
4504     push_zmm(xmm1);
4505     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4506     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4507     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4508     Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4509     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4510     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4511     pop_zmm(xmm1);
4512   }
4513 }
4514 
4515 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4516   int dst_enc = dst->encoding();
4517   int nds_enc = nds->encoding();
4518   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4519     Assembler::vpmullw(dst, nds, src, vector_len);
4520   } else if (dst_enc < 16) {
4521     Assembler::vpmullw(dst, dst, src, vector_len);
4522   } else if (nds_enc < 16) {
4523     // implies dst_enc in upper bank with nds as scratch
4524     evmovdqul(nds, dst, Assembler::AVX_512bit);
4525     Assembler::vpmullw(nds, nds, src, vector_len);
4526     evmovdqul(dst, nds, Assembler::AVX_512bit);
4527   } else {
4528     // worst case scenario, all regs in upper bank
4529     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4530     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4531     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4532     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4533     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4534   }
4535 }
4536 
4537 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4538   int dst_enc = dst->encoding();
4539   int nds_enc = nds->encoding();
4540   int src_enc = src->encoding();
4541   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4542     Assembler::vpsubb(dst, nds, src, vector_len);
4543   } else if ((dst_enc < 16) && (src_enc < 16)) {
4544     Assembler::vpsubb(dst, dst, src, vector_len);
4545   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4546     // use nds as scratch for src
4547     evmovdqul(nds, src, Assembler::AVX_512bit);
4548     Assembler::vpsubb(dst, dst, nds, vector_len);
4549   } else if ((src_enc < 16) && (nds_enc < 16)) {
4550     // use nds as scratch for dst
4551     evmovdqul(nds, dst, Assembler::AVX_512bit);
4552     Assembler::vpsubb(nds, nds, src, vector_len);
4553     evmovdqul(dst, nds, Assembler::AVX_512bit);
4554   } else if (dst_enc < 16) {
4555     // use nds to save xmm0 so xmm0 can hold src
4556     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4557     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4558     Assembler::vpsubb(dst, dst, xmm0, vector_len);
4559     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4560   } else {
4561     // worst case scenario, all regs are in the upper bank
4562     push_zmm(xmm1);
4563     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4564     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4565     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4566     Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4567     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4568     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4569     pop_zmm(xmm1);
4570   }
4571 }
4572 
4573 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4574   int dst_enc = dst->encoding();
4575   int nds_enc = nds->encoding();
4576   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4577     Assembler::vpsubb(dst, nds, src, vector_len);
4578   } else if (dst_enc < 16) {
4579     Assembler::vpsubb(dst, dst, src, vector_len);
4580   } else if (nds_enc < 16) {
4581     // implies dst_enc in upper bank with nds as scratch
4582     evmovdqul(nds, dst, Assembler::AVX_512bit);
4583     Assembler::vpsubb(nds, nds, src, vector_len);
4584     evmovdqul(dst, nds, Assembler::AVX_512bit);
4585   } else {
4586     // worst case scenario, all regs in upper bank
4587     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4588     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4589     Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4590     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4591     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4592   }
4593 }
4594 
4595 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4596   int dst_enc = dst->encoding();
4597   int nds_enc = nds->encoding();
4598   int src_enc = src->encoding();
4599   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4600     Assembler::vpsubw(dst, nds, src, vector_len);
4601   } else if ((dst_enc < 16) && (src_enc < 16)) {
4602     Assembler::vpsubw(dst, dst, src, vector_len);
4603   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4604     // use nds as scratch for src
4605     evmovdqul(nds, src, Assembler::AVX_512bit);
4606     Assembler::vpsubw(dst, dst, nds, vector_len);
4607   } else if ((src_enc < 16) && (nds_enc < 16)) {
4608     // use nds as scratch for dst
4609     evmovdqul(nds, dst, Assembler::AVX_512bit);
4610     Assembler::vpsubw(nds, nds, src, vector_len);
4611     evmovdqul(dst, nds, Assembler::AVX_512bit);
4612   } else if (dst_enc < 16) {
4613     // use nds to save xmm0 so xmm0 can hold src
4614     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4615     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4616     Assembler::vpsubw(dst, dst, xmm0, vector_len);
4617     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4618   } else {
4619     // worst case scenario, all regs are in the upper bank
4620     push_zmm(xmm1);
4621     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4622     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4623     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4624     Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4625     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4626     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4627     pop_zmm(xmm1);
4628   }
4629 }
4630 
4631 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4632   int dst_enc = dst->encoding();
4633   int nds_enc = nds->encoding();
4634   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4635     Assembler::vpsubw(dst, nds, src, vector_len);
4636   } else if (dst_enc < 16) {
4637     Assembler::vpsubw(dst, dst, src, vector_len);
4638   } else if (nds_enc < 16) {
4639     // implies dst_enc in upper bank with nds as scratch
4640     evmovdqul(nds, dst, Assembler::AVX_512bit);
4641     Assembler::vpsubw(nds, nds, src, vector_len);
4642     evmovdqul(dst, nds, Assembler::AVX_512bit);
4643   } else {
4644     // worst case scenario, all regs in upper bank
4645     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4646     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4647     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4648     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4649     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4650   }
4651 }
4652 
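// The packed shift helpers (vpsraw/vpsrlw/vpsllw) follow the same scheme: on
// AVX-512 hardware without AVX512BW the operation falls back to shifting dst
// in place, using nds, dst, or xmm0/xmm1 as scratch so that every operand
// stays in an encodable register.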
4653 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4654   int dst_enc = dst->encoding();
4655   int nds_enc = nds->encoding();
4656   int shift_enc = shift->encoding();
4657   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4658     Assembler::vpsraw(dst, nds, shift, vector_len);
4659   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4660     Assembler::vpsraw(dst, dst, shift, vector_len);
4661   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4662     // use nds as scratch with shift
4663     evmovdqul(nds, shift, Assembler::AVX_512bit);
4664     Assembler::vpsraw(dst, dst, nds, vector_len);
4665   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4666     // use nds as scratch with dst
4667     evmovdqul(nds, dst, Assembler::AVX_512bit);
4668     Assembler::vpsraw(nds, nds, shift, vector_len);
4669     evmovdqul(dst, nds, Assembler::AVX_512bit);
4670   } else if (dst_enc < 16) {
4671     // use nds to save a copy of xmm0 and hold shift
4672     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4673     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4674     Assembler::vpsraw(dst, dst, xmm0, vector_len);
4675     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4676   } else if (nds_enc < 16) {
4677     // use nds and dst as temps
4678     evmovdqul(nds, dst, Assembler::AVX_512bit);
4679     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4680     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4681     Assembler::vpsraw(nds, nds, xmm0, vector_len);
4682     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4683     evmovdqul(dst, nds, Assembler::AVX_512bit);
4684   } else {
4685     // worst case scenario, all regs are in the upper bank
4686     push_zmm(xmm1);
4687     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4688     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4689     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4690     Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4691     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4692     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4693     pop_zmm(xmm1);
4694   }
4695 }
4696 
4697 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4698   int dst_enc = dst->encoding();
4699   int nds_enc = nds->encoding();
4700   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4701     Assembler::vpsraw(dst, nds, shift, vector_len);
4702   } else if (dst_enc < 16) {
4703     Assembler::vpsraw(dst, dst, shift, vector_len);
4704   } else if (nds_enc < 16) {
4705     // use nds as scratch
4706     evmovdqul(nds, dst, Assembler::AVX_512bit);
4707     Assembler::vpsraw(nds, nds, shift, vector_len);
4708     evmovdqul(dst, nds, Assembler::AVX_512bit);
4709   } else {
4710     // use nds as scratch for xmm0
4711     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4712     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4713     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4714     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4715     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4716   }
4717 }
4718 
4719 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4720   int dst_enc = dst->encoding();
4721   int nds_enc = nds->encoding();
4722   int shift_enc = shift->encoding();
4723   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4724     Assembler::vpsrlw(dst, nds, shift, vector_len);
4725   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4726     Assembler::vpsrlw(dst, dst, shift, vector_len);
4727   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4728     // use nds as scratch with shift
4729     evmovdqul(nds, shift, Assembler::AVX_512bit);
4730     Assembler::vpsrlw(dst, dst, nds, vector_len);
4731   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4732     // use nds as scratch with dst
4733     evmovdqul(nds, dst, Assembler::AVX_512bit);
4734     Assembler::vpsrlw(nds, nds, shift, vector_len);
4735     evmovdqul(dst, nds, Assembler::AVX_512bit);
4736   } else if (dst_enc < 16) {
4737     // use nds to save a copy of xmm0 and hold shift
4738     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4739     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4740     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4741     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4742   } else if (nds_enc < 16) {
4743     // use nds and dst as temps
4744     evmovdqul(nds, dst, Assembler::AVX_512bit);
4745     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4746     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4747     Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4748     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4749     evmovdqul(dst, nds, Assembler::AVX_512bit);
4750   } else {
4751     // worst case scenario, all regs are in the upper bank
4752     push_zmm(xmm1);
4753     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4754     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4755     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4756     Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4757     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4758     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4759     pop_zmm(xmm1);
4760   }
4761 }
4762 
4763 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4764   int dst_enc = dst->encoding();
4765   int nds_enc = nds->encoding();
4766   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4767     Assembler::vpsrlw(dst, nds, shift, vector_len);
4768   } else if (dst_enc < 16) {
4769     Assembler::vpsrlw(dst, dst, shift, vector_len);
4770   } else if (nds_enc < 16) {
4771     // use nds as scratch
4772     evmovdqul(nds, dst, Assembler::AVX_512bit);
4773     Assembler::vpsrlw(nds, nds, shift, vector_len);
4774     evmovdqul(dst, nds, Assembler::AVX_512bit);
4775   } else {
4776     // use nds as scratch for xmm0
4777     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4778     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4779     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4780     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4781     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4782   }
4783 }
4784 
4785 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4786   int dst_enc = dst->encoding();
4787   int nds_enc = nds->encoding();
4788   int shift_enc = shift->encoding();
4789   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4790     Assembler::vpsllw(dst, nds, shift, vector_len);
4791   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4792     Assembler::vpsllw(dst, dst, shift, vector_len);
4793   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4794     // use nds as scratch with shift
4795     evmovdqul(nds, shift, Assembler::AVX_512bit);
4796     Assembler::vpsllw(dst, dst, nds, vector_len);
4797   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4798     // use nds as scratch with dst
4799     evmovdqul(nds, dst, Assembler::AVX_512bit);
4800     Assembler::vpsllw(nds, nds, shift, vector_len);
4801     evmovdqul(dst, nds, Assembler::AVX_512bit);
4802   } else if (dst_enc < 16) {
4803     // use nds to save a copy of xmm0 and hold shift
4804     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4805     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4806     Assembler::vpsllw(dst, dst, xmm0, vector_len);
4807     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4808   } else if (nds_enc < 16) {
4809     // use nds and dst as temps
4810     evmovdqul(nds, dst, Assembler::AVX_512bit);
4811     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4812     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4813     Assembler::vpsllw(nds, nds, xmm0, vector_len);
4814     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4815     evmovdqul(dst, nds, Assembler::AVX_512bit);
4816   } else {
4817     // worst case scenario, all regs are in the upper bank
4818     push_zmm(xmm1);
4819     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4820     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4821     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4822     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4823     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4824     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4825     pop_zmm(xmm1);
4826   }
4827 }
4828 
4829 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4830   int dst_enc = dst->encoding();
4831   int nds_enc = nds->encoding();
4832   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4833     Assembler::vpsllw(dst, nds, shift, vector_len);
4834   } else if (dst_enc < 16) {
4835     Assembler::vpsllw(dst, dst, shift, vector_len);
4836   } else if (nds_enc < 16) {
4837     // use nds as scratch
4838     evmovdqul(nds, dst, Assembler::AVX_512bit);
4839     Assembler::vpsllw(nds, nds, shift, vector_len);
4840     evmovdqul(dst, nds, Assembler::AVX_512bit);
4841   } else {
4842     // use nds as scratch for xmm0
4843     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4844     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4845     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4846     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4847     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4848   }
4849 }
4850 
4851 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4852   int dst_enc = dst->encoding();
4853   int src_enc = src->encoding();
4854   if ((dst_enc < 16) && (src_enc < 16)) {
4855     Assembler::vptest(dst, src);
4856   } else if (src_enc < 16) {
4857     push_zmm(xmm0);
4858     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4859     Assembler::vptest(xmm0, src);
4860     pop_zmm(xmm0);
4861   } else if (dst_enc < 16) {
4862     push_zmm(xmm0);
4863     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4864     Assembler::vptest(dst, xmm0);
4865     pop_zmm(xmm0);
4866   } else {
4867     push_zmm(xmm0);
4868     push_zmm(xmm1);
4869     movdqu(xmm0, src);
4870     movdqu(xmm1, dst);
4871     Assembler::vptest(xmm1, xmm0);
4872     pop_zmm(xmm1);
4873     pop_zmm(xmm0);
4874   }
4875 }
4876 
4877 // This instruction exists within macros, ergo we cannot control its input
4878 // when emitted through those patterns.
4879 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4880   if (VM_Version::supports_avx512nobw()) {
4881     int dst_enc = dst->encoding();
4882     int src_enc = src->encoding();
4883     if (dst_enc == src_enc) {
4884       if (dst_enc < 16) {
4885         Assembler::punpcklbw(dst, src);
4886       } else {
4887         push_zmm(xmm0);
4888         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4889         Assembler::punpcklbw(xmm0, xmm0);
4890         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4891         pop_zmm(xmm0);
4892       }
4893     } else {
4894       if ((src_enc < 16) && (dst_enc < 16)) {
4895         Assembler::punpcklbw(dst, src);
4896       } else if (src_enc < 16) {
4897         push_zmm(xmm0);
4898         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4899         Assembler::punpcklbw(xmm0, src);
4900         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4901         pop_zmm(xmm0);
4902       } else if (dst_enc < 16) {
4903         push_zmm(xmm0);
4904         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4905         Assembler::punpcklbw(dst, xmm0);
4906         pop_zmm(xmm0);
4907       } else {
4908         push_zmm(xmm0);
4909         push_zmm(xmm1);
4910         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4911         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4912         Assembler::punpcklbw(xmm0, xmm1);
4913         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4914         pop_zmm(xmm1);
4915         pop_zmm(xmm0);
4916       }
4917     }
4918   } else {
4919     Assembler::punpcklbw(dst, src);
4920   }
4921 }
4922 
4923 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4924   if (VM_Version::supports_avx512vl()) {
4925     Assembler::pshufd(dst, src, mode);
4926   } else {
4927     int dst_enc = dst->encoding();
4928     if (dst_enc < 16) {
4929       Assembler::pshufd(dst, src, mode);
4930     } else {
4931       push_zmm(xmm0);
4932       Assembler::pshufd(xmm0, src, mode);
4933       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4934       pop_zmm(xmm0);
4935     }
4936   }
4937 }
4938 
4939 // This instruction exists within macros, ergo we cannot control its input
4940 // when emitted through those patterns.
4941 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4942   if (VM_Version::supports_avx512nobw()) {
4943     int dst_enc = dst->encoding();
4944     int src_enc = src->encoding();
4945     if (dst_enc == src_enc) {
4946       if (dst_enc < 16) {
4947         Assembler::pshuflw(dst, src, mode);
4948       } else {
4949         push_zmm(xmm0);
4950         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4951         Assembler::pshuflw(xmm0, xmm0, mode);
4952         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4953         pop_zmm(xmm0);
4954       }
4955     } else {
4956       if ((src_enc < 16) && (dst_enc < 16)) {
4957         Assembler::pshuflw(dst, src, mode);
4958       } else if (src_enc < 16) {
4959         push_zmm(xmm0);
4960         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4961         Assembler::pshuflw(xmm0, src, mode);
4962         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4963         pop_zmm(xmm0);
4964       } else if (dst_enc < 16) {
4965         push_zmm(xmm0);
4966         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4967         Assembler::pshuflw(dst, xmm0, mode);
4968         pop_zmm(xmm0);
4969       } else {
4970         push_zmm(xmm0);
4971         push_zmm(xmm1);
4972         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4973         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4974         Assembler::pshuflw(xmm0, xmm1, mode);
4975         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4976         pop_zmm(xmm1);
4977         pop_zmm(xmm0);
4978       }
4979     }
4980   } else {
4981     Assembler::pshuflw(dst, src, mode);
4982   }
4983 }
4984 
4985 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4986   if (reachable(src)) {
4987     vandpd(dst, nds, as_Address(src), vector_len);
4988   } else {
4989     lea(rscratch1, src);
4990     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4991   }
4992 }
4993 
4994 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4995   if (reachable(src)) {
4996     vandps(dst, nds, as_Address(src), vector_len);
4997   } else {
4998     lea(rscratch1, src);
4999     vandps(dst, nds, Address(rscratch1, 0), vector_len);
5000   }
5001 }
5002 


5038 
5039 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5040   if (reachable(src)) {
5041     vsubsd(dst, nds, as_Address(src));
5042   } else {
5043     lea(rscratch1, src);
5044     vsubsd(dst, nds, Address(rscratch1, 0));
5045   }
5046 }
5047 
5048 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5049   if (reachable(src)) {
5050     vsubss(dst, nds, as_Address(src));
5051   } else {
5052     lea(rscratch1, src);
5053     vsubss(dst, nds, Address(rscratch1, 0));
5054   }
5055 }
5056 
5057 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5058   int nds_enc = nds->encoding();
5059   int dst_enc = dst->encoding();
5060   bool dst_upper_bank = (dst_enc > 15);
5061   bool nds_upper_bank = (nds_enc > 15);
5062   if (VM_Version::supports_avx512novl() &&
5063       (nds_upper_bank || dst_upper_bank)) {
5064     if (dst_upper_bank) {
5065       push_zmm(xmm0);
5066       movflt(xmm0, nds);
5067       vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
5068       movflt(dst, xmm0);
5069       pop_zmm(xmm0);
5070     } else {
5071       movflt(dst, nds);
5072       vxorps(dst, dst, src, Assembler::AVX_128bit);
5073     }
5074   } else {
5075     vxorps(dst, nds, src, Assembler::AVX_128bit);
5076   }
5077 }
5078 
5079 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5080   int nds_enc = nds->encoding();
5081   int dst_enc = dst->encoding();
5082   bool dst_upper_bank = (dst_enc > 15);
5083   bool nds_upper_bank = (nds_enc > 15);
5084   if (VM_Version::supports_avx512novl() &&
5085       (nds_upper_bank || dst_upper_bank)) {
5086     if (dst_upper_bank) {
5087       push_zmm(xmm0);
5088       movdbl(xmm0, nds);
5089       vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
5090       movdbl(dst, xmm0);
5091       pop_zmm(xmm0);
5092     } else {
5093       movdbl(dst, nds);
5094       vxorpd(dst, dst, src, Assembler::AVX_128bit);
5095     }
5096   } else {
5097     vxorpd(dst, nds, src, Assembler::AVX_128bit);
5098   }
5099 }
5100 
5101 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5102   if (reachable(src)) {
5103     vxorpd(dst, nds, as_Address(src), vector_len);
5104   } else {
5105     lea(rscratch1, src);
5106     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
5107   }
5108 }
5109 
5110 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5111   if (reachable(src)) {
5112     vxorps(dst, nds, as_Address(src), vector_len);
5113   } else {
5114     lea(rscratch1, src);
5115     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
5116   }
5117 }
5118 


7228 
7229 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7230                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7231   ShortBranchVerifier sbv(this);
7232   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7233 
7234   int stride = 8;
7235 
7236   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7237         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7238         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7239         FOUND_SEQ_CHAR, DONE_LABEL;
7240 
7241   movptr(result, str1);
7242   if (UseAVX >= 2) {
7243     cmpl(cnt1, stride);
7244     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7245     cmpl(cnt1, 2*stride);
7246     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
7247     movdl(vec1, ch);
7248     vpbroadcastw(vec1, vec1);
7249     vpxor(vec2, vec2);
7250     movl(tmp, cnt1);
7251     andl(tmp, 0xFFFFFFF0);   // vector count (in chars)
7252     andl(cnt1, 0x0000000F);  // tail count (in chars)
7253 
7254     bind(SCAN_TO_16_CHAR_LOOP);
7255     vmovdqu(vec3, Address(result, 0));
7256     vpcmpeqw(vec3, vec3, vec1, 1);
7257     vptest(vec2, vec3);
7258     jcc(Assembler::carryClear, FOUND_CHAR);
7259     addptr(result, 32);
7260     subl(tmp, 2*stride);
7261     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7262     jmp(SCAN_TO_8_CHAR);
7263     bind(SCAN_TO_8_CHAR_INIT);
7264     movdl(vec1, ch);
7265     pshuflw(vec1, vec1, 0x00);
7266     pshufd(vec1, vec1, 0);
7267     pxor(vec2, vec2);
7268   }


7823     jmp(FALSE_LABEL);
7824 
7825     clear_vector_masking();   // closing of the stub context for programming mask registers
7826   } else {
7827     movl(result, len); // copy
7828 
7829     if (UseAVX == 2 && UseSSE >= 2) {
7830       // With AVX2, use 32-byte vector compare
7831       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7832 
7833       // Compare 32-byte vectors
7834       andl(result, 0x0000001f);  //   tail count (in bytes)
7835       andl(len, 0xffffffe0);   // vector count (in bytes)
7836       jccb(Assembler::zero, COMPARE_TAIL);
7837 
7838       lea(ary1, Address(ary1, len, Address::times_1));
7839       negptr(len);
7840 
7841       movl(tmp1, 0x80808080);   // create mask to test for high-bit-set (non-ASCII) bytes in vector
7842       movdl(vec2, tmp1);
7843       vpbroadcastd(vec2, vec2);
7844 
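      // vptest ANDs each 32-byte chunk against the 0x80808080 mask; any byte
      // with its high bit set sends control to TRUE_LABEL.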
7845       bind(COMPARE_WIDE_VECTORS);
7846       vmovdqu(vec1, Address(ary1, len, Address::times_1));
7847       vptest(vec1, vec2);
7848       jccb(Assembler::notZero, TRUE_LABEL);
7849       addptr(len, 32);
7850       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7851 
7852       testl(result, result);
7853       jccb(Assembler::zero, FALSE_LABEL);
7854 
7855       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7856       vptest(vec1, vec2);
7857       jccb(Assembler::notZero, TRUE_LABEL);
7858       jmpb(FALSE_LABEL);
7859 
7860       bind(COMPARE_TAIL); // len is zero
7861       movl(len, result);
7862       // Fallthru to tail compare
7863     } else if (UseSSE42Intrinsics) {


8253     if (!UseUnalignedLoadStores) {
8254       // align to 8 bytes, we know we are 4 byte aligned to start
8255       testptr(to, 4);
8256       jccb(Assembler::zero, L_fill_32_bytes);
8257       movl(Address(to, 0), value);
8258       addptr(to, 4);
8259       subl(count, 1<<shift);
8260     }
8261     BIND(L_fill_32_bytes);
8262     {
8263       assert( UseSSE >= 2, "supported cpu only" );
8264       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
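      // Reload k1 with 0xffff (all lanes enabled) so the EVEX-encoded 512-bit
      // stores in the fill loop below operate unmasked.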
8265       if (UseAVX > 2) {
8266         movl(rtmp, 0xffff);
8267         kmovwl(k1, rtmp);
8268       }
8269       movdl(xtmp, value);
8270       if (UseAVX > 2 && UseUnalignedLoadStores) {
8271         // Fill 64-byte chunks
8272         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8273         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8274 
8275         subl(count, 16 << shift);
8276         jcc(Assembler::less, L_check_fill_32_bytes);
8277         align(16);
8278 
8279         BIND(L_fill_64_bytes_loop);
8280         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8281         addptr(to, 64);
8282         subl(count, 16 << shift);
8283         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8284 
8285         BIND(L_check_fill_32_bytes);
8286         addl(count, 8 << shift);
8287         jccb(Assembler::less, L_check_fill_8_bytes);
8288         vmovdqu(Address(to, 0), xtmp);
8289         addptr(to, 32);
8290         subl(count, 8 << shift);
8291 
8292         BIND(L_check_fill_8_bytes);
8293       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8294         // Fill 64-byte chunks
8295         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8296         vpbroadcastd(xtmp, xtmp);
8297 
8298         subl(count, 16 << shift);
8299         jcc(Assembler::less, L_check_fill_32_bytes);
8300         align(16);
8301 
8302         BIND(L_fill_64_bytes_loop);
8303         vmovdqu(Address(to, 0), xtmp);
8304         vmovdqu(Address(to, 32), xtmp);
8305         addptr(to, 64);
8306         subl(count, 16 << shift);
8307         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8308 
8309         BIND(L_check_fill_32_bytes);
8310         addl(count, 8 << shift);
8311         jccb(Assembler::less, L_check_fill_8_bytes);
8312         vmovdqu(Address(to, 0), xtmp);
8313         addptr(to, 32);
8314         subl(count, 8 << shift);
8315 
8316         BIND(L_check_fill_8_bytes);


8417   xorl(result, result);
8418   // check for zero length
8419   testl(len, len);
8420   jcc(Assembler::zero, L_done);
8421 
8422   movl(result, len);
8423 
8424   // Setup pointers
8425   lea(src, Address(src, len, Address::times_2)); // char[]
8426   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8427   negptr(len);
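  // src and dst now point one element past the end of each array and len is
  // negative, so Address(src, len, Address::times_2) walks forward through the
  // data as len counts up toward zero.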
8428 
8429   if (UseSSE42Intrinsics || UseAVX >= 2) {
8430     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8431     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8432 
8433     if (UseAVX >= 2) {
8434       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8435       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8436       movdl(tmp1Reg, tmp5);
8437       vpbroadcastd(tmp1Reg, tmp1Reg);
8438       jmp(L_chars_32_check);
8439 
8440       bind(L_copy_32_chars);
8441       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8442       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8443       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8444       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8445       jccb(Assembler::notZero, L_copy_32_chars_exit);
8446       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8447       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8448       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8449 
8450       bind(L_chars_32_check);
8451       addptr(len, 32);
8452       jcc(Assembler::lessEqual, L_copy_32_chars);
8453 
8454       bind(L_copy_32_chars_exit);
8455       subptr(len, 16);
8456       jccb(Assembler::greater, L_copy_16_chars_exit);
8457 




3106 }
3107 
3108 void MacroAssembler::load_double(Address src) {
3109   if (UseSSE >= 2) {
3110     movdbl(xmm0, src);
3111   } else {
3112     LP64_ONLY(ShouldNotReachHere());
3113     NOT_LP64(fld_d(src));
3114   }
3115 }
3116 
3117 void MacroAssembler::store_double(Address dst) {
3118   if (UseSSE >= 2) {
3119     movdbl(dst, xmm0);
3120   } else {
3121     LP64_ONLY(ShouldNotReachHere());
3122     NOT_LP64(fstp_d(dst));
3123   }
3124 }
3125 










3126 void MacroAssembler::fremr(Register tmp) {
3127   save_rax(tmp);
3128   { Label L;
3129     bind(L);
3130     fprem();
3131     fwait(); fnstsw_ax();
3132 #ifdef _LP64
3133     testl(rax, 0x400);
3134     jcc(Assembler::notEqual, L);
3135 #else
3136     sahf();
3137     jcc(Assembler::parity, L);
3138 #endif // _LP64
3139   }
3140   restore_rax(tmp);
3141   // Result is in ST0.
3142   // Note: fxch & fpop to get rid of ST1
3143   // (otherwise FPU stack could overflow eventually)
3144   fxch(1);
3145   fpop();
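
The loop above re-issues fprem until the FPU status word's C2 ("reduction incomplete") bit clears: fprem only reduces part of the exponent gap per iteration, so a single pass is not guaranteed to produce the final remainder. What the loop converges to is the C fmod of ST0 by ST1, as in this minimal standalone sketch (plain C++, not HotSpot code):

  #include <cmath>

  // Rough scalar equivalent of what the fprem loop converges to: the remainder
  // of dividend / divisor with the quotient truncated toward zero. The hardware
  // instruction may need several iterations, which is why the assembly keeps
  // looping while the C2 "incomplete" flag is still set.
  static double fprem_result(double dividend, double divisor) {
    return std::fmod(dividend, divisor);
  }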


3486 }
3487 
3488 void MacroAssembler::movptr(Register dst, Register src) {
3489   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3490 }
3491 
3492 void MacroAssembler::movptr(Register dst, Address src) {
3493   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3494 }
3495 
3496 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3497 void MacroAssembler::movptr(Register dst, intptr_t src) {
3498   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3499 }
3500 
3501 void MacroAssembler::movptr(Address dst, Register src) {
3502   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3503 }
3504 
3505 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3506     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");


3507     Assembler::movdqu(dst, src);

3508 }
3509 
3510 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3511     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");


3512     Assembler::movdqu(dst, src);

3513 }
3514 
3515 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3516     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");


3517     Assembler::movdqu(dst, src);

3518 }
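
The rewritten movdqu wrappers above assert that an XMM register with encoding 16 or higher is only passed when AVX-512VL is available, since the legacy encodings of movdqu can only name xmm0-xmm15. A minimal standalone sketch of the predicate the asserts encode (hypothetical helper names, not HotSpot API):

  #include <cassert>

  // Assumption for illustration: this flag would be derived from CPUID elsewhere.
  static bool cpu_supports_avx512vl = false;

  // An XMM register with encoding 16 or higher is only addressable by these
  // moves when the CPU supports AVX-512VL (EVEX-encoded 128/256-bit forms).
  static bool legal_movdqu_operand(int xmm_encoding) {
    return xmm_encoding < 16 || cpu_supports_avx512vl;
  }

  int main() {
    assert(legal_movdqu_operand(0));    // xmm0: always fine
    assert(legal_movdqu_operand(15));   // xmm15: always fine
    // legal_movdqu_operand(16) only holds once cpu_supports_avx512vl is true.
    return 0;
  }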
3519 
3520 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3521   if (reachable(src)) {
3522     movdqu(dst, as_Address(src));
3523   } else {
3524     lea(scratchReg, src);
3525     movdqu(dst, Address(scratchReg, 0));
3526   }
3527 }
3528 
3529 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3530     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");


3531     Assembler::vmovdqu(dst, src);

3532 }
3533 
3534 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3535     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");


3536     Assembler::vmovdqu(dst, src);

3537 }
3538 
3539 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3540     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");



3541     Assembler::vmovdqu(dst, src);

3542 }
3543 
3544 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3545   if (reachable(src)) {
3546     vmovdqu(dst, as_Address(src));
3547   }
3548   else {
3549     lea(rscratch1, src);
3550     vmovdqu(dst, Address(rscratch1, 0));
3551   }
3552 }
3553 
3554 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3555   if (reachable(src)) {
3556     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3557   } else {
3558     lea(rscratch, src);
3559     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3560   }
3561 }
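
The AddressLiteral overloads above all follow the same shape: if the literal is reachable (its displacement from the code being emitted fits the instruction's 32-bit field), the address is used directly; otherwise the full 64-bit address is materialized into a scratch register with lea and the access goes through that register. A rough standalone sketch of that reachability idea (an assumption about reachable()'s intent, not its actual implementation):

  #include <cstdint>
  #include <climits>

  // Sketch only: x86-64 RIP-relative addressing encodes a signed 32-bit
  // displacement, so a literal is "reachable" when (target - code_position)
  // fits in int32_t; otherwise the address has to be loaded into a register.
  static bool fits_rip_relative(intptr_t target, intptr_t code_position) {
    intptr_t disp = target - code_position;
    return disp >= INT32_MIN && disp <= INT32_MAX;
  }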


3805     shll(reg, 24);
3806     sarl(reg, 24);
3807   }
3808 }
3809 
3810 void MacroAssembler::sign_extend_short(Register reg) {
3811   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3812     movswl(reg, reg); // movsxw
3813   } else {
3814     shll(reg, 16);
3815     sarl(reg, 16);
3816   }
3817 }
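
On 64-bit (and on P6 or newer 32-bit CPUs) the single movswl/movsbl instruction sign-extends directly; the shll/sarl pair is the fallback for older CPUs and produces the same value. A small standalone sketch of the equivalence for the 16-bit case:

  #include <cassert>
  #include <cstdint>

  // Sign-extend the low 16 bits of x into a 32-bit value, two ways.
  static int32_t sext16_movsx(int32_t x)  { return (int16_t)x; }   // movswl
  static int32_t sext16_shifts(int32_t x) {
    // shll 16 then sarl 16; the unsigned cast keeps the left shift well defined
    // and the right shift is arithmetic on the two's-complement targets HotSpot supports.
    return (int32_t)((uint32_t)x << 16) >> 16;
  }

  int main() {
    for (int32_t v : {0, 1, 0x7fff, 0x8000, 0xffff, -1}) {
      assert(sext16_movsx(v) == sext16_shifts(v));
    }
    return 0;
  }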
3818 
3819 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3820   assert(reachable(src), "Address should be reachable");
3821   testl(dst, as_Address(src));
3822 }
3823 
3824 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3825   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




3826   Assembler::pcmpeqb(dst, src);





















3827 }
3828 
3829 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3830   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




3831   Assembler::pcmpeqw(dst, src);





















3832 }
3833 
3834 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3835   assert((dst->encoding() < 16),"XMM register should be 0-15");

3836   Assembler::pcmpestri(dst, src, imm8);







3837 }
3838 
3839 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3840   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");


3841   Assembler::pcmpestri(dst, src, imm8);





















3842 }
3843 
3844 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3845   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




3846   Assembler::pmovzxbw(dst, src);





















3847 }
3848 
3849 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3850   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");

3851   Assembler::pmovzxbw(dst, src);









3852 }
3853 
3854 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3855   assert((src->encoding() < 16),"XMM register should be 0-15");

3856   Assembler::pmovmskb(dst, src);






3857 }
3858 
3859 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3860   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");


3861   Assembler::ptest(dst, src);



















3862 }
3863 
3864 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3865   if (reachable(src)) {
3866     Assembler::sqrtsd(dst, as_Address(src));
3867   } else {
3868     lea(rscratch1, src);
3869     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3870   }
3871 }
3872 
3873 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3874   if (reachable(src)) {
3875     Assembler::sqrtss(dst, as_Address(src));
3876   } else {
3877     lea(rscratch1, src);
3878     Assembler::sqrtss(dst, Address(rscratch1, 0));
3879   }
3880 }
3881 


3970 
3971 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3972   if (reachable(src)) {
3973     vaddsd(dst, nds, as_Address(src));
3974   } else {
3975     lea(rscratch1, src);
3976     vaddsd(dst, nds, Address(rscratch1, 0));
3977   }
3978 }
3979 
3980 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3981   if (reachable(src)) {
3982     vaddss(dst, nds, as_Address(src));
3983   } else {
3984     lea(rscratch1, src);
3985     vaddss(dst, nds, Address(rscratch1, 0));
3986   }
3987 }
3988 
3989 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3990   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");



3991   vandps(dst, nds, negate_field, vector_len);






























3992 }
3993 
3994 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3995   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");



3996   vandpd(dst, nds, negate_field, vector_len);





























3997 }
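
vabsss and vabssd compute an absolute value by AND-ing the operand with the bit pattern at negate_field, assumed here to be a mask with every bit set except the sign bit. A standalone sketch of the same trick for a single float:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // Absolute value by clearing the sign bit, mirroring vandps with a
  // 0x7fffffff mask (an assumption about what the negate_field literal holds).
  static float abs_by_mask(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    bits &= 0x7fffffffu;                 // keep exponent and mantissa, drop the sign
    std::memcpy(&x, &bits, sizeof x);
    return x;
  }

  int main() {
    assert(abs_by_mask(-3.5f) == 3.5f);
    assert(abs_by_mask( 2.0f) == 2.0f);
    return 0;
  }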
3998 
3999 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4000   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4001   Assembler::vpaddb(dst, nds, src, vector_len);




























4002 }
4003 
4004 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4005   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4006   Assembler::vpaddb(dst, nds, src, vector_len);















4007 }
4008 
4009 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4010   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4011   Assembler::vpaddw(dst, nds, src, vector_len);




























4012 }
4013 
4014 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4015   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4016   Assembler::vpaddw(dst, nds, src, vector_len);















4017 }
4018 
4019 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4020   if (reachable(src)) {
4021     Assembler::vpand(dst, nds, as_Address(src), vector_len);
4022   } else {
4023     lea(rscratch1, src);
4024     Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4025   }
4026 }
4027 
4028 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
4029   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4030   Assembler::vpbroadcastw(dst, src, vector_len);

























4031 }
4032 
4033 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4034   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




4035   Assembler::vpcmpeqb(dst, nds, src, vector_len);





















4036 }
4037 
4038 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4039   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




4040   Assembler::vpcmpeqw(dst, nds, src, vector_len);





















4041 }
4042 
4043 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4044   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4045   Assembler::vpmovzxbw(dst, src, vector_len);







4046 }
4047 
4048 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4049   assert((src->encoding() < 16),"XMM register should be 0-15");

4050   Assembler::vpmovmskb(dst, src);






4051 }
4052 
4053 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4054   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4055   Assembler::vpmullw(dst, nds, src, vector_len);




























4056 }
4057 
4058 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4059   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4060   Assembler::vpmullw(dst, nds, src, vector_len);















4061 }
4062 
4063 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4064   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4065   Assembler::vpsubb(dst, nds, src, vector_len);




























4066 }
4067 
4068 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4069   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4070   Assembler::vpsubb(dst, nds, src, vector_len);















4071 }
4072 
4073 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4074   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4075   Assembler::vpsubw(dst, nds, src, vector_len);




























4076 }
4077 
4078 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4079   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4080   Assembler::vpsubw(dst, nds, src, vector_len);















4081 }
4082 
4083 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4084   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4085   Assembler::vpsraw(dst, nds, shift, vector_len);




































4086 }
4087 
4088 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4089   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4090   Assembler::vpsraw(dst, nds, shift, vector_len);















4091 }
4092 
4093 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4094   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4095   Assembler::vpsrlw(dst, nds, shift, vector_len);




































4096 }
4097 
4098 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4099   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4100   Assembler::vpsrlw(dst, nds, shift, vector_len);















4101 }
4102 
4103 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4104   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");



4105   Assembler::vpsllw(dst, nds, shift, vector_len);




































4106 }
4107 
4108 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4109   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");


4110   Assembler::vpsllw(dst, nds, shift, vector_len);















4111 }
4112 
4113 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4114   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");


4115   Assembler::vptest(dst, src);



















4116 }
4117 


4118 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4119   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");






































4120   Assembler::punpcklbw(dst, src);

4121 }
4122 
4123 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4124   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");




4125   Assembler::pshufd(dst, src, mode);







4126 }
4127 


4128 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4129   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");




4130   Assembler::pshuflw(dst, src, mode);



































4131 }
4132 
4133 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4134   if (reachable(src)) {
4135     vandpd(dst, nds, as_Address(src), vector_len);
4136   } else {
4137     lea(rscratch1, src);
4138     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4139   }
4140 }
4141 
4142 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4143   if (reachable(src)) {
4144     vandps(dst, nds, as_Address(src), vector_len);
4145   } else {
4146     lea(rscratch1, src);
4147     vandps(dst, nds, Address(rscratch1, 0), vector_len);
4148   }
4149 }
4150 


4186 
4187 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4188   if (reachable(src)) {
4189     vsubsd(dst, nds, as_Address(src));
4190   } else {
4191     lea(rscratch1, src);
4192     vsubsd(dst, nds, Address(rscratch1, 0));
4193   }
4194 }
4195 
4196 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4197   if (reachable(src)) {
4198     vsubss(dst, nds, as_Address(src));
4199   } else {
4200     lea(rscratch1, src);
4201     vsubss(dst, nds, Address(rscratch1, 0));
4202   }
4203 }
4204 
4205 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4206   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
















4207   vxorps(dst, nds, src, Assembler::AVX_128bit);

4208 }
4209 
4210 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4211   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
















4212   vxorpd(dst, nds, src, Assembler::AVX_128bit);

4213 }
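
vnegatess and vnegatesd are the XOR counterpart of the sign-mask trick: flipping only the sign bit negates the value, with the AddressLiteral assumed to point at a constant whose sole set bit is the sign bit. A compact standalone sketch for a single float:

  #include <cstdint>
  #include <cstring>

  // Negate a float by flipping its sign bit, mirroring vxorps with a
  // 0x80000000 mask (an assumption about the constant behind 'src').
  static float negate_by_mask(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    bits ^= 0x80000000u;                 // toggle the sign, leave the rest untouched
    std::memcpy(&x, &bits, sizeof x);
    return x;
  }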
4214 
4215 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4216   if (reachable(src)) {
4217     vxorpd(dst, nds, as_Address(src), vector_len);
4218   } else {
4219     lea(rscratch1, src);
4220     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4221   }
4222 }
4223 
4224 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4225   if (reachable(src)) {
4226     vxorps(dst, nds, as_Address(src), vector_len);
4227   } else {
4228     lea(rscratch1, src);
4229     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4230   }
4231 }
4232 


6342 
6343 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6344                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6345   ShortBranchVerifier sbv(this);
6346   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6347 
6348   int stride = 8;
6349 
6350   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6351         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6352         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6353         FOUND_SEQ_CHAR, DONE_LABEL;
6354 
6355   movptr(result, str1);
6356   if (UseAVX >= 2) {
6357     cmpl(cnt1, stride);
6358     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6359     cmpl(cnt1, 2*stride);
6360     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6361     movdl(vec1, ch);
6362     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6363     vpxor(vec2, vec2);
6364     movl(tmp, cnt1);
6365     andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
6366     andl(cnt1, 0x0000000F);  // tail count (in chars)
6367 
6368     bind(SCAN_TO_16_CHAR_LOOP);
6369     vmovdqu(vec3, Address(result, 0));
6370     vpcmpeqw(vec3, vec3, vec1, 1);
6371     vptest(vec2, vec3);
6372     jcc(Assembler::carryClear, FOUND_CHAR);
6373     addptr(result, 32);
6374     subl(tmp, 2*stride);
6375     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6376     jmp(SCAN_TO_8_CHAR);
6377     bind(SCAN_TO_8_CHAR_INIT);
6378     movdl(vec1, ch);
6379     pshuflw(vec1, vec1, 0x00);
6380     pshufd(vec1, vec1, 0);
6381     pxor(vec2, vec2);
6382   }
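
In the AVX2 path above, the search char is broadcast across a 256-bit register and compared against 16 chars per iteration with vpcmpeqw; vptest against the all-zero vec2 sets the carry flag only when no lane matched, so carryClear branches to FOUND_CHAR. A minimal scalar sketch of the search being vectorized (standalone C++, not the intrinsic itself):

  #include <cstdint>
  #include <cstddef>

  // Find the first index of UTF-16 code unit 'ch' in str[0..cnt), or -1.
  // The vector loop answers the same question 16 code units at a time and
  // then falls back to a loop of this shape for the tail.
  static long index_of_char(const uint16_t* str, size_t cnt, uint16_t ch) {
    for (size_t i = 0; i < cnt; i++) {
      if (str[i] == ch) return (long)i;
    }
    return -1;
  }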


6937     jmp(FALSE_LABEL);
6938 
6939     clear_vector_masking();   // close the stub context used for programming mask registers
6940   } else {
6941     movl(result, len); // copy
6942 
6943     if (UseAVX == 2 && UseSSE >= 2) {
6944       // With AVX2, use 32-byte vector compare
6945       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6946 
6947       // Compare 32-byte vectors
6948       andl(result, 0x0000001f);  // tail count (in bytes)
6949       andl(len, 0xffffffe0);   // vector count (in bytes)
6950       jccb(Assembler::zero, COMPARE_TAIL);
6951 
6952       lea(ary1, Address(ary1, len, Address::times_1));
6953       negptr(len);
6954 
6955       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
6956       movdl(vec2, tmp1);
6957       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
6958 
6959       bind(COMPARE_WIDE_VECTORS);
6960       vmovdqu(vec1, Address(ary1, len, Address::times_1));
6961       vptest(vec1, vec2);
6962       jccb(Assembler::notZero, TRUE_LABEL);
6963       addptr(len, 32);
6964       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6965 
6966       testl(result, result);
6967       jccb(Assembler::zero, FALSE_LABEL);
6968 
6969       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6970       vptest(vec1, vec2);
6971       jccb(Assembler::notZero, TRUE_LABEL);
6972       jmpb(FALSE_LABEL);
6973 
6974       bind(COMPARE_TAIL); // len is zero
6975       movl(len, result);
6976       // Fallthru to tail compare
6977     } else if (UseSSE42Intrinsics) {
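
The 0x80808080 broadcast above, combined with vptest, asks whether any byte in the 32-byte chunk has its high bit set (i.e. would be negative as a jbyte); the loop answers that question 32 bytes at a time and the remainder is handled by the tail compare. A scalar sketch of the same predicate (standalone C++, not the intrinsic):

  #include <cstdint>
  #include <cstddef>

  // True if any byte in ary[0..len) has its high bit set, i.e. is negative
  // when interpreted as a signed 8-bit value. This is what the vptest against
  // the broadcast 0x80808080 mask detects one 32-byte vector at a time.
  static bool has_negatives(const uint8_t* ary, size_t len) {
    for (size_t i = 0; i < len; i++) {
      if (ary[i] & 0x80u) return true;
    }
    return false;
  }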


7367     if (!UseUnalignedLoadStores) {
7368       // align to 8 bytes; we know we are 4-byte aligned to start
7369       testptr(to, 4);
7370       jccb(Assembler::zero, L_fill_32_bytes);
7371       movl(Address(to, 0), value);
7372       addptr(to, 4);
7373       subl(count, 1<<shift);
7374     }
7375     BIND(L_fill_32_bytes);
7376     {
7377       assert( UseSSE >= 2, "supported cpu only" );
7378       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7379       if (UseAVX > 2) {
7380         movl(rtmp, 0xffff);
7381         kmovwl(k1, rtmp);
7382       }
7383       movdl(xtmp, value);
7384       if (UseAVX > 2 && UseUnalignedLoadStores) {
7385         // Fill 64-byte chunks
7386         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7387         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7388 
7389         subl(count, 16 << shift);
7390         jcc(Assembler::less, L_check_fill_32_bytes);
7391         align(16);
7392 
7393         BIND(L_fill_64_bytes_loop);
7394         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7395         addptr(to, 64);
7396         subl(count, 16 << shift);
7397         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7398 
7399         BIND(L_check_fill_32_bytes);
7400         addl(count, 8 << shift);
7401         jccb(Assembler::less, L_check_fill_8_bytes);
7402         vmovdqu(Address(to, 0), xtmp);
7403         addptr(to, 32);
7404         subl(count, 8 << shift);
7405 
7406         BIND(L_check_fill_8_bytes);
7407       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7408         // Fill 64-byte chunks
7409         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7410         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7411 
7412         subl(count, 16 << shift);
7413         jcc(Assembler::less, L_check_fill_32_bytes);
7414         align(16);
7415 
7416         BIND(L_fill_64_bytes_loop);
7417         vmovdqu(Address(to, 0), xtmp);
7418         vmovdqu(Address(to, 32), xtmp);
7419         addptr(to, 64);
7420         subl(count, 16 << shift);
7421         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7422 
7423         BIND(L_check_fill_32_bytes);
7424         addl(count, 8 << shift);
7425         jccb(Assembler::less, L_check_fill_8_bytes);
7426         vmovdqu(Address(to, 0), xtmp);
7427         addptr(to, 32);
7428         subl(count, 8 << shift);
7429 
7430         BIND(L_check_fill_8_bytes);
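
In the fill code above, value already holds the element pattern replicated across 32 bits, count is in elements, and shift is chosen so that one 4-byte store covers 1 << shift elements (2 for byte fills, 1 for short, 0 for int); the AVX-512 and AVX2 paths simply store 64 or 32 bytes per iteration and subtract 16 << shift or 8 << shift from count. A scalar sketch of the underlying fill (standalone C++, not the stub):

  #include <cstdint>
  #include <cstring>

  // 'value' is the 32-bit replicated pattern, 'count' is in elements, and
  // 'shift' says how many elements one 4-byte store covers (1 << shift).
  // The vectorized paths do the same thing 64 or 32 bytes at a time.
  static void fill_by_words(uint8_t* to, int64_t count, int shift, uint32_t value) {
    while (count >= ((int64_t)1 << shift)) {
      std::memcpy(to, &value, 4);        // one 32-bit store
      to += 4;
      count -= (int64_t)1 << shift;
    }
    // Any remaining sub-word tail is handled separately in the real stub.
  }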


7531   xorl(result, result);
7532   // check for zero length
7533   testl(len, len);
7534   jcc(Assembler::zero, L_done);
7535 
7536   movl(result, len);
7537 
7538   // Set up pointers
7539   lea(src, Address(src, len, Address::times_2)); // char[]
7540   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7541   negptr(len);
7542 
7543   if (UseSSE42Intrinsics || UseAVX >= 2) {
7544     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7545     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7546 
7547     if (UseAVX >= 2) {
7548       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7549       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7550       movdl(tmp1Reg, tmp5);
7551       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7552       jmp(L_chars_32_check);
7553 
7554       bind(L_copy_32_chars);
7555       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7556       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7557       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7558       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
7559       jccb(Assembler::notZero, L_copy_32_chars_exit);
7560       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7561       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7562       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7563 
7564       bind(L_chars_32_check);
7565       addptr(len, 32);
7566       jcc(Assembler::lessEqual, L_copy_32_chars);
7567 
7568       bind(L_copy_32_chars_exit);
7569       subptr(len, 16);
7570       jccb(Assembler::greater, L_copy_16_chars_exit);
7571 
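
The AVX2 loop above compresses 32 UTF-16 chars per iteration: the broadcast 0xff00ff00 mask with vptest bails out as soon as any char has a nonzero high byte (so it cannot be stored in one byte); otherwise vpackuswb narrows the two 32-byte vectors to bytes and vpermq repairs the 64-bit lane order before the 32-byte store. A scalar sketch of the same compression with its early exit (standalone C++, not the intrinsic):

  #include <cstdint>
  #include <cstddef>

  // Copy UTF-16 code units to bytes while every unit fits in 8 bits; return
  // how many were compressed so the caller can tell whether it stopped early.
  // The 0xff00 test is the scalar form of the broadcast 0xff00ff00 mask check.
  static size_t compress_chars(const uint16_t* src, uint8_t* dst, size_t len) {
    size_t i = 0;
    for (; i < len; i++) {
      if (src[i] & 0xff00u) break;       // char does not fit in a byte: stop here
      dst[i] = (uint8_t)src[i];
    }
    return i;
  }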

