src/cpu/x86/vm/macroAssembler_x86.cpp

3634   }
3635 }
3636 
3637 void MacroAssembler::movptr(Register dst, Register src) {
3638   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3639 }
3640 
3641 void MacroAssembler::movptr(Register dst, Address src) {
3642   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3643 }
3644 
3645 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3646 void MacroAssembler::movptr(Register dst, intptr_t src) {
3647   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3648 }
3649 
3650 void MacroAssembler::movptr(Address dst, Register src) {
3651   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3652 }
3653 
3654 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3655   if (reachable(src)) {
3656     Assembler::movdqu(dst, as_Address(src));
3657   } else {
3658     lea(rscratch1, src);
3659     Assembler::movdqu(dst, Address(rscratch1, 0));
3660   }
3661 }
3662 
3663 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3664   if (reachable(src)) {
3665     Assembler::movdqa(dst, as_Address(src));
3666   } else {
3667     lea(rscratch1, src);
3668     Assembler::movdqa(dst, Address(rscratch1, 0));
3669   }
3670 }
3671 
3672 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3673   if (reachable(src)) {
3674     Assembler::movsd(dst, as_Address(src));
3675   } else {
3676     lea(rscratch1, src);
3677     Assembler::movsd(dst, Address(rscratch1, 0));
3678   }
3679 }
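
The AddressLiteral wrappers above and below all share one fallback pattern: when the literal target is close enough to be encoded directly, the instruction is emitted with as_Address(src); otherwise the address is first materialized into rscratch1 with lea and the instruction uses an indirect Address(rscratch1, 0) operand. A minimal standalone sketch of the reachability idea, assuming only the usual x86_64 constraint that a displacement must fit in a signed 32-bit field (illustration, not the HotSpot implementation of reachable()):

    #include <cstdint>

    // True if 'target' can be reached from 'code_pos' with a signed 32-bit
    // displacement, i.e. it can be encoded directly instead of going through
    // a scratch register.
    static bool fits_in_disp32(uintptr_t target, uintptr_t code_pos) {
      intptr_t disp = (intptr_t)target - (intptr_t)code_pos;
      return disp == (intptr_t)(int32_t)disp;
    }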


3709   if (needs_explicit_null_check(offset)) {
3710     // provoke OS NULL exception if reg = NULL by
3711     // accessing M[reg] w/o changing any (non-CC) registers
3712     // NOTE: cmpl is plenty here to provoke a segv
3713     cmpptr(rax, Address(reg, 0));
3714     // Note: should probably use testl(rax, Address(reg, 0));
3715     //       may be shorter code (however, this version of
3716     //       testl needs to be implemented first)
3717   } else {
3718     // nothing to do, (later) access of M[reg + offset]
3719     // will provoke OS NULL exception if reg = NULL
3720   }
3721 }
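
The helper above relies on implicit null checks: a compare against M[reg] faults when reg is NULL, and the VM's signal handler maps that fault to the Java NullPointerException, so no explicit test-and-branch is emitted as long as the eventual access stays within the unmapped page at address zero. A simplified sketch of the caller's decision, with an assumed page size (the real needs_explicit_null_check also accounts for compressed-oop heap bases on 64-bit):

    #include <cstddef>

    // An access at [reg + offset] can rely on the hardware trap only while the
    // faulting address would still land in the protected page at 0.
    static bool needs_explicit_null_check_sketch(ptrdiff_t offset,
                                                 size_t os_page_size = 4096) {
      return offset < 0 || (size_t)offset >= os_page_size;
    }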
3722 
3723 void MacroAssembler::os_breakpoint() {
3724   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3725   // (e.g., MSVC can't call ps() otherwise)
3726   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3727 }
3728 
3729 void MacroAssembler::pop_CPU_state() {
3730   pop_FPU_state();
3731   pop_IU_state();
3732 }
3733 
3734 void MacroAssembler::pop_FPU_state() {
3735 #ifndef _LP64
3736   frstor(Address(rsp, 0));
3737 #else
3738   // AVX will continue to use the fxsave area.
3739   // EVEX needs to utilize the xsave area, which is under different
3740   // management.
3741   if(VM_Version::supports_evex()) {
3742     // EDX:EAX describe the XSAVE header and
3743     // are obtained while fetching info for XCR0 via cpuid.
3744     // These two registers make up 64-bits in the header for which bits
3745     // 62:10 are currently reserved for future implementations and unused.  Bit 63
3746     // is unused for our implementation as we do not utilize
3747     // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
3748     // the functionality for PKRU state and MSR tracing.
3749     // Ergo we are primarily concerned with bits 7..0, which define
3750     // which ISA extensions and features are enabled for a given machine.
3751     // They are captured in XemXcr0Eax and used to map the XSAVE area
3752     // when restoring registers as described by XCR0.
3753     movl(rdx,VM_Version::get_xsave_header_upper_segment());
3754     movl(rax,VM_Version::get_xsave_header_lower_segment());
3755     xrstor(Address(rsp, 0));
3756   } else {
3757     fxrstor(Address(rsp, 0));
3758   }
3759 #endif
3760   addptr(rsp, FPUStateSizeInWords * wordSize);
3761 }
3762 
3763 void MacroAssembler::pop_IU_state() {
3764   popa();
3765   LP64_ONLY(addq(rsp, 8));
3766   popf();
3767 }
3768 
3769 // Save Integer and Float state
3770 // Warning: Stack must be 16 byte aligned (64bit)
3771 void MacroAssembler::push_CPU_state() {
3772   push_IU_state();
3773   push_FPU_state();
3774 }
3775 
3776 #ifdef _LP64
3777 #define XSTATE_BV 0x200
3778 #endif
3779 
3780 void MacroAssembler::push_FPU_state() {
3781   subptr(rsp, FPUStateSizeInWords * wordSize);
3782 #ifndef _LP64
3783   fnsave(Address(rsp, 0));
3784   fwait();
3785 #else
3786   // AVX will continue to use the fxsave area.
3787   // EVEX needs to utilize the xsave area, which is under different
3788   // management.
3789   if(VM_Version::supports_evex()) {
3790     // Save a copy of EAX and EDX
3791     push(rax);
3792     push(rdx);
3793     // EDX:EAX describe the XSAVE header and
3794     // are obtained while fetching info for XCR0 via cpuid.
3795     // These two registers make up 64-bits in the header for which bits
3796     // 62:10 are currently reserved for future implementations and unused.  Bit 63
3797     // is unused for our implementation as we do not utilize
3798     // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
3799     // the functionality for PKRU state and MSR tracing.
3800     // Ergo we are primarily concerned with bits 7..0, which define
3801     // which ISA extensions and features are enabled for a given machine.
3802     // They are captured in XemXcr0Eax and used to program the XSAVE area
3803     // for saving the required registers as defined in XCR0.
3804     int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
3805     int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
3806     movl(rdx,xcr0_edx);
3807     movl(rax,xcr0_eax);
3808     xsave(Address(rsp, wordSize*2));
3809     // now apply the control bits and clear bytes 8..23 of the header
3810     pop(rdx);
3811     pop(rax);
3812     movl(Address(rsp, XSTATE_BV), xcr0_eax);
3813     movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
3814     andq(Address(rsp, XSTATE_BV+8), 0);
3815     andq(Address(rsp, XSTATE_BV+16), 0);
3816   } else {
3817     fxsave(Address(rsp, 0));
3818   }
3819 #endif // LP64
3820 }
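
For reference, the XSTATE_BV offset of 0x200 used above is the start of the 64-byte XSAVE header that follows the 512-byte legacy (FXSAVE-compatible) region: the stores at XSTATE_BV and XSTATE_BV+4 write the state-component bitmap taken from XCR0, and the two andq instructions clear XCOMP_BV plus the first reserved quadword (bytes 8..23 of the header). An illustrative layout per the Intel SDM, not a declaration from this file:

    #include <cstdint>

    struct xsave_area_sketch {
      uint8_t  legacy_region[512];  // FXSAVE-compatible region
      uint64_t xstate_bv;           // offset 0x200: component bitmap written from XCR0
      uint64_t xcomp_bv;            // offset 0x208: zero selects the standard (non-compacted) format
      uint8_t  reserved[48];        // offset 0x210..0x23f: must-be-zero header bytes
      // extended state components (e.g. upper YMM/ZMM halves) follow
    };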
3821 
3822 void MacroAssembler::push_IU_state() {
3823   // Push flags first because pusha kills them
3824   pushf();
3825   // Make sure rsp stays 16-byte aligned
3826   LP64_ONLY(subq(rsp, 8));
3827   pusha();
3828 }
3829 
3830 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3831   // determine java_thread register
3832   if (!java_thread->is_valid()) {
3833     java_thread = rdi;
3834     get_thread(java_thread);
3835   }
3836   // we must set sp to zero to clear frame
3837   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3838   if (clear_fp) {


3990 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3991   if (reachable(src)) {
3992     Assembler::ucomiss(dst, as_Address(src));
3993   } else {
3994     lea(rscratch1, src);
3995     Assembler::ucomiss(dst, Address(rscratch1, 0));
3996   }
3997 }
3998 
3999 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4000   // Used in sign-bit flipping with aligned address.
4001   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4002   if (reachable(src)) {
4003     Assembler::xorpd(dst, as_Address(src));
4004   } else {
4005     lea(rscratch1, src);
4006     Assembler::xorpd(dst, Address(rscratch1, 0));
4007   }
4008 }
4009 
4010 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4011   // Used in sign-bit flipping with aligned address.
4012   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4013   if (reachable(src)) {
4014     Assembler::xorps(dst, as_Address(src));
4015   } else {
4016     lea(rscratch1, src);
4017     Assembler::xorps(dst, Address(rscratch1, 0));
4018   }
4019 }
4020 
4021 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4022   // Used in sign-bit flipping with aligned address.
4023   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4024   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4025   if (reachable(src)) {
4026     Assembler::pshufb(dst, as_Address(src));
4027   } else {
4028     lea(rscratch1, src);
4029     Assembler::pshufb(dst, Address(rscratch1, 0));


4033 // AVX 3-operands instructions
4034 
4035 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4036   if (reachable(src)) {
4037     vaddsd(dst, nds, as_Address(src));
4038   } else {
4039     lea(rscratch1, src);
4040     vaddsd(dst, nds, Address(rscratch1, 0));
4041   }
4042 }
4043 
4044 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4045   if (reachable(src)) {
4046     vaddss(dst, nds, as_Address(src));
4047   } else {
4048     lea(rscratch1, src);
4049     vaddss(dst, nds, Address(rscratch1, 0));
4050   }
4051 }
4052 
4053 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4054   if (reachable(src)) {
4055     vandpd(dst, nds, as_Address(src), vector_len);
4056   } else {
4057     lea(rscratch1, src);
4058     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4059   }
4060 }
4061 
4062 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4063   if (reachable(src)) {
4064     vandps(dst, nds, as_Address(src), vector_len);
4065   } else {
4066     lea(rscratch1, src);
4067     vandps(dst, nds, Address(rscratch1, 0), vector_len);
4068   }
4069 }
4070 
4071 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4072   if (reachable(src)) {


4116 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4117   if (reachable(src)) {
4118     vsubss(dst, nds, as_Address(src));
4119   } else {
4120     lea(rscratch1, src);
4121     vsubss(dst, nds, Address(rscratch1, 0));
4122   }
4123 }
4124 
4125 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4126   int nds_enc = nds->encoding();
4127   int dst_enc = dst->encoding();
4128   bool dst_upper_bank = (dst_enc > 15);
4129   bool nds_upper_bank = (nds_enc > 15);
4130   if (VM_Version::supports_avx512novl() &&
4131       (nds_upper_bank || dst_upper_bank)) {
4132     if (dst_upper_bank) {
4133       subptr(rsp, 64);
4134       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4135       movflt(xmm0, nds);
4136       if (reachable(src)) {
4137         vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
4138       } else {
4139         lea(rscratch1, src);
4140         vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
4141       }
4142       movflt(dst, xmm0);
4143       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4144       addptr(rsp, 64);
4145     } else {
4146       movflt(dst, nds);
4147       if (reachable(src)) {
4148         vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
4149       } else {
4150         lea(rscratch1, src);
4151         vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
4152       }
4153     }
4154   } else {
4155     if (reachable(src)) {
4156       vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
4157     } else {
4158       lea(rscratch1, src);
4159       vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
4160     }
4161   }
4162 }
4163 
4164 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4165   int nds_enc = nds->encoding();
4166   int dst_enc = dst->encoding();
4167   bool dst_upper_bank = (dst_enc > 15);
4168   bool nds_upper_bank = (nds_enc > 15);
4169   if (VM_Version::supports_avx512novl() &&
4170       (nds_upper_bank || dst_upper_bank)) {
4171     if (dst_upper_bank) {
4172       subptr(rsp, 64);
4173       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4174       movdbl(xmm0, nds);
4175       if (reachable(src)) {
4176         vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
4177       } else {
4178         lea(rscratch1, src);
4179         vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
4180       }
4181       movdbl(dst, xmm0);
4182       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4183       addptr(rsp, 64);
4184     } else {
4185       movdbl(dst, nds);
4186       if (reachable(src)) {
4187         vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
4188       } else {
4189         lea(rscratch1, src);
4190         vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
4191       }
4192     }
4193   } else {
4194     if (reachable(src)) {
4195       vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
4196     } else {
4197       lea(rscratch1, src);
4198       vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
4199     }
4200   }
4201 }
4202 
4203 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4204   if (reachable(src)) {
4205     vxorpd(dst, nds, as_Address(src), vector_len);
4206   } else {
4207     lea(rscratch1, src);
4208     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4209   }
4210 }
4211 
4212 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4213   if (reachable(src)) {
4214     vxorps(dst, nds, as_Address(src), vector_len);
4215   } else {
4216     lea(rscratch1, src);
4217     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4218   }
4219 }


4671 #ifdef _LP64
4672   if (var_size_in_bytes->is_valid()) {
4673     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4674   } else {
4675     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4676   }
4677 #else
4678   if (var_size_in_bytes->is_valid()) {
4679     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4680   } else {
4681     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4682   }
4683   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4684 #endif
4685 }
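
On 32-bit the same 64-bit allocated_bytes counter is updated with an addl to the low word followed by adcl of 0 into the high word, which propagates the carry. A standalone sketch of that arithmetic (hypothetical helper, not part of this file):

    #include <cstdint>

    static void add_to_split_counter(uint32_t& lo, uint32_t& hi, uint32_t delta) {
      uint32_t old_lo = lo;
      lo += delta;                   // addl: low 32 bits
      hi += (lo < old_lo) ? 1 : 0;   // adcl 0: carry into the high 32 bits
    }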
4686 
4687 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4688   pusha();
4689 
4690   // if we are coming from c1, xmm registers may be live
4691   int off = 0;
4692   int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
4693   if (UseAVX > 2) {
4694     num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
4695   }
4696 
4697   if (UseSSE == 1)  {
4698     subptr(rsp, sizeof(jdouble)*8);
4699     for (int n = 0; n < 8; n++) {
4700       movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
4701     }
4702   } else if (UseSSE >= 2)  {
4703     if (UseAVX > 2) {
4704       push(rbx);
4705       movl(rbx, 0xffff);
4706       kmovwl(k1, rbx);
4707       pop(rbx);
4708     }
4709 #ifdef COMPILER2
4710     if (MaxVectorSize > 16) {
4711       if(UseAVX > 2) {
4712         // Save upper half of ZMM registers
4713         subptr(rsp, 32*num_xmm_regs);
4714         for (int n = 0; n < num_xmm_regs; n++) {
4715           vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
4716         }
4717         off = 0;
4718       }
4719       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
4720       // Save upper half of YMM registers
4721       subptr(rsp, 16*num_xmm_regs);
4722       for (int n = 0; n < num_xmm_regs; n++) {
4723         vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
4724       }
4725     }
4726 #endif
4727     // Save whole 128bit (16 bytes) XMM registers
4728     subptr(rsp, 16*num_xmm_regs);
4729     off = 0;
4730 #ifdef _LP64
4731     if (VM_Version::supports_avx512novl()) {
4732       for (int n = 0; n < num_xmm_regs; n++) {
4733         vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
4734       }
4735     } else {
4736       for (int n = 0; n < num_xmm_regs; n++) {
4737         movdqu(Address(rsp, off++*16), as_XMMRegister(n));
4738       }
4739     }
4740 #else
4741     for (int n = 0; n < num_xmm_regs; n++) {
4742       movdqu(Address(rsp, off++*16), as_XMMRegister(n));
4743     }
4744 #endif
4745   }
4746 
4747   // Preserve registers across runtime call
4748   int incoming_argument_and_return_value_offset = -1;
4749   if (num_fpu_regs_in_use > 1) {
4750     // Must preserve all other FPU regs (could alternatively convert
4751     // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
4752     // FPU state, but we cannot trust the C compiler)
4753     NEEDS_CLEANUP;
4754     // NOTE that in this case we also push the incoming argument(s) to
4755     // the stack and restore it later; we also use this stack slot to
4756     // hold the return value from dsin, dcos etc.
4757     for (int i = 0; i < num_fpu_regs_in_use; i++) {
4758       subptr(rsp, sizeof(jdouble));
4759       fstp_d(Address(rsp, 0));
4760     }
4761     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
4762     for (int i = nb_args-1; i >= 0; i--) {


4791 
4792   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
4793 
4794 #ifdef _LP64
4795   movsd(Address(rsp, 0), xmm0);
4796   fld_d(Address(rsp, 0));
4797 #endif // _LP64
4798   addptr(rsp, sizeof(jdouble)*nb_args);
4799   if (num_fpu_regs_in_use > 1) {
4800     // Must save return value to stack and then restore entire FPU
4801     // stack except incoming arguments
4802     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
4803     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
4804       fld_d(Address(rsp, 0));
4805       addptr(rsp, sizeof(jdouble));
4806     }
4807     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
4808     addptr(rsp, sizeof(jdouble)*nb_args);
4809   }
4810 
4811   off = 0;
4812   if (UseSSE == 1)  {
4813     for (int n = 0; n < 8; n++) {
4814       movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
4815     }
4816     addptr(rsp, sizeof(jdouble)*8);
4817   } else if (UseSSE >= 2)  {
4818     // Restore whole 128bit (16 bytes) XMM registers
4819 #ifdef _LP64
4820     if (VM_Version::supports_avx512novl()) {
4821       for (int n = 0; n < num_xmm_regs; n++) {
4822         vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
4823       }
4824     }
4825     else {
4826       for (int n = 0; n < num_xmm_regs; n++) {
4827         movdqu(as_XMMRegister(n), Address(rsp, off++*16));
4828       }
4829     }
4830 #else
4831     for (int n = 0; n < num_xmm_regs; n++) {
4832       movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
4833     }
4834 #endif
4835     addptr(rsp, 16*num_xmm_regs);
4836 
4837 #ifdef COMPILER2
4838     if (MaxVectorSize > 16) {
4839       // Restore upper half of YMM registers.
4840       off = 0;
4841       for (int n = 0; n < num_xmm_regs; n++) {
4842         vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
4843       }
4844       addptr(rsp, 16*num_xmm_regs);
4845       if(UseAVX > 2) {
4846         off = 0;
4847         for (int n = 0; n < num_xmm_regs; n++) {
4848           vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
4849         }
4850         addptr(rsp, 32*num_xmm_regs);
4851       }
4852     }
4853 #endif
4854   }
4855   popa();
4856 }
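
fp_runtime_fallback grows the register spill in three steps (upper 256 bits of the ZMM registers, then upper 128 bits of the YMM registers, then the low 128 bits of every XMM register) and unwinds it in the opposite order after the runtime call. A rough picture of the offsets relative to rsp right after the XMM saves, assuming AVX-512 with MaxVectorSize > 16 and N = num_xmm_regs (sketch only; additional jdouble slots may be pushed below this before the call):

    // Offset helpers matching the save loops above (illustration only).
    static int xmm_low128_offset(int n)          { return 16 * n; }           // saved last
    static int ymm_high128_offset(int n, int N)  { return 16 * N + 16 * n; }
    static int zmm_high256_offset(int n, int N)  { return 32 * N + 32 * n; }  // saved first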
4857 
4858 static const double     pi_4 =  0.7853981633974483;
4859 
4860 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
4861   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
4862   // was attempted in this code; unfortunately it appears that the
4863   // switch to 80-bit precision and back causes this to be
4864   // unprofitable compared with simply performing a runtime call if
4865   // the argument is out of the (-pi/4, pi/4) range.
4866 
4867   Register tmp = noreg;
4868   if (!VM_Version::supports_cmov()) {


6814   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6815         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6816         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6817         FOUND_SEQ_CHAR, DONE_LABEL;
6818 
6819   movptr(result, str1);
6820   if (UseAVX >= 2) {
6821     cmpl(cnt1, stride);
6822     jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
6823     cmpl(cnt1, 2*stride);
6824     jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
6825     movdl(vec1, ch);
6826     vpbroadcastw(vec1, vec1);
6827     vpxor(vec2, vec2);
6828     movl(tmp, cnt1);
6829     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6830     andl(cnt1,0x0000000F);  //tail count (in chars)
6831 
6832     bind(SCAN_TO_16_CHAR_LOOP);
6833     vmovdqu(vec3, Address(result, 0));
6834     vpcmpeqw(vec3, vec3, vec1, true);
6835     vptest(vec2, vec3);
6836     jcc(Assembler::carryClear, FOUND_CHAR);
6837     addptr(result, 32);
6838     subl(tmp, 2*stride);
6839     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6840     jmp(SCAN_TO_8_CHAR);
6841     bind(SCAN_TO_8_CHAR_INIT);
6842     movdl(vec1, ch);
6843     pshuflw(vec1, vec1, 0x00);
6844     pshufd(vec1, vec1, 0);
6845     pxor(vec2, vec2);
6846   }
6847   if (UseAVX >= 2 || UseSSE42Intrinsics) {
6848     bind(SCAN_TO_8_CHAR);
6849     cmpl(cnt1, stride);
6850     if (UseAVX >= 2) {
6851       jccb(Assembler::less, SCAN_TO_CHAR);
6852     }
6853     if (!(UseAVX >= 2)) {
6854       jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
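
In this excerpt the AVX2 path broadcasts the search char into every 16-bit lane of vec1, compares 16 chars per iteration with vpcmpeqw, and detects a hit by running vptest against the all-zero vec2 (carry clear means the compare mask was non-zero). Its scalar equivalent, shown only to make the loop's contract explicit, is a plain linear scan:

    #include <cstdint>

    // Returns the index of the first occurrence of ch in s[0..len), or -1.
    static int index_of_char_scalar(const uint16_t* s, int len, uint16_t ch) {
      for (int i = 0; i < len; i++) {
        if (s[i] == ch) return i;
      }
      return -1;
    }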


7654       }
7655       movdl(xtmp, value);
7656       if (UseAVX > 2 && UseUnalignedLoadStores) {
7657         // Fill 64-byte chunks
7658         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7659         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7660 
7661         subl(count, 16 << shift);
7662         jcc(Assembler::less, L_check_fill_32_bytes);
7663         align(16);
7664 
7665         BIND(L_fill_64_bytes_loop);
7666         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7667         addptr(to, 64);
7668         subl(count, 16 << shift);
7669         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7670 
7671         BIND(L_check_fill_32_bytes);
7672         addl(count, 8 << shift);
7673         jccb(Assembler::less, L_check_fill_8_bytes);
7674         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
7675         addptr(to, 32);
7676         subl(count, 8 << shift);
7677 
7678         BIND(L_check_fill_8_bytes);
7679       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7680         // Fill 64-byte chunks
7681         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7682         vpbroadcastd(xtmp, xtmp);
7683 
7684         subl(count, 16 << shift);
7685         jcc(Assembler::less, L_check_fill_32_bytes);
7686         align(16);
7687 
7688         BIND(L_fill_64_bytes_loop);
7689         vmovdqu(Address(to, 0), xtmp);
7690         vmovdqu(Address(to, 32), xtmp);
7691         addptr(to, 64);
7692         subl(count, 16 << shift);
7693         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7694 




3634   }
3635 }
3636 
3637 void MacroAssembler::movptr(Register dst, Register src) {
3638   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3639 }
3640 
3641 void MacroAssembler::movptr(Register dst, Address src) {
3642   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3643 }
3644 
3645 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3646 void MacroAssembler::movptr(Register dst, intptr_t src) {
3647   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3648 }
3649 
3650 void MacroAssembler::movptr(Address dst, Register src) {
3651   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3652 }
3653 
3654 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3655   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3656     Assembler::vextractf32x4h(dst, src, 0);
3657   } else {
3658     Assembler::movdqu(dst, src);
3659   }
3660 }
3661 
3662 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3663   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3664     Assembler::vinsertf32x4h(dst, src, 0);
3665   } else {
3666     Assembler::movdqu(dst, src);
3667   }
3668 }
3669 
3670 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3671   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3672     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3673   } else {
3674     Assembler::movdqu(dst, src);
3675   }
3676 }
3677 
3678 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3679   if (reachable(src)) {
3680     movdqu(dst, as_Address(src));
3681   } else {
3682     lea(rscratch1, src);
3683     movdqu(dst, Address(rscratch1, 0));
3684   }
3685 }
3686 
3687 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3688   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3689     Assembler::vextractf64x4h(dst, src, 0);
3690   } else {
3691     Assembler::vmovdqu(dst, src);
3692   }
3693 }
3694 
3695 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3696   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3697     Assembler::vinsertf64x4h(dst, src, 0);
3698   } else {
3699     Assembler::vmovdqu(dst, src);
3700   }
3701 }
3702 
3703 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3704   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3705     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3706   }
3707   else {
3708     Assembler::vmovdqu(dst, src);
3709   }
3710 }
3711 
3712 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3713   if (reachable(src)) {
3714     vmovdqu(dst, as_Address(src));
3715   }
3716   else {
3717     lea(rscratch1, src);
3718     vmovdqu(dst, Address(rscratch1, 0));
3719   }
3720 }
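
The movdqu/vmovdqu overloads above exist because the legacy SSE/VEX encodings can only name xmm0..xmm15; touching xmm16..xmm31 needs an EVEX encoding, and without AVX512VL there is no 128/256-bit EVEX form of the plain move, so the wrappers fall back to vextractf32x4h/vinsertf32x4h (or a full-width evmovdqul for register-to-register copies). The dispatch condition they all repeat, restated as a standalone predicate (illustration, not HotSpot code):

    // Upper-bank XMM registers need an EVEX-encoded move when AVX-512 is in
    // use but the VL extension (128/256-bit EVEX forms) is not available.
    static bool needs_evex_move(int use_avx, bool has_avx512vl, int xmm_encoding) {
      return use_avx > 2 && !has_avx512vl && xmm_encoding > 15;
    }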
3721 
3722 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3723   if (reachable(src)) {
3724     Assembler::movdqa(dst, as_Address(src));
3725   } else {
3726     lea(rscratch1, src);
3727     Assembler::movdqa(dst, Address(rscratch1, 0));
3728   }
3729 }
3730 
3731 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3732   if (reachable(src)) {
3733     Assembler::movsd(dst, as_Address(src));
3734   } else {
3735     lea(rscratch1, src);
3736     Assembler::movsd(dst, Address(rscratch1, 0));
3737   }
3738 }


3768   if (needs_explicit_null_check(offset)) {
3769     // provoke OS NULL exception if reg = NULL by
3770     // accessing M[reg] w/o changing any (non-CC) registers
3771     // NOTE: cmpl is plenty here to provoke a segv
3772     cmpptr(rax, Address(reg, 0));
3773     // Note: should probably use testl(rax, Address(reg, 0));
3774     //       may be shorter code (however, this version of
3775     //       testl needs to be implemented first)
3776   } else {
3777     // nothing to do, (later) access of M[reg + offset]
3778     // will provoke OS NULL exception if reg = NULL
3779   }
3780 }
3781 
3782 void MacroAssembler::os_breakpoint() {
3783   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3784   // (e.g., MSVC can't call ps() otherwise)
3785   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3786 }
3787 
3788 #ifdef _LP64
3789 #define XSTATE_BV 0x200
3790 #endif
3791 
3792 void MacroAssembler::pop_CPU_state() {
3793   pop_FPU_state();
3794   pop_IU_state();
3795 }
3796 
3797 void MacroAssembler::pop_FPU_state() {
3798 #ifndef _LP64
3799   frstor(Address(rsp, 0));
3800 #else
3801   fxrstor(Address(rsp, 0));

3802 #endif
3803   addptr(rsp, FPUStateSizeInWords * wordSize);
3804 }
3805 
3806 void MacroAssembler::pop_IU_state() {
3807   popa();
3808   LP64_ONLY(addq(rsp, 8));
3809   popf();
3810 }
3811 
3812 // Save Integer and Float state
3813 // Warning: Stack must be 16 byte aligned (64bit)
3814 void MacroAssembler::push_CPU_state() {
3815   push_IU_state();
3816   push_FPU_state();
3817 }
3818 
3819 void MacroAssembler::push_FPU_state() {
3820   subptr(rsp, FPUStateSizeInWords * wordSize);
3821 #ifndef _LP64
3822   fnsave(Address(rsp, 0));
3823   fwait();
3824 #else
3825   fxsave(Address(rsp, 0));

3826 #endif // LP64
3827 }
3828 
3829 void MacroAssembler::push_IU_state() {
3830   // Push flags first because pusha kills them
3831   pushf();
3832   // Make sure rsp stays 16-byte aligned
3833   LP64_ONLY(subq(rsp, 8));
3834   pusha();
3835 }
3836 
3837 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3838   // determine java_thread register
3839   if (!java_thread->is_valid()) {
3840     java_thread = rdi;
3841     get_thread(java_thread);
3842   }
3843   // we must set sp to zero to clear frame
3844   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3845   if (clear_fp) {


3997 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3998   if (reachable(src)) {
3999     Assembler::ucomiss(dst, as_Address(src));
4000   } else {
4001     lea(rscratch1, src);
4002     Assembler::ucomiss(dst, Address(rscratch1, 0));
4003   }
4004 }
4005 
4006 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4007   // Used in sign-bit flipping with aligned address.
4008   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4009   if (reachable(src)) {
4010     Assembler::xorpd(dst, as_Address(src));
4011   } else {
4012     lea(rscratch1, src);
4013     Assembler::xorpd(dst, Address(rscratch1, 0));
4014   }
4015 }
4016 
4017 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
4018   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4019     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4020   }
4021   else {
4022     Assembler::xorpd(dst, src);
4023   }
4024 }
4025 
4026 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
4027   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4028     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4029   } else {
4030     Assembler::xorps(dst, src);
4031   }
4032 }
4033 
4034 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4035   // Used in sign-bit flipping with aligned address.
4036   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4037   if (reachable(src)) {
4038     Assembler::xorps(dst, as_Address(src));
4039   } else {
4040     lea(rscratch1, src);
4041     Assembler::xorps(dst, Address(rscratch1, 0));
4042   }
4043 }
4044 
4045 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4046   // Used in sign-bit flipping with aligned address.
4047   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4048   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4049   if (reachable(src)) {
4050     Assembler::pshufb(dst, as_Address(src));
4051   } else {
4052     lea(rscratch1, src);
4053     Assembler::pshufb(dst, Address(rscratch1, 0));


4057 // AVX 3-operands instructions
4058 
4059 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4060   if (reachable(src)) {
4061     vaddsd(dst, nds, as_Address(src));
4062   } else {
4063     lea(rscratch1, src);
4064     vaddsd(dst, nds, Address(rscratch1, 0));
4065   }
4066 }
4067 
4068 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4069   if (reachable(src)) {
4070     vaddss(dst, nds, as_Address(src));
4071   } else {
4072     lea(rscratch1, src);
4073     vaddss(dst, nds, Address(rscratch1, 0));
4074   }
4075 }
4076 
4077 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4078   int dst_enc = dst->encoding();
4079   int nds_enc = nds->encoding();
4080   int src_enc = src->encoding();
4081   if ((dst_enc < 16) && (nds_enc < 16)) {
4082     vandps(dst, nds, negate_field, vector_len);
4083   } else if ((src_enc < 16) && (dst_enc < 16)) {
4084     movss(src, nds);
4085     vandps(dst, src, negate_field, vector_len);
4086   } else if (src_enc < 16) {
4087     movss(src, nds);
4088     vandps(src, src, negate_field, vector_len);
4089     movss(dst, src);
4090   } else if (dst_enc < 16) {
4091     movdqu(src, xmm0);
4092     movss(xmm0, nds);
4093     vandps(dst, xmm0, negate_field, vector_len);
4094     movdqu(xmm0, src);
4095   } else if (nds_enc < 16) {
4096     movdqu(src, xmm0);
4097     vandps(xmm0, nds, negate_field, vector_len);
4098     movss(dst, xmm0);
4099     movdqu(xmm0, src);
4100   } else {
4101     movdqu(src, xmm0);
4102     movss(xmm0, nds);
4103     vandps(xmm0, xmm0, negate_field, vector_len);
4104     movss(dst, xmm0);
4105     movdqu(xmm0, src);
4106   }
4107 }
4108 
4109 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4110   int dst_enc = dst->encoding();
4111   int nds_enc = nds->encoding();
4112   int src_enc = src->encoding();
4113   if ((dst_enc < 16) && (nds_enc < 16)) {
4114     vandpd(dst, nds, negate_field, vector_len);
4115   } else if ((src_enc < 16) && (dst_enc < 16)) {
4116     movsd(src, nds);
4117     vandpd(dst, src, negate_field, vector_len);
4118   } else if (src_enc < 16) {
4119     movsd(src, nds);
4120     vandpd(src, src, negate_field, vector_len);
4121     movsd(dst, src);
4122   } else if (dst_enc < 16) {
4123     movdqu(src, xmm0);
4124     movsd(xmm0, nds);
4125     vandpd(dst, xmm0, negate_field, vector_len);
4126     movdqu(xmm0, src);
4127   } else if (nds_enc < 16) {
4128     movdqu(src, xmm0);
4129     vandpd(xmm0, nds, negate_field, vector_len);
4130     movsd(dst, xmm0);
4131     movdqu(xmm0, src);
4132   } else {
4133     movdqu(src, xmm0);
4134     movsd(xmm0, nds);
4135     vandpd(xmm0, xmm0, negate_field, vector_len);
4136     movsd(dst, xmm0);
4137     movdqu(xmm0, src);
4138   }
4139 }
4140 
4141 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4142   int dst_enc = dst->encoding();
4143   int nds_enc = nds->encoding();
4144   int src_enc = src->encoding();
4145   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4146     Assembler::vpaddb(dst, nds, src, vector_len);
4147   } else if ((dst_enc < 16) && (src_enc < 16)) {
4148     Assembler::vpaddb(dst, dst, src, vector_len);
4149   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4150     // use nds as scratch for src
4151     evmovdqul(nds, src, Assembler::AVX_512bit);
4152     Assembler::vpaddb(dst, dst, nds, vector_len);
4153   } else if ((src_enc < 16) && (nds_enc < 16)) {
4154     // use nds as scratch for dst
4155     evmovdqul(nds, dst, Assembler::AVX_512bit);
4156     Assembler::vpaddb(nds, nds, src, vector_len);
4157     evmovdqul(dst, nds, Assembler::AVX_512bit);
4158   } else if (dst_enc < 16) {
4159     // use nds as scratch for xmm0 to hold src
4160     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4161     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4162     Assembler::vpaddb(dst, dst, xmm0, vector_len);
4163     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4164   } else {
4165     // worst case scenario, all regs are in the upper bank
4166     subptr(rsp, 64);
4167     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4168     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4169     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4170     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4171     Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4172     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4173     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4174     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4175     addptr(rsp, 64);
4176   }
4177 }
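
vpaddb is the first of a family of helpers (vpaddw, vpsubb, vpsubw, vpmullw and the vps*w shifts below) that make the same operand-placement decision when AVX-512 is present without the BW/VL extensions, so the 128/256-bit integer forms cannot address xmm16..xmm31. The shared decision order, pulled out as a standalone sketch (hypothetical helper, not in the file):

    enum class Plan {
      Direct,      // dst and src already in the lower bank: operate in place
      SrcViaNds,   // copy src into nds (lower bank) and use it as the source
      DstViaNds,   // compute into nds, then copy the result back to dst
      SrcViaXmm0,  // park xmm0 in nds, stage src in xmm0
      SpillXmm1    // everything in the upper bank: additionally spill xmm1 to the stack
    };

    static Plan pick_plan(int dst_enc, int nds_enc, int src_enc) {
      if (dst_enc < 16 && src_enc < 16) return Plan::Direct;
      if (dst_enc < 16 && nds_enc < 16) return Plan::SrcViaNds;
      if (src_enc < 16 && nds_enc < 16) return Plan::DstViaNds;
      if (dst_enc < 16)                 return Plan::SrcViaXmm0;
      return Plan::SpillXmm1;
    }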
4178 
4179 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4180   int dst_enc = dst->encoding();
4181   int nds_enc = nds->encoding();
4182   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4183     Assembler::vpaddb(dst, nds, src, vector_len);
4184   } else if (dst_enc < 16) {
4185     Assembler::vpaddb(dst, dst, src, vector_len);
4186   } else if (nds_enc < 16) {
4187     // implies dst_enc is in the upper bank; use nds as scratch
4188     evmovdqul(nds, dst, Assembler::AVX_512bit);
4189     Assembler::vpaddb(nds, nds, src, vector_len);
4190     evmovdqul(dst, nds, Assembler::AVX_512bit);
4191   } else {
4192     // worst case scenario, all regs in upper bank
4193     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4194     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4195     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4196     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4197   }
4198 }
4199 
4200 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4201   int dst_enc = dst->encoding();
4202   int nds_enc = nds->encoding();
4203   int src_enc = src->encoding();
4204   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4205     Assembler::vpaddw(dst, nds, src, vector_len);
4206   } else if ((dst_enc < 16) && (src_enc < 16)) {
4207     Assembler::vpaddw(dst, dst, src, vector_len);
4208   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4209     // use nds as scratch for src
4210     evmovdqul(nds, src, Assembler::AVX_512bit);
4211     Assembler::vpaddw(dst, dst, nds, vector_len);
4212   } else if ((src_enc < 16) && (nds_enc < 16)) {
4213     // use nds as scratch for dst
4214     evmovdqul(nds, dst, Assembler::AVX_512bit);
4215     Assembler::vpaddw(nds, nds, src, vector_len);
4216     evmovdqul(dst, nds, Assembler::AVX_512bit);
4217   } else if (dst_enc < 16) {
4218     // use nds as scratch for xmm0 to hold src
4219     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4220     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4221     Assembler::vpaddw(dst, dst, xmm0, vector_len);
4222     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4223   } else {
4224     // worst case scenario, all regs are in the upper bank
4225     subptr(rsp, 64);
4226     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4227     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4228     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4229     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4230     Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4231     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4232     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4233     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4234     addptr(rsp, 64);
4235   }
4236 }
4237 
4238 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4239   int dst_enc = dst->encoding();
4240   int nds_enc = nds->encoding();
4241   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4242     Assembler::vpaddw(dst, nds, src, vector_len);
4243   } else if (dst_enc < 16) {
4244     Assembler::vpaddw(dst, dst, src, vector_len);
4245   } else if (nds_enc < 16) {
4246     // implies dst_enc is in the upper bank; use nds as scratch
4247     evmovdqul(nds, dst, Assembler::AVX_512bit);
4248     Assembler::vpaddw(nds, nds, src, vector_len);
4249     evmovdqul(dst, nds, Assembler::AVX_512bit);
4250   } else {
4251     // worst case scenario, all regs in upper bank
4252     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4253     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4254     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4255     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4256   }
4257 }
4258 
4259 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4260   int dst_enc = dst->encoding();
4261   int nds_enc = nds->encoding();
4262   int src_enc = src->encoding();
4263   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4264     Assembler::vpsubb(dst, nds, src, vector_len);
4265   } else if ((dst_enc < 16) && (src_enc < 16)) {
4266     Assembler::vpsubb(dst, dst, src, vector_len);
4267   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4268     // use nds as scratch for src
4269     evmovdqul(nds, src, Assembler::AVX_512bit);
4270     Assembler::vpsubb(dst, dst, nds, vector_len);
4271   } else if ((src_enc < 16) && (nds_enc < 16)) {
4272     // use nds as scratch for dst
4273     evmovdqul(nds, dst, Assembler::AVX_512bit);
4274     Assembler::vpsubb(nds, nds, src, vector_len);
4275     evmovdqul(dst, nds, Assembler::AVX_512bit);
4276   } else if (dst_enc < 16) {
4277     // use nds as scratch for xmm0 to hold src
4278     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4279     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4280     Assembler::vpsubb(dst, dst, xmm0, vector_len);
4281     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4282   } else {
4283     // worst case scenario, all regs are in the upper bank
4284     subptr(rsp, 64);
4285     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4286     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4287     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4288     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4289     Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4290     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4291     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4292     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4293     addptr(rsp, 64);
4294   }
4295 }
4296 
4297 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4298   int dst_enc = dst->encoding();
4299   int nds_enc = nds->encoding();
4300   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4301     Assembler::vpsubb(dst, nds, src, vector_len);
4302   } else if (dst_enc < 16) {
4303     Assembler::vpsubb(dst, dst, src, vector_len);
4304   } else if (nds_enc < 16) {
4305     // implies dst_enc is in the upper bank; use nds as scratch
4306     evmovdqul(nds, dst, Assembler::AVX_512bit);
4307     Assembler::vpsubb(nds, nds, src, vector_len);
4308     evmovdqul(dst, nds, Assembler::AVX_512bit);
4309   } else {
4310     // worst case scenario, all regs in upper bank
4311     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4312     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4313     Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4314     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4315   }
4316 }
4317 
4318 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4319   int dst_enc = dst->encoding();
4320   int nds_enc = nds->encoding();
4321   int src_enc = src->encoding();
4322   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4323     Assembler::vpsubw(dst, nds, src, vector_len);
4324   } else if ((dst_enc < 16) && (src_enc < 16)) {
4325     Assembler::vpsubw(dst, dst, src, vector_len);
4326   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4327     // use nds as scratch for src
4328     evmovdqul(nds, src, Assembler::AVX_512bit);
4329     Assembler::vpsubw(dst, dst, nds, vector_len);
4330   } else if ((src_enc < 16) && (nds_enc < 16)) {
4331     // use nds as scratch for dst
4332     evmovdqul(nds, dst, Assembler::AVX_512bit);
4333     Assembler::vpsubw(nds, nds, src, vector_len);
4334     evmovdqul(dst, nds, Assembler::AVX_512bit);
4335   } else if (dst_enc < 16) {
4336     // use nds as scratch for xmm0 to hold src
4337     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4338     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4339     Assembler::vpsubw(dst, dst, xmm0, vector_len);
4340     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4341   } else {
4342     // worst case scenario, all regs are in the upper bank
4343     subptr(rsp, 64);
4344     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4345     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4346     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4347     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4348     Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4349     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4350     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4351     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4352     addptr(rsp, 64);
4353   }
4354 }
4355 
4356 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4357   int dst_enc = dst->encoding();
4358   int nds_enc = nds->encoding();
4359   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4360     Assembler::vpsubw(dst, nds, src, vector_len);
4361   } else if (dst_enc < 16) {
4362     Assembler::vpsubw(dst, dst, src, vector_len);
4363   } else if (nds_enc < 16) {
4364     // implies dst_enc is in the upper bank; use nds as scratch
4365     evmovdqul(nds, dst, Assembler::AVX_512bit);
4366     Assembler::vpsubw(nds, nds, src, vector_len);
4367     evmovdqul(dst, nds, Assembler::AVX_512bit);
4368   } else {
4369     // worst case scenario, all regs in upper bank
4370     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4371     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4372     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4373     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4374   }
4375 }
4376 
4377 
4378 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4379   int dst_enc = dst->encoding();
4380   int nds_enc = nds->encoding();
4381   int src_enc = src->encoding();
4382   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4383     Assembler::vpmullw(dst, nds, src, vector_len);
4384   } else if ((dst_enc < 16) && (src_enc < 16)) {
4385     Assembler::vpmullw(dst, dst, src, vector_len);
4386   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4387     // use nds as scratch for src
4388     evmovdqul(nds, src, Assembler::AVX_512bit);
4389     Assembler::vpmullw(dst, dst, nds, vector_len);
4390   } else if ((src_enc < 16) && (nds_enc < 16)) {
4391     // use nds as scratch for dst
4392     evmovdqul(nds, dst, Assembler::AVX_512bit);
4393     Assembler::vpmullw(nds, nds, src, vector_len);
4394     evmovdqul(dst, nds, Assembler::AVX_512bit);
4395   } else if (dst_enc < 16) {
4396     // use nds as scratch for xmm0 to hold src
4397     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4398     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4399     Assembler::vpmullw(dst, dst, xmm0, vector_len);
4400     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4401   } else {
4402     // worst case scenario, all regs are in the upper bank
4403     subptr(rsp, 64);
4404     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4405     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4406     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4407     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4408     Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4409     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4410     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4411     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4412     addptr(rsp, 64);
4413   }
4414 }
4415 
4416 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4417   int dst_enc = dst->encoding();
4418   int nds_enc = nds->encoding();
4419   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4420     Assembler::vpmullw(dst, nds, src, vector_len);
4421   } else if (dst_enc < 16) {
4422     Assembler::vpmullw(dst, dst, src, vector_len);
4423   } else if (nds_enc < 16) {
4424     // implies dst_enc is in the upper bank; use nds as scratch
4425     evmovdqul(nds, dst, Assembler::AVX_512bit);
4426     Assembler::vpmullw(nds, nds, src, vector_len);
4427     evmovdqul(dst, nds, Assembler::AVX_512bit);
4428   } else {
4429     // worst case scenario, all regs in upper bank
4430     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4431     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4432     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4433     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4434   }
4435 }
4436 
4437 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4438   int dst_enc = dst->encoding();
4439   int nds_enc = nds->encoding();
4440   int shift_enc = shift->encoding();
4441   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4442     Assembler::vpsraw(dst, nds, shift, vector_len);
4443   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4444     Assembler::vpsraw(dst, dst, shift, vector_len);
4445   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4446     // use nds as scratch for shift
4447     evmovdqul(nds, shift, Assembler::AVX_512bit);
4448     Assembler::vpsraw(dst, dst, nds, vector_len);
4449   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4450     // use nds as scratch for dst
4451     evmovdqul(nds, dst, Assembler::AVX_512bit);
4452     Assembler::vpsraw(nds, nds, shift, vector_len);
4453     evmovdqul(dst, nds, Assembler::AVX_512bit);
4454   } else if (dst_enc < 16) {
4455     // use nds to save a copy of xmm0 and hold shift
4456     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4457     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4458     Assembler::vpsraw(dst, dst, xmm0, vector_len);
4459     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4460   } else if (nds_enc < 16) {
4461     // use nds and dst as temps
4462     evmovdqul(nds, dst, Assembler::AVX_512bit);
4463     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4464     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4465     Assembler::vpsraw(nds, nds, xmm0, vector_len);
4466     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4467     evmovdqul(dst, nds, Assembler::AVX_512bit);
4468   } else {
4469     // worst case scenario, all regs are in the upper bank
4470     subptr(rsp, 64);
4471     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4472     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4473     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4474     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4475     Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4476     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4477     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4478     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4479     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4480     addptr(rsp, 64);
4481   }
4482 }
4483 
4484 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4485   int dst_enc = dst->encoding();
4486   int nds_enc = nds->encoding();
4487   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4488     Assembler::vpsraw(dst, nds, shift, vector_len);
4489   } else if (dst_enc < 16) {
4490     Assembler::vpsraw(dst, dst, shift, vector_len);
4491   } else if (nds_enc < 16) {
4492     // use nds as scratch
4493     evmovdqul(nds, dst, Assembler::AVX_512bit);
4494     Assembler::vpsraw(nds, nds, shift, vector_len);
4495     evmovdqul(dst, nds, Assembler::AVX_512bit);
4496   } else {
4497     // use nds as scratch for xmm0
4498     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4499     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4500     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4501     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4502   }
4503 }
4504 
4505 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4506   int dst_enc = dst->encoding();
4507   int nds_enc = nds->encoding();
4508   int shift_enc = shift->encoding();
4509   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4510     Assembler::vpsrlw(dst, nds, shift, vector_len);
4511   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4512     Assembler::vpsrlw(dst, dst, shift, vector_len);
4513   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4514     // use nds as scratch for shift
4515     evmovdqul(nds, shift, Assembler::AVX_512bit);
4516     Assembler::vpsrlw(dst, dst, nds, vector_len);
4517   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4518     // use nds as scratch for dst
4519     evmovdqul(nds, dst, Assembler::AVX_512bit);
4520     Assembler::vpsrlw(nds, nds, shift, vector_len);
4521     evmovdqul(dst, nds, Assembler::AVX_512bit);
4522   } else if (dst_enc < 16) {
4523     // use nds to save a copy of xmm0 and hold shift
4524     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4525     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4526     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4527     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4528   } else if (nds_enc < 16) {
4529     // use nds and dst as temps
4530     evmovdqul(nds, dst, Assembler::AVX_512bit);
4531     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4532     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4533     Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4534     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4535     evmovdqul(dst, nds, Assembler::AVX_512bit);
4536   } else {
4537     // worst case scenario, all regs are in the upper bank
4538     subptr(rsp, 64);
4539     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4540     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4541     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4542     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4543     Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4544     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4545     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4546     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4547     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4548     addptr(rsp, 64);
4549   }
4550 }
4551 
4552 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4553   int dst_enc = dst->encoding();
4554   int nds_enc = nds->encoding();
4555   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4556     Assembler::vpsrlw(dst, nds, shift, vector_len);
4557   } else if (dst_enc < 16) {
4558     Assembler::vpsrlw(dst, dst, shift, vector_len);
4559   } else if (nds_enc < 16) {
4560     // use nds as scratch
4561     evmovdqul(nds, dst, Assembler::AVX_512bit);
4562     Assembler::vpsrlw(nds, nds, shift, vector_len);
4563     evmovdqul(dst, nds, Assembler::AVX_512bit);
4564   } else {
4565     // use nds as scratch for xmm0
4566     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4567     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4568     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4569     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4570   }
4571 }
4572 
4573 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4574   int dst_enc = dst->encoding();
4575   int nds_enc = nds->encoding();
4576   int shift_enc = shift->encoding();
4577   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4578     Assembler::vpsllw(dst, nds, shift, vector_len);
4579   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4580     Assembler::vpsllw(dst, dst, shift, vector_len);
4581   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4582     // use nds as scratch for shift
4583     evmovdqul(nds, shift, Assembler::AVX_512bit);
4584     Assembler::vpsllw(dst, dst, nds, vector_len);
4585   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4586     // use nds as scratch for dst
4587     evmovdqul(nds, dst, Assembler::AVX_512bit);
4588     Assembler::vpsllw(nds, nds, shift, vector_len);
4589     evmovdqul(dst, nds, Assembler::AVX_512bit);
4590   } else if (dst_enc < 16) {
4591     // use nds to save a copy of xmm0 and hold shift
4592     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4593     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4594     Assembler::vpsllw(dst, dst, xmm0, vector_len);
4595     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4596   } else if (nds_enc < 16) {
4597     // use nds and dst as temps
4598     evmovdqul(nds, dst, Assembler::AVX_512bit);
4599     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4600     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4601     Assembler::vpsllw(nds, nds, xmm0, vector_len);
4602     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4603     evmovdqul(dst, nds, Assembler::AVX_512bit);
4604   } else {
4605     // worst-case scenario: all regs are in the upper bank
4606     subptr(rsp, 64);
4607     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4608     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4609     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4610     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4611     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4612     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4613     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4614     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4615     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4616     addptr(rsp, 64);
4617   }
4618 }
4619 
4620 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4621   int dst_enc = dst->encoding();
4622   int nds_enc = nds->encoding();
4623   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4624     Assembler::vpsllw(dst, nds, shift, vector_len);
4625   } else if (dst_enc < 16) {
4626     Assembler::vpsllw(dst, dst, shift, vector_len);
4627   } else if (nds_enc < 16) {
4628     // use nds as scratch
4629     evmovdqul(nds, dst, Assembler::AVX_512bit);
4630     Assembler::vpsllw(nds, nds, shift, vector_len);
4631     evmovdqul(dst, nds, Assembler::AVX_512bit);
4632   } else {
4633     // use nds as scratch for xmm0
4634     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4635     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4636     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4637     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4638   }
4639 }
4640 
4641 // This instruction is used inside other macro-assembler patterns, so we cannot
4642 // control which registers it receives when it is emitted through those patterns.
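     // On AVX-512 parts without AVX512BW (supports_avx512nobw()) punpcklbw has no
     // EVEX encoding and cannot address xmm16-xmm31, so high-bank operands are
     // copied through xmm0/xmm1, which are spilled to the stack around the
     // operation.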
4643 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4644   if (VM_Version::supports_avx512nobw()) {
4645     int dst_enc = dst->encoding();
4646     int src_enc = src->encoding();
4647     if (dst_enc == src_enc) {
4648       if (dst_enc < 16) {
4649         Assembler::punpcklbw(dst, src);
4650       } else {
4651         subptr(rsp, 64);
4652         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4653         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4654         Assembler::punpcklbw(xmm0, xmm0);
4655         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4656         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4657         addptr(rsp, 64);
4658       }
4659     } else {
4660       if ((src_enc < 16) && (dst_enc < 16)) {
4661         Assembler::punpcklbw(dst, src);
4662       } else if (src_enc < 16) {
4663         subptr(rsp, 64);
4664         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4665         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4666         Assembler::punpcklbw(xmm0, src);
4667         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4668         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4669         addptr(rsp, 64);
4670       } else if (dst_enc < 16) {
4671         subptr(rsp, 64);
4672         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4673         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4674         Assembler::punpcklbw(dst, xmm0);
4675         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4676         addptr(rsp, 64);
4677       } else {
4678         subptr(rsp, 64);
4679         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4680         subptr(rsp, 64);
4681         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4682         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4683         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4684         Assembler::punpcklbw(xmm0, xmm1);
4685         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4686         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4687         addptr(rsp, 64);
4688         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4689         addptr(rsp, 64);
4690       }
4691     }
4692   } else {
4693     Assembler::punpcklbw(dst, src);
4694   }
4695 }
4696 
4697 // This instruction is used inside other macro-assembler patterns, so we cannot
4698 // control which registers it receives when it is emitted through those patterns.
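     // Same constraint as punpcklbw above: without AVX512BW pshuflw cannot address
     // xmm16-xmm31, so high-bank operands are staged through xmm0/xmm1.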
4699 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4700   if (VM_Version::supports_avx512nobw()) {
4701     int dst_enc = dst->encoding();
4702     int src_enc = src->encoding();
4703     if (dst_enc == src_enc) {
4704       if (dst_enc < 16) {
4705         Assembler::pshuflw(dst, src, mode);
4706       } else {
4707         subptr(rsp, 64);
4708         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4709         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4710         Assembler::pshuflw(xmm0, xmm0, mode);
4711         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4712         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4713         addptr(rsp, 64);
4714       }
4715     } else {
4716       if ((src_enc < 16) && (dst_enc < 16)) {
4717         Assembler::pshuflw(dst, src, mode);
4718       } else if (src_enc < 16) {
4719         subptr(rsp, 64);
4720         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4721         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4722         Assembler::pshuflw(xmm0, src, mode);
4723         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4724         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4725         addptr(rsp, 64);
4726       } else if (dst_enc < 16) {
4727         subptr(rsp, 64);
4728         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4729         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4730         Assembler::pshuflw(dst, xmm0, mode);
4731         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4732         addptr(rsp, 64);
4733       } else {
4734         subptr(rsp, 64);
4735         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4736         subptr(rsp, 64);
4737         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4738         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4739         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4740         Assembler::pshuflw(xmm0, xmm1, mode);
4741         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4742         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4743         addptr(rsp, 64);
4744         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4745         addptr(rsp, 64);
4746       }
4747     }
4748   } else {
4749     Assembler::pshuflw(dst, src, mode);
4750   }
4751 }
4752 
4753 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4754   if (reachable(src)) {
4755     vandpd(dst, nds, as_Address(src), vector_len);
4756   } else {
4757     lea(rscratch1, src);
4758     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4759   }
4760 }
4761 
4762 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4763   if (reachable(src)) {
4764     vandps(dst, nds, as_Address(src), vector_len);
4765   } else {
4766     lea(rscratch1, src);
4767     vandps(dst, nds, Address(rscratch1, 0), vector_len);
4768   }
4769 }
4770 
4771 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4772   if (reachable(src)) {


4816 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4817   if (reachable(src)) {
4818     vsubss(dst, nds, as_Address(src));
4819   } else {
4820     lea(rscratch1, src);
4821     vsubss(dst, nds, Address(rscratch1, 0));
4822   }
4823 }
4824 
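     // Negate a single-precision value by XORing its sign bit with the mask that
     // src is expected to reference. On AVX-512 parts without AVX512VL
     // (supports_avx512novl()) the 128-bit vxorps cannot address xmm16-xmm31, so
     // an upper-bank dst is staged through xmm0 (spilled to the stack), and an
     // upper-bank nds is first copied into dst with movflt.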
4825 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4826   int nds_enc = nds->encoding();
4827   int dst_enc = dst->encoding();
4828   bool dst_upper_bank = (dst_enc > 15);
4829   bool nds_upper_bank = (nds_enc > 15);
4830   if (VM_Version::supports_avx512novl() &&
4831       (nds_upper_bank || dst_upper_bank)) {
4832     if (dst_upper_bank) {
4833       subptr(rsp, 64);
4834       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4835       movflt(xmm0, nds);
4836       vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);





4837       movflt(dst, xmm0);
4838       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4839       addptr(rsp, 64);
4840     } else {
4841       movflt(dst, nds);
4842       vxorps(dst, dst, src, Assembler::AVX_128bit);





4843     }
4844   } else {
4845     vxorps(dst, nds, src, Assembler::AVX_128bit);





4846   }
4847 }
4848 
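     // Double-precision counterpart of vnegatess: identical upper-bank staging,
     // using movdbl and vxorpd on the sign-bit mask that src is expected to
     // reference.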
4849 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4850   int nds_enc = nds->encoding();
4851   int dst_enc = dst->encoding();
4852   bool dst_upper_bank = (dst_enc > 15);
4853   bool nds_upper_bank = (nds_enc > 15);
4854   if (VM_Version::supports_avx512novl() &&
4855       (nds_upper_bank || dst_upper_bank)) {
4856     if (dst_upper_bank) {
4857       subptr(rsp, 64);
4858       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4859       movdbl(xmm0, nds);
4860       vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);





4861       movdbl(dst, xmm0);
4862       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4863       addptr(rsp, 64);
4864     } else {
4865       movdbl(dst, nds);
4866       vxorpd(dst, dst, src, Assembler::AVX_128bit);





4867     }
4868   } else {
4869     vxorpd(dst, nds, src, Assembler::AVX_128bit);





4870   }
4871 }
4872 
4873 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4874   if (reachable(src)) {
4875     vxorpd(dst, nds, as_Address(src), vector_len);
4876   } else {
4877     lea(rscratch1, src);
4878     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4879   }
4880 }
4881 
4882 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4883   if (reachable(src)) {
4884     vxorps(dst, nds, as_Address(src), vector_len);
4885   } else {
4886     lea(rscratch1, src);
4887     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4888   }
4889 }


5341 #ifdef _LP64
5342   if (var_size_in_bytes->is_valid()) {
5343     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5344   } else {
5345     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5346   }
5347 #else
5348   if (var_size_in_bytes->is_valid()) {
5349     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5350   } else {
5351     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5352   }
5353   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
5354 #endif
5355 }
5356 
5357 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
5358   pusha();
5359 
5360   // If we are coming from C1, the XMM registers may be live.
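       // Depending on UseSSE/UseAVX and MaxVectorSize, the code below spills the
       // upper halves of the ZMM and YMM registers and then the low 128 bits of
       // every XMM register to the stack; they are restored in reverse order after
       // the runtime call.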

5361   int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
5362   if (UseAVX > 2) {
5363     num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
5364   }
5365 
5366   if (UseSSE == 1)  {
5367     subptr(rsp, sizeof(jdouble)*8);
5368     for (int n = 0; n < 8; n++) {
5369       movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
5370     }
5371   } else if (UseSSE >= 2)  {
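         // Re-initialize opmask k1 to all-ones before emitting EVEX instructions;
         // the EVEX code emitted below appears to rely on k1 holding a full mask.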
5372     if (UseAVX > 2) {
5373       push(rbx);
5374       movl(rbx, 0xffff);
5375       kmovwl(k1, rbx);
5376       pop(rbx);
5377     }
5378 #ifdef COMPILER2
5379     if (MaxVectorSize > 16) {
5380       if (UseAVX > 2) {
5381         // Save upper half of ZMM registers
5382         subptr(rsp, 32*num_xmm_regs);
5383         for (int n = 0; n < num_xmm_regs; n++) {
5384           vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
5385         }

5386       }
5387       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
5388       // Save upper half of YMM registers
5389       subptr(rsp, 16*num_xmm_regs);
5390       for (int n = 0; n < num_xmm_regs; n++) {
5391         vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
5392       }
5393     }
5394 #endif
5395     // Save whole 128bit (16 bytes) XMM registers
5396     subptr(rsp, 16*num_xmm_regs);

5397 #ifdef _LP64
5398     if (VM_Version::supports_evex()) {
5399       for (int n = 0; n < num_xmm_regs; n++) {
5400         vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
5401       }
5402     } else {
5403       for (int n = 0; n < num_xmm_regs; n++) {
5404         movdqu(Address(rsp, n*16), as_XMMRegister(n));
5405       }
5406     }
5407 #else
5408     for (int n = 0; n < num_xmm_regs; n++) {
5409       movdqu(Address(rsp, n*16), as_XMMRegister(n));
5410     }
5411 #endif
5412   }
5413 
5414   // Preserve registers across runtime call
5415   int incoming_argument_and_return_value_offset = -1;
5416   if (num_fpu_regs_in_use > 1) {
5417     // Must preserve all other FPU regs (could alternatively convert
5418     // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
5419     // FPU state, but cannot trust the C compiler)
5420     NEEDS_CLEANUP;
5421     // NOTE that in this case we also push the incoming argument(s) to
5422     // the stack and restore them later; we also use this stack slot to
5423     // hold the return value from dsin, dcos etc.
5424     for (int i = 0; i < num_fpu_regs_in_use; i++) {
5425       subptr(rsp, sizeof(jdouble));
5426       fstp_d(Address(rsp, 0));
5427     }
5428     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
5429     for (int i = nb_args-1; i >= 0; i--) {


5458 
5459   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
5460 
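       // On 64-bit the runtime result comes back in xmm0; bounce it through the
       // stack so it ends up on the x87 FPU stack, where the remainder of this
       // code expects it.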
5461 #ifdef _LP64
5462   movsd(Address(rsp, 0), xmm0);
5463   fld_d(Address(rsp, 0));
5464 #endif // _LP64
5465   addptr(rsp, sizeof(jdouble)*nb_args);
5466   if (num_fpu_regs_in_use > 1) {
5467     // Must save return value to stack and then restore entire FPU
5468     // stack except incoming arguments
5469     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
5470     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
5471       fld_d(Address(rsp, 0));
5472       addptr(rsp, sizeof(jdouble));
5473     }
5474     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
5475     addptr(rsp, sizeof(jdouble)*nb_args);
5476   }
5477 

5478   if (UseSSE == 1)  {
5479     for (int n = 0; n < 8; n++) {
5480       movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
5481     }
5482     addptr(rsp, sizeof(jdouble)*8);
5483   } else if (UseSSE >= 2)  {
5484     // Restore whole 128bit (16 bytes) XMM registers
5485 #ifdef _LP64
5486     if (VM_Version::supports_evex()) {
5487       for (int n = 0; n < num_xmm_regs; n++) {
5488         vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
5489       }
5490     } else {

5491       for (int n = 0; n < num_xmm_regs; n++) {
5492         movdqu(as_XMMRegister(n), Address(rsp, n*16));
5493       }
5494     }
5495 #else
5496     for (int n = 0; n < num_xmm_regs; n++) {
5497       movdqu(as_XMMRegister(n), Address(rsp, n*16));
5498     }
5499 #endif
5500     addptr(rsp, 16*num_xmm_regs);
5501 
5502 #ifdef COMPILER2
5503     if (MaxVectorSize > 16) {
5504       // Restore upper half of YMM registers.

5505       for (int n = 0; n < num_xmm_regs; n++) {
5506         vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
5507       }
5508       addptr(rsp, 16*num_xmm_regs);
5509       if (UseAVX > 2) {

5510         for (int n = 0; n < num_xmm_regs; n++) {
5511           vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
5512         }
5513         addptr(rsp, 32*num_xmm_regs);
5514       }
5515     }
5516 #endif
5517   }
5518   popa();
5519 }
5520 
5521 static const double     pi_4 =  0.7853981633974483;
5522 
5523 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
5524   // A hand-coded argument reduction for values with fabs(x) in (pi/4, pi/2)
5525   // was attempted in this code; unfortunately it appears that the
5526   // switch to 80-bit precision and back causes this to be
5527   // unprofitable compared with simply performing a runtime call if
5528   // the argument is out of the (-pi/4, pi/4) range.
5529 
5530   Register tmp = noreg;
5531   if (!VM_Version::supports_cmov()) {


7477   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7478         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7479         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7480         FOUND_SEQ_CHAR, DONE_LABEL;
7481 
7482   movptr(result, str1);
7483   if (UseAVX >= 2) {
7484     cmpl(cnt1, stride);
7485     jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7486     cmpl(cnt1, 2*stride);
7487     jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
7488     movdl(vec1, ch);
7489     vpbroadcastw(vec1, vec1);
7490     vpxor(vec2, vec2);
7491     movl(tmp, cnt1);
7492     andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
7493     andl(cnt1, 0x0000000F); // tail count (in chars)
7494 
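         // 16-char main loop: vpcmpeqw sets matching word lanes to all-ones;
         // vptest against the zero register vec2 sets CF only when the compare
         // result is entirely zero, so carryClear means a match was found.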
7495     bind(SCAN_TO_16_CHAR_LOOP);
7496     vmovdqu(vec3, Address(result, 0));
7497     vpcmpeqw(vec3, vec3, vec1, 1);
7498     vptest(vec2, vec3);
7499     jcc(Assembler::carryClear, FOUND_CHAR);
7500     addptr(result, 32);
7501     subl(tmp, 2*stride);
7502     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7503     jmp(SCAN_TO_8_CHAR);
7504     bind(SCAN_TO_8_CHAR_INIT);
7505     movdl(vec1, ch);
7506     pshuflw(vec1, vec1, 0x00);
7507     pshufd(vec1, vec1, 0);
7508     pxor(vec2, vec2);
7509   }
7510   if (UseAVX >= 2 || UseSSE42Intrinsics) {
7511     bind(SCAN_TO_8_CHAR);
7512     cmpl(cnt1, stride);
7513     if (UseAVX >= 2) {
7514       jccb(Assembler::less, SCAN_TO_CHAR);
7515     }
7516     if (!(UseAVX >= 2)) {
7517       jccb(Assembler::less, SCAN_TO_CHAR_LOOP);


8317       }
8318       movdl(xtmp, value);
8319       if (UseAVX > 2 && UseUnalignedLoadStores) {
8320         // Fill 64-byte chunks
8321         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
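             // Broadcast the 32-bit fill pattern into all sixteen dword lanes of
             // xtmp (a full ZMM register) so each evmovdqul store below writes 64
             // bytes of the pattern.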
8322         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8323 
8324         subl(count, 16 << shift);
8325         jcc(Assembler::less, L_check_fill_32_bytes);
8326         align(16);
8327 
8328         BIND(L_fill_64_bytes_loop);
8329         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8330         addptr(to, 64);
8331         subl(count, 16 << shift);
8332         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8333 
8334         BIND(L_check_fill_32_bytes);
8335         addl(count, 8 << shift);
8336         jccb(Assembler::less, L_check_fill_8_bytes);
8337         vmovdqu(Address(to, 0), xtmp);
8338         addptr(to, 32);
8339         subl(count, 8 << shift);
8340 
8341         BIND(L_check_fill_8_bytes);
8342       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8343         // Fill 64-byte chunks
8344         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8345         vpbroadcastd(xtmp, xtmp);
8346 
8347         subl(count, 16 << shift);
8348         jcc(Assembler::less, L_check_fill_32_bytes);
8349         align(16);
8350 
8351         BIND(L_fill_64_bytes_loop);
8352         vmovdqu(Address(to, 0), xtmp);
8353         vmovdqu(Address(to, 32), xtmp);
8354         addptr(to, 64);
8355         subl(count, 16 << shift);
8356         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8357 

