3631 }
3632 }
3633
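// The movptr family picks the pointer-width move at compile time:
// movq/mov64 on LP64 builds, movl on 32-bit builds.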
3634 void MacroAssembler::movptr(Register dst, Register src) {
3635 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3636 }
3637
3638 void MacroAssembler::movptr(Register dst, Address src) {
3639 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3640 }
3641
3642 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3643 void MacroAssembler::movptr(Register dst, intptr_t src) {
3644 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3645 }
3646
3647 void MacroAssembler::movptr(Address dst, Register src) {
3648 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3649 }
3650
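// Pattern used by the AddressLiteral overloads below: if the target is
// RIP-reachable it is addressed directly; otherwise the address is
// materialized into rscratch1 and an indirect [rscratch1] operand is used.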
3651 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3652 if (reachable(src)) {
3653 Assembler::movdqu(dst, as_Address(src));
3654 } else {
3655 lea(rscratch1, src);
3656 Assembler::movdqu(dst, Address(rscratch1, 0));
3657 }
3658 }
3659
3660 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3661 if (reachable(src)) {
3662 Assembler::movdqa(dst, as_Address(src));
3663 } else {
3664 lea(rscratch1, src);
3665 Assembler::movdqa(dst, Address(rscratch1, 0));
3666 }
3667 }
3668
3669 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3670 if (reachable(src)) {
3671 Assembler::movsd(dst, as_Address(src));
3672 } else {
3673 lea(rscratch1, src);
3674 Assembler::movsd(dst, Address(rscratch1, 0));
3675 }
3676 }
3706 if (needs_explicit_null_check(offset)) {
3707 // provoke OS NULL exception if reg = NULL by
3708 // accessing M[reg] w/o changing any (non-CC) registers
3709 // NOTE: cmpl is plenty here to provoke a segv
3710 cmpptr(rax, Address(reg, 0));
3711 // Note: should probably use testl(rax, Address(reg, 0));
3712 // may be shorter code (however, this version of
3713 // testl needs to be implemented first)
3714 } else {
3715 // nothing to do, (later) access of M[reg + offset]
3716 // will provoke OS NULL exception if reg = NULL
3717 }
3718 }
3719
3720 void MacroAssembler::os_breakpoint() {
3721 // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3722 // (e.g., MSVC can't call ps() otherwise)
3723 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3724 }
3725
3726 void MacroAssembler::pop_CPU_state() {
3727 pop_FPU_state();
3728 pop_IU_state();
3729 }
3730
3731 void MacroAssembler::pop_FPU_state() {
3732 #ifndef _LP64
3733 frstor(Address(rsp, 0));
3734 #else
3735 // AVX will continue to use the fxsave area.
3736 // EVEX needs to utilize the xsave area, which is under different
3737 // management.
3738 if (VM_Version::supports_evex()) {
3739 // EDX:EAX describe the XSAVE header and
3740 // are obtained while fetching info for XCR0 via cpuid.
3741 // These two registers make up 64 bits of the header, of which bits
3742 // 62:10 are currently reserved for future implementations and unused. Bit 63
3743 // is unused for our implementation as we do not utilize
3744 // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
3745 // the functionality for PKRU state and MSR tracing.
3746 // Ergo we are primarily concerned with bits 7..0, which define
3747 // which ISA extensions and features are enabled for a given machine.
3748 // These are captured in XemXcr0Eax and are used to map the XSAVE area
3749 // for restoring registers as described via XCR0.
3750 movl(rdx, VM_Version::get_xsave_header_upper_segment());
3751 movl(rax, VM_Version::get_xsave_header_lower_segment());
3752 xrstor(Address(rsp, 0));
3753 } else {
3754 fxrstor(Address(rsp, 0));
3755 }
3756 #endif
3757 addptr(rsp, FPUStateSizeInWords * wordSize);
3758 }
3759
3760 void MacroAssembler::pop_IU_state() {
3761 popa();
3762 LP64_ONLY(addq(rsp, 8));
3763 popf();
3764 }
3765
3766 // Save Integer and Float state
3767 // Warning: Stack must be 16 byte aligned (64bit)
3768 void MacroAssembler::push_CPU_state() {
3769 push_IU_state();
3770 push_FPU_state();
3771 }
3772
3773 #ifdef _LP64
3774 #define XSTATE_BV 0x200
3775 #endif
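// 0x200 (512) is the offset of the XSAVE header within the XSAVE area;
// its first eight bytes hold the XSTATE_BV component bitmap.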
3776
3777 void MacroAssembler::push_FPU_state() {
3778 subptr(rsp, FPUStateSizeInWords * wordSize);
3779 #ifndef _LP64
3780 fnsave(Address(rsp, 0));
3781 fwait();
3782 #else
3783 // AVX will continue to use the fxsave area.
3784 // EVEX needs to utilize the xsave area, which is under different
3785 // management.
3786 if (VM_Version::supports_evex()) {
3787 // Save a copy of EAX and EDX
3788 push(rax);
3789 push(rdx);
3790 // EDX:EAX describe the XSAVE header and
3791 // are obtained while fetching info for XCR0 via cpuid.
3792 // These two registers make up 64 bits of the header, of which bits
3793 // 62:10 are currently reserved for future implementations and unused. Bit 63
3794 // is unused for our implementation as we do not utilize
3795 // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
3796 // the functionality for PKRU state and MSR tracing.
3797 // Ergo we are primarily concerned with bits 7..0, which define
3798 // which ISA extensions and features are enabled for a given machine.
3799 // These are captured in XemXcr0Eax and are used to program the XSAVE area
3800 // for saving the required registers as defined in XCR0.
3801 int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
3802 int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
3803 movl(rdx, xcr0_edx);
3804 movl(rax, xcr0_eax);
3805 xsave(Address(rsp, wordSize*2));
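// xsave stores the state components selected by EDX:EAX (masked by XCR0);
// the header fix-up below rewrites XSTATE_BV with the same mask and zeroes
// the reserved/compacted-format fields so a later xrstor sees a valid
// standard-format header.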
3806 // Now apply the control bits and clear bytes 8..23 in the header.
3807 pop(rdx);
3808 pop(rax);
3809 movl(Address(rsp, XSTATE_BV), xcr0_eax);
3810 movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
3811 andq(Address(rsp, XSTATE_BV+8), 0);
3812 andq(Address(rsp, XSTATE_BV+16), 0);
3813 } else {
3814 fxsave(Address(rsp, 0));
3815 }
3816 #endif // LP64
3817 }
3818
3819 void MacroAssembler::push_IU_state() {
3820 // Push flags first because pusha kills them
3821 pushf();
3822 // Make sure rsp stays 16-byte aligned
3823 LP64_ONLY(subq(rsp, 8));
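// (pushf pushed 8 bytes on LP64; with the extra 8 bytes above, the total
// adjustment stays a multiple of 16 ahead of the bulk register save.)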
3824 pusha();
3825 }
3826
3827 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3828 // determine java_thread register
3829 if (!java_thread->is_valid()) {
3830 java_thread = rdi;
3831 get_thread(java_thread);
3832 }
3833 // we must set sp to zero to clear frame
3834 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3835 if (clear_fp) {
3987 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3988 if (reachable(src)) {
3989 Assembler::ucomiss(dst, as_Address(src));
3990 } else {
3991 lea(rscratch1, src);
3992 Assembler::ucomiss(dst, Address(rscratch1, 0));
3993 }
3994 }
3995
3996 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3997 // Used in sign-bit flipping with aligned address.
3998 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3999 if (reachable(src)) {
4000 Assembler::xorpd(dst, as_Address(src));
4001 } else {
4002 lea(rscratch1, src);
4003 Assembler::xorpd(dst, Address(rscratch1, 0));
4004 }
4005 }
4006
4007 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4008 // Used in sign-bit flipping with aligned address.
4009 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4010 if (reachable(src)) {
4011 Assembler::xorps(dst, as_Address(src));
4012 } else {
4013 lea(rscratch1, src);
4014 Assembler::xorps(dst, Address(rscratch1, 0));
4015 }
4016 }
4017
4018 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4019 // Used with aligned addresses: the SSE form of pshufb requires 16-byte alignment.
4020 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4021 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4022 if (reachable(src)) {
4023 Assembler::pshufb(dst, as_Address(src));
4024 } else {
4025 lea(rscratch1, src);
4026 Assembler::pshufb(dst, Address(rscratch1, 0));
4030 // AVX 3-operand instructions
4031
4032 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4033 if (reachable(src)) {
4034 vaddsd(dst, nds, as_Address(src));
4035 } else {
4036 lea(rscratch1, src);
4037 vaddsd(dst, nds, Address(rscratch1, 0));
4038 }
4039 }
4040
4041 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4042 if (reachable(src)) {
4043 vaddss(dst, nds, as_Address(src));
4044 } else {
4045 lea(rscratch1, src);
4046 vaddss(dst, nds, Address(rscratch1, 0));
4047 }
4048 }
4049
4050 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4051 if (reachable(src)) {
4052 vandpd(dst, nds, as_Address(src), vector_len);
4053 } else {
4054 lea(rscratch1, src);
4055 vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4056 }
4057 }
4058
4059 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4060 if (reachable(src)) {
4061 vandps(dst, nds, as_Address(src), vector_len);
4062 } else {
4063 lea(rscratch1, src);
4064 vandps(dst, nds, Address(rscratch1, 0), vector_len);
4065 }
4066 }
4067
4068 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4069 if (reachable(src)) {
4113 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4114 if (reachable(src)) {
4115 vsubss(dst, nds, as_Address(src));
4116 } else {
4117 lea(rscratch1, src);
4118 vsubss(dst, nds, Address(rscratch1, 0));
4119 }
4120 }
4121
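// vnegatess flips the float sign bit by XORing with the mask at src.
// Without AVX512VL, 128-bit vxorps cannot encode xmm16-xmm31, so when dst
// or nds lives in the upper bank the XOR is staged through xmm0.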
4122 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4123 int nds_enc = nds->encoding();
4124 int dst_enc = dst->encoding();
4125 bool dst_upper_bank = (dst_enc > 15);
4126 bool nds_upper_bank = (nds_enc > 15);
4127 if (VM_Version::supports_avx512novl() &&
4128 (nds_upper_bank || dst_upper_bank)) {
4129 if (dst_upper_bank) {
4130 subptr(rsp, 64);
4131 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4132 movflt(xmm0, nds);
4133 if (reachable(src)) {
4134 vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
4135 } else {
4136 lea(rscratch1, src);
4137 vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
4138 }
4139 movflt(dst, xmm0);
4140 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4141 addptr(rsp, 64);
4142 } else {
4143 movflt(dst, nds);
4144 if (reachable(src)) {
4145 vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
4146 } else {
4147 lea(rscratch1, src);
4148 vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
4149 }
4150 }
4151 } else {
4152 if (reachable(src)) {
4153 vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
4154 } else {
4155 lea(rscratch1, src);
4156 vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
4157 }
4158 }
4159 }
4160
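// vnegatesd likewise flips the double sign bit. Note that the upper-bank
// paths below use vxorps on double data; XOR is bitwise, so the result is
// identical, though vxorpd would match the data domain.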
4161 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4162 int nds_enc = nds->encoding();
4163 int dst_enc = dst->encoding();
4164 bool dst_upper_bank = (dst_enc > 15);
4165 bool nds_upper_bank = (nds_enc > 15);
4166 if (VM_Version::supports_avx512novl() &&
4167 (nds_upper_bank || dst_upper_bank)) {
4168 if (dst_upper_bank) {
4169 subptr(rsp, 64);
4170 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4171 movdbl(xmm0, nds);
4172 if (reachable(src)) {
4173 vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
4174 } else {
4175 lea(rscratch1, src);
4176 vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
4177 }
4178 movdbl(dst, xmm0);
4179 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4180 addptr(rsp, 64);
4181 } else {
4182 movdbl(dst, nds);
4183 if (reachable(src)) {
4184 vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
4185 } else {
4186 lea(rscratch1, src);
4187 vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
4188 }
4189 }
4190 } else {
4191 if (reachable(src)) {
4192 vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
4193 } else {
4194 lea(rscratch1, src);
4195 vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
4196 }
4197 }
4198 }
4199
4200 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4201 if (reachable(src)) {
4202 vxorpd(dst, nds, as_Address(src), vector_len);
4203 } else {
4204 lea(rscratch1, src);
4205 vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4206 }
4207 }
4208
4209 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4210 if (reachable(src)) {
4211 vxorps(dst, nds, as_Address(src), vector_len);
4212 } else {
4213 lea(rscratch1, src);
4214 vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4215 }
4216 }
4668 #ifdef _LP64
4669 if (var_size_in_bytes->is_valid()) {
4670 addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4671 } else {
4672 addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4673 }
4674 #else
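// 32-bit: the 64-bit allocated_bytes counter is updated as two halves;
// addl updates the low word and adcl below carries into the high word.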
4675 if (var_size_in_bytes->is_valid()) {
4676 addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4677 } else {
4678 addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4679 }
4680 adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4681 #endif
4682 }
4683
4684 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4685 pusha();
4686
4687 // if we are coming from c1, xmm registers may be live
4688 int off = 0;
4689 int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
4690 if (UseAVX > 2) {
4691 num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
4692 }
4693
4694 if (UseSSE == 1) {
4695 subptr(rsp, sizeof(jdouble)*8);
4696 for (int n = 0; n < 8; n++) {
4697 movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
4698 }
4699 } else if (UseSSE >= 2) {
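// Set mask register k1 to all ones so that AVX-512 instructions which use
// it implicitly operate on every vector element.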
4700 if (UseAVX > 2) {
4701 push(rbx);
4702 movl(rbx, 0xffff);
4703 kmovwl(k1, rbx);
4704 pop(rbx);
4705 }
4706 #ifdef COMPILER2
4707 if (MaxVectorSize > 16) {
4708 if (UseAVX > 2) {
4709 // Save upper half of ZMM registers
4710 subptr(rsp, 32*num_xmm_regs);
4711 for (int n = 0; n < num_xmm_regs; n++) {
4712 vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
4713 }
4714 off = 0;
4715 }
4716 assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
4717 // Save upper half of YMM registers
4718 subptr(rsp, 16*num_xmm_regs);
4719 for (int n = 0; n < num_xmm_regs; n++) {
4720 vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
4721 }
4722 }
4723 #endif
4724 // Save whole 128-bit (16-byte) XMM registers
4725 subptr(rsp, 16*num_xmm_regs);
4726 off = 0;
4727 #ifdef _LP64
4728 if (VM_Version::supports_avx512novl()) {
4729 for (int n = 0; n < num_xmm_regs; n++) {
4730 vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
4731 }
4732 } else {
4733 for (int n = 0; n < num_xmm_regs; n++) {
4734 movdqu(Address(rsp, off++*16), as_XMMRegister(n));
4735 }
4736 }
4737 #else
4738 for (int n = 0; n < num_xmm_regs; n++) {
4739 movdqu(Address(rsp, off++*16), as_XMMRegister(n));
4740 }
4741 #endif
4742 }
4743
4744 // Preserve registers across runtime call
4745 int incoming_argument_and_return_value_offset = -1;
4746 if (num_fpu_regs_in_use > 1) {
4747 // Must preserve all other FPU regs (could alternatively convert
4748 // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
4749 // FPU state, but we cannot trust the C compiler)
4750 NEEDS_CLEANUP;
4751 // NOTE that in this case we also push the incoming argument(s) to
4752 // the stack and restore it later; we also use this stack slot to
4753 // hold the return value from dsin, dcos etc.
4754 for (int i = 0; i < num_fpu_regs_in_use; i++) {
4755 subptr(rsp, sizeof(jdouble));
4756 fstp_d(Address(rsp, 0));
4757 }
4758 incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
4759 for (int i = nb_args-1; i >= 0; i--) {
4788
4789 MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
4790
4791 #ifdef _LP64
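// On LP64 the runtime result comes back in xmm0; spill it to the stack and
// reload it onto the x87 stack, where callers expect the result.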
4792 movsd(Address(rsp, 0), xmm0);
4793 fld_d(Address(rsp, 0));
4794 #endif // _LP64
4795 addptr(rsp, sizeof(jdouble)*nb_args);
4796 if (num_fpu_regs_in_use > 1) {
4797 // Must save return value to stack and then restore entire FPU
4798 // stack except incoming arguments
4799 fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
4800 for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
4801 fld_d(Address(rsp, 0));
4802 addptr(rsp, sizeof(jdouble));
4803 }
4804 fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
4805 addptr(rsp, sizeof(jdouble)*nb_args);
4806 }
4807
4808 off = 0;
4809 if (UseSSE == 1) {
4810 for (int n = 0; n < 8; n++) {
4811 movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
4812 }
4813 addptr(rsp, sizeof(jdouble)*8);
4814 } else if (UseSSE >= 2) {
4815 // Restore whole 128-bit (16-byte) XMM registers
4816 #ifdef _LP64
4817 if (VM_Version::supports_avx512novl()) {
4818 for (int n = 0; n < num_xmm_regs; n++) {
4819 vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
4820 }
4821 } else {
4823 for (int n = 0; n < num_xmm_regs; n++) {
4824 movdqu(as_XMMRegister(n), Address(rsp, off++*16));
4825 }
4826 }
4827 #else
4828 for (int n = 0; n < num_xmm_regs; n++) {
4829 movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
4830 }
4831 #endif
4832 addptr(rsp, 16*num_xmm_regs);
4833
4834 #ifdef COMPILER2
4835 if (MaxVectorSize > 16) {
4836 // Restore upper half of YMM registers.
4837 off = 0;
4838 for (int n = 0; n < num_xmm_regs; n++) {
4839 vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
4840 }
4841 addptr(rsp, 16*num_xmm_regs);
4842 if (UseAVX > 2) {
4843 off = 0;
4844 for (int n = 0; n < num_xmm_regs; n++) {
4845 vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
4846 }
4847 addptr(rsp, 32*num_xmm_regs);
4848 }
4849 }
4850 #endif
4851 }
4852 popa();
4853 }
4854
4855 static const double pi_4 = 0.7853981633974483;
4856
4857 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
4858 // A hand-coded argument reduction for values with fabs(x) in (pi/4, pi/2)
4859 // was attempted in this code; unfortunately it appears that the
4860 // switch to 80-bit precision and back causes this to be
4861 // unprofitable compared with simply performing a runtime call if
4862 // the argument is out of the (-pi/4, pi/4) range.
4863
4864 Register tmp = noreg;
4865 if (!VM_Version::supports_cmov()) {
7191 }
7192 movdl(xtmp, value);
7193 if (UseAVX > 2 && UseUnalignedLoadStores) {
7194 // Fill 64-byte chunks
7195 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7196 evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
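// count is in elements; (16 << shift) is the element count of one 64-byte
// chunk (16 dwords), with shift = log2(elements per dword).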
7197
7198 subl(count, 16 << shift);
7199 jcc(Assembler::less, L_check_fill_32_bytes);
7200 align(16);
7201
7202 BIND(L_fill_64_bytes_loop);
7203 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7204 addptr(to, 64);
7205 subl(count, 16 << shift);
7206 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7207
7208 BIND(L_check_fill_32_bytes);
7209 addl(count, 8 << shift);
7210 jccb(Assembler::less, L_check_fill_8_bytes);
7211 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
7212 addptr(to, 32);
7213 subl(count, 8 << shift);
7214
7215 BIND(L_check_fill_8_bytes);
7216 } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7217 // Fill 64-byte chunks
7218 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7219 vpbroadcastd(xtmp, xtmp);
7220
7221 subl(count, 16 << shift);
7222 jcc(Assembler::less, L_check_fill_32_bytes);
7223 align(16);
7224
7225 BIND(L_fill_64_bytes_loop);
7226 vmovdqu(Address(to, 0), xtmp);
7227 vmovdqu(Address(to, 32), xtmp);
7228 addptr(to, 64);
7229 subl(count, 16 << shift);
7230 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7231
3650
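// Without AVX512VL, 128-bit movdqu (and 256-bit vmovdqu) cannot encode
// xmm16-xmm31. Transfers that touch an upper-bank register go through
// vextractf32x4h/vinsertf32x4h (or the 64x4 variants for YMM), and
// register-to-register copies fall back to a full-width evmovdqul.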
3651 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3652 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3653 Assembler::vextractf32x4h(dst, src, 0);
3654 } else {
3655 Assembler::movdqu(dst, src);
3656 }
3657 }
3658
3659 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3660 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3661 Assembler::vinsertf32x4h(dst, src, 0);
3662 } else {
3663 Assembler::movdqu(dst, src);
3664 }
3665 }
3666
3667 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3668 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3669 Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3670 } else {
3671 Assembler::movdqu(dst, src);
3672 }
3673 }
3674
3675 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3676 if (reachable(src)) {
3677 movdqu(dst, as_Address(src));
3678 } else {
3679 lea(rscratch1, src);
3680 movdqu(dst, Address(rscratch1, 0));
3681 }
3682 }
3683
3684 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3685 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3686 Assembler::vextractf64x4h(dst, src, 0);
3687 } else {
3688 Assembler::vmovdqu(dst, src);
3689 }
3690 }
3691
3692 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3693 if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3694 Assembler::vinsertf64x4h(dst, src, 0);
3695 } else {
3696 Assembler::vmovdqu(dst, src);
3697 }
3698 }
3699
3700 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3701 if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3702 Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3703 } else {
3705 Assembler::vmovdqu(dst, src);
3706 }
3707 }
3708
3709 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3710 if (reachable(src)) {
3711 vmovdqu(dst, as_Address(src));
3712 } else {
3714 lea(rscratch1, src);
3715 vmovdqu(dst, Address(rscratch1, 0));
3716 }
3717 }
3718
3719 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3720 if (reachable(src)) {
3721 Assembler::movdqa(dst, as_Address(src));
3722 } else {
3723 lea(rscratch1, src);
3724 Assembler::movdqa(dst, Address(rscratch1, 0));
3725 }
3726 }
3727
3728 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3729 if (reachable(src)) {
3730 Assembler::movsd(dst, as_Address(src));
3731 } else {
3732 lea(rscratch1, src);
3733 Assembler::movsd(dst, Address(rscratch1, 0));
3734 }
3735 }
3765 if (needs_explicit_null_check(offset)) {
3766 // provoke OS NULL exception if reg = NULL by
3767 // accessing M[reg] w/o changing any (non-CC) registers
3768 // NOTE: cmpl is plenty here to provoke a segv
3769 cmpptr(rax, Address(reg, 0));
3770 // Note: should probably use testl(rax, Address(reg, 0));
3771 // may be shorter code (however, this version of
3772 // testl needs to be implemented first)
3773 } else {
3774 // nothing to do, (later) access of M[reg + offset]
3775 // will provoke OS NULL exception if reg = NULL
3776 }
3777 }
3778
3779 void MacroAssembler::os_breakpoint() {
3780 // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3781 // (e.g., MSVC can't call ps() otherwise)
3782 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3783 }
3784
3785 #ifdef _LP64
3786 #define XSTATE_BV 0x200
3787 #endif
3788
3789 void MacroAssembler::pop_CPU_state() {
3790 pop_FPU_state();
3791 pop_IU_state();
3792 }
3793
3794 void MacroAssembler::pop_FPU_state() {
3795 #ifndef _LP64
3796 frstor(Address(rsp, 0));
3797 #else
3798 fxrstor(Address(rsp, 0));
3799 #endif
3800 addptr(rsp, FPUStateSizeInWords * wordSize);
3801 }
3802
3803 void MacroAssembler::pop_IU_state() {
3804 popa();
3805 LP64_ONLY(addq(rsp, 8));
3806 popf();
3807 }
3808
3809 // Save Integer and Float state
3810 // Warning: Stack must be 16 byte aligned (64bit)
3811 void MacroAssembler::push_CPU_state() {
3812 push_IU_state();
3813 push_FPU_state();
3814 }
3815
3816 void MacroAssembler::push_FPU_state() {
3817 subptr(rsp, FPUStateSizeInWords * wordSize);
3818 #ifndef _LP64
3819 fnsave(Address(rsp, 0));
3820 fwait();
3821 #else
3822 fxsave(Address(rsp, 0));
3823 #endif // LP64
3824 }
3825
3826 void MacroAssembler::push_IU_state() {
3827 // Push flags first because pusha kills them
3828 pushf();
3829 // Make sure rsp stays 16-byte aligned
3830 LP64_ONLY(subq(rsp, 8));
3831 pusha();
3832 }
3833
3834 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3835 // determine java_thread register
3836 if (!java_thread->is_valid()) {
3837 java_thread = rdi;
3838 get_thread(java_thread);
3839 }
3840 // we must set sp to zero to clear frame
3841 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3842 if (clear_fp) {
3994 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3995 if (reachable(src)) {
3996 Assembler::ucomiss(dst, as_Address(src));
3997 } else {
3998 lea(rscratch1, src);
3999 Assembler::ucomiss(dst, Address(rscratch1, 0));
4000 }
4001 }
4002
4003 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4004 // Used in sign-bit flipping with aligned address.
4005 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4006 if (reachable(src)) {
4007 Assembler::xorpd(dst, as_Address(src));
4008 } else {
4009 lea(rscratch1, src);
4010 Assembler::xorpd(dst, Address(rscratch1, 0));
4011 }
4012 }
4013
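// xorps/xorpd with dst == src is the register-zeroing idiom. Their EVEX
// encodings require AVX512DQ, so without it the register is cleared with a
// 512-bit vpxor (an AVX512F instruction), which also zeroes the upper bits.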
4014 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
4015 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4016 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4017 } else {
4019 Assembler::xorpd(dst, src);
4020 }
4021 }
4022
4023 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
4024 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4025 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4026 } else {
4027 Assembler::xorps(dst, src);
4028 }
4029 }
4030
4031 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4032 // Used in sign-bit flipping with aligned address.
4033 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4034 if (reachable(src)) {
4035 Assembler::xorps(dst, as_Address(src));
4036 } else {
4037 lea(rscratch1, src);
4038 Assembler::xorps(dst, Address(rscratch1, 0));
4039 }
4040 }
4041
4042 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4043 // Used with aligned addresses: the SSE form of pshufb requires 16-byte alignment.
4044 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4045 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4046 if (reachable(src)) {
4047 Assembler::pshufb(dst, as_Address(src));
4048 } else {
4049 lea(rscratch1, src);
4050 Assembler::pshufb(dst, Address(rscratch1, 0));
4054 // AVX 3-operand instructions
4055
4056 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4057 if (reachable(src)) {
4058 vaddsd(dst, nds, as_Address(src));
4059 } else {
4060 lea(rscratch1, src);
4061 vaddsd(dst, nds, Address(rscratch1, 0));
4062 }
4063 }
4064
4065 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4066 if (reachable(src)) {
4067 vaddss(dst, nds, as_Address(src));
4068 } else {
4069 lea(rscratch1, src);
4070 vaddss(dst, nds, Address(rscratch1, 0));
4071 }
4072 }
4073
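// vabsss/vabssd clear the sign bit by ANDing with the mask at negate_field.
// The branches stage operands into lower-bank registers (using src, or
// xmm0 saved into src, as scratch) because vandps/vandpd cannot encode
// xmm16-xmm31 without AVX512VL.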
4074 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4075 int dst_enc = dst->encoding();
4076 int nds_enc = nds->encoding();
4077 int src_enc = src->encoding();
4078 if ((dst_enc < 16) && (nds_enc < 16)) {
4079 vandps(dst, nds, negate_field, vector_len);
4080 } else if ((src_enc < 16) && (dst_enc < 16)) {
4081 movss(src, nds);
4082 vandps(dst, src, negate_field, vector_len);
4083 } else if (src_enc < 16) {
4084 movss(src, nds);
4085 vandps(src, src, negate_field, vector_len);
4086 movss(dst, src);
4087 } else if (dst_enc < 16) {
4088 movdqu(src, xmm0);
4089 movss(xmm0, nds);
4090 vandps(dst, xmm0, negate_field, vector_len);
4091 movdqu(xmm0, src);
4092 } else if (nds_enc < 16) {
4093 movdqu(src, xmm0);
4094 vandps(xmm0, nds, negate_field, vector_len);
4095 movss(dst, xmm0);
4096 movdqu(xmm0, src);
4097 } else {
4098 movdqu(src, xmm0);
4099 movss(xmm0, nds);
4100 vandps(xmm0, xmm0, negate_field, vector_len);
4101 movss(dst, xmm0);
4102 movdqu(xmm0, src);
4103 }
4104 }
4105
4106 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4107 int dst_enc = dst->encoding();
4108 int nds_enc = nds->encoding();
4109 int src_enc = src->encoding();
4110 if ((dst_enc < 16) && (nds_enc < 16)) {
4111 vandpd(dst, nds, negate_field, vector_len);
4112 } else if ((src_enc < 16) && (dst_enc < 16)) {
4113 movsd(src, nds);
4114 vandpd(dst, src, negate_field, vector_len);
4115 } else if (src_enc < 16) {
4116 movsd(src, nds);
4117 vandpd(src, src, negate_field, vector_len);
4118 movsd(dst, src);
4119 } else if (dst_enc < 16) {
4120 movdqu(src, xmm0);
4121 movsd(xmm0, nds);
4122 vandpd(dst, xmm0, negate_field, vector_len);
4123 movdqu(xmm0, src);
4124 } else if (nds_enc < 16) {
4125 movdqu(src, xmm0);
4126 vandpd(xmm0, nds, negate_field, vector_len);
4127 movsd(dst, xmm0);
4128 movdqu(xmm0, src);
4129 } else {
4130 movdqu(src, xmm0);
4131 movsd(xmm0, nds);
4132 vandpd(xmm0, xmm0, negate_field, vector_len);
4133 movsd(dst, xmm0);
4134 movdqu(xmm0, src);
4135 }
4136 }
4137
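// For the vector helpers below (vpaddb/vpaddw, vpsubb/vpsubw, vpmullw and
// the vps{ll,rl,ra}w shifts): without AVX512BW, these instructions cannot
// encode xmm16-xmm31. Operands living in the upper bank are staged through
// nds (treated as scratch) or through xmm0/xmm1, whose previous contents
// are saved around the operation.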
4138 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4139 int dst_enc = dst->encoding();
4140 int nds_enc = nds->encoding();
4141 int src_enc = src->encoding();
4142 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4143 Assembler::vpaddb(dst, nds, src, vector_len);
4144 } else if ((dst_enc < 16) && (src_enc < 16)) {
4145 Assembler::vpaddb(dst, dst, src, vector_len);
4146 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4147 // use nds as scratch for src
4148 evmovdqul(nds, src, Assembler::AVX_512bit);
4149 Assembler::vpaddb(dst, dst, nds, vector_len);
4150 } else if ((src_enc < 16) && (nds_enc < 16)) {
4151 // use nds as scratch for dst
4152 evmovdqul(nds, dst, Assembler::AVX_512bit);
4153 Assembler::vpaddb(nds, nds, src, vector_len);
4154 evmovdqul(dst, nds, Assembler::AVX_512bit);
4155 } else if (dst_enc < 16) {
4156 // use nds as scratch for xmm0 to hold src
4157 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4158 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4159 Assembler::vpaddb(dst, dst, xmm0, vector_len);
4160 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4161 } else {
4162 // worst-case scenario: all regs are in the upper bank
4163 subptr(rsp, 64);
4164 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4165 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4166 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4167 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4168 Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4169 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4170 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4171 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4172 addptr(rsp, 64);
4173 }
4174 }
4175
4176 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4177 int dst_enc = dst->encoding();
4178 int nds_enc = nds->encoding();
4179 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4180 Assembler::vpaddb(dst, nds, src, vector_len);
4181 } else if (dst_enc < 16) {
4182 Assembler::vpaddb(dst, dst, src, vector_len);
4183 } else if (nds_enc < 16) {
4184 // dst is in the upper bank; use nds as scratch
4185 evmovdqul(nds, dst, Assembler::AVX_512bit);
4186 Assembler::vpaddb(nds, nds, src, vector_len);
4187 evmovdqul(dst, nds, Assembler::AVX_512bit);
4188 } else {
4189 // worst-case scenario: all regs in upper bank
4190 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4191 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4192 Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4193 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4194 }
4195 }
4196
4197 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4198 int dst_enc = dst->encoding();
4199 int nds_enc = nds->encoding();
4200 int src_enc = src->encoding();
4201 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4202 Assembler::vpaddw(dst, nds, src, vector_len);
4203 } else if ((dst_enc < 16) && (src_enc < 16)) {
4204 Assembler::vpaddw(dst, dst, src, vector_len);
4205 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4206 // use nds as scratch for src
4207 evmovdqul(nds, src, Assembler::AVX_512bit);
4208 Assembler::vpaddw(dst, dst, nds, vector_len);
4209 } else if ((src_enc < 16) && (nds_enc < 16)) {
4210 // use nds as scratch for dst
4211 evmovdqul(nds, dst, Assembler::AVX_512bit);
4212 Assembler::vpaddw(nds, nds, src, vector_len);
4213 evmovdqul(dst, nds, Assembler::AVX_512bit);
4214 } else if (dst_enc < 16) {
4215 // use nds as scratch for xmm0 to hold src
4216 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4217 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4218 Assembler::vpaddw(dst, dst, xmm0, vector_len);
4219 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4220 } else {
4221 // worst-case scenario: all regs are in the upper bank
4222 subptr(rsp, 64);
4223 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4224 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4225 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4226 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4227 Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4228 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4229 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4230 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4231 addptr(rsp, 64);
4232 }
4233 }
4234
4235 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4236 int dst_enc = dst->encoding();
4237 int nds_enc = nds->encoding();
4238 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4239 Assembler::vpaddw(dst, nds, src, vector_len);
4240 } else if (dst_enc < 16) {
4241 Assembler::vpaddw(dst, dst, src, vector_len);
4242 } else if (nds_enc < 16) {
4243 // dst is in the upper bank; use nds as scratch
4244 evmovdqul(nds, dst, Assembler::AVX_512bit);
4245 Assembler::vpaddw(nds, nds, src, vector_len);
4246 evmovdqul(dst, nds, Assembler::AVX_512bit);
4247 } else {
4248 // worst-case scenario: all regs in upper bank
4249 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4250 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4251 Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4252 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4253 }
4254 }
4255
4256 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4257 int dst_enc = dst->encoding();
4258 int nds_enc = nds->encoding();
4259 int src_enc = src->encoding();
4260 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4261 Assembler::vpsubb(dst, nds, src, vector_len);
4262 } else if ((dst_enc < 16) && (src_enc < 16)) {
4263 Assembler::vpsubb(dst, dst, src, vector_len);
4264 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4265 // use nds as scratch for src
4266 evmovdqul(nds, src, Assembler::AVX_512bit);
4267 Assembler::vpsubb(dst, dst, nds, vector_len);
4268 } else if ((src_enc < 16) && (nds_enc < 16)) {
4269 // use nds as scratch for dst
4270 evmovdqul(nds, dst, Assembler::AVX_512bit);
4271 Assembler::vpsubb(nds, nds, src, vector_len);
4272 evmovdqul(dst, nds, Assembler::AVX_512bit);
4273 } else if (dst_enc < 16) {
4274 // use nds as scratch for xmm0 to hold src
4275 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4276 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4277 Assembler::vpsubb(dst, dst, xmm0, vector_len);
4278 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4279 } else {
4280 // worst-case scenario: all regs are in the upper bank
4281 subptr(rsp, 64);
4282 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4283 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4284 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4285 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4286 Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4287 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4288 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4289 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4290 addptr(rsp, 64);
4291 }
4292 }
4293
4294 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4295 int dst_enc = dst->encoding();
4296 int nds_enc = nds->encoding();
4297 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4298 Assembler::vpsubb(dst, nds, src, vector_len);
4299 } else if (dst_enc < 16) {
4300 Assembler::vpsubb(dst, dst, src, vector_len);
4301 } else if (nds_enc < 16) {
4302 // dst is in the upper bank; use nds as scratch
4303 evmovdqul(nds, dst, Assembler::AVX_512bit);
4304 Assembler::vpsubb(nds, nds, src, vector_len);
4305 evmovdqul(dst, nds, Assembler::AVX_512bit);
4306 } else {
4307 // worst-case scenario: all regs in upper bank
4308 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4309 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4310 Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4311 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4312 }
4313 }
4314
4315 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4316 int dst_enc = dst->encoding();
4317 int nds_enc = nds->encoding();
4318 int src_enc = src->encoding();
4319 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4320 Assembler::vpsubw(dst, nds, src, vector_len);
4321 } else if ((dst_enc < 16) && (src_enc < 16)) {
4322 Assembler::vpsubw(dst, dst, src, vector_len);
4323 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4324 // use nds as scratch for src
4325 evmovdqul(nds, src, Assembler::AVX_512bit);
4326 Assembler::vpsubw(dst, dst, nds, vector_len);
4327 } else if ((src_enc < 16) && (nds_enc < 16)) {
4328 // use nds as scratch for dst
4329 evmovdqul(nds, dst, Assembler::AVX_512bit);
4330 Assembler::vpsubw(nds, nds, src, vector_len);
4331 evmovdqul(dst, nds, Assembler::AVX_512bit);
4332 } else if (dst_enc < 16) {
4333 // use nds as scratch for xmm0 to hold src
4334 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4335 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4336 Assembler::vpsubw(dst, dst, xmm0, vector_len);
4337 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4338 } else {
4339 // worst-case scenario: all regs are in the upper bank
4340 subptr(rsp, 64);
4341 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4342 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4343 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4344 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4345 Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4346 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4347 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4348 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4349 addptr(rsp, 64);
4350 }
4351 }
4352
4353 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4354 int dst_enc = dst->encoding();
4355 int nds_enc = nds->encoding();
4356 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4357 Assembler::vpsubw(dst, nds, src, vector_len);
4358 } else if (dst_enc < 16) {
4359 Assembler::vpsubw(dst, dst, src, vector_len);
4360 } else if (nds_enc < 16) {
4361 // dst is in the upper bank; use nds as scratch
4362 evmovdqul(nds, dst, Assembler::AVX_512bit);
4363 Assembler::vpsubw(nds, nds, src, vector_len);
4364 evmovdqul(dst, nds, Assembler::AVX_512bit);
4365 } else {
4366 // worst-case scenario: all regs in upper bank
4367 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4368 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4369 Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4370 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4371 }
4372 }
4373
4374
4375 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4376 int dst_enc = dst->encoding();
4377 int nds_enc = nds->encoding();
4378 int src_enc = src->encoding();
4379 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4380 Assembler::vpmullw(dst, nds, src, vector_len);
4381 } else if ((dst_enc < 16) && (src_enc < 16)) {
4382 Assembler::vpmullw(dst, dst, src, vector_len);
4383 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4384 // use nds as scratch for src
4385 evmovdqul(nds, src, Assembler::AVX_512bit);
4386 Assembler::vpmullw(dst, dst, nds, vector_len);
4387 } else if ((src_enc < 16) && (nds_enc < 16)) {
4388 // use nds as scratch for dst
4389 evmovdqul(nds, dst, Assembler::AVX_512bit);
4390 Assembler::vpmullw(nds, nds, src, vector_len);
4391 evmovdqul(dst, nds, Assembler::AVX_512bit);
4392 } else if (dst_enc < 16) {
4393 // use nds as scratch for xmm0 to hold src
4394 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4395 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4396 Assembler::vpmullw(dst, dst, xmm0, vector_len);
4397 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4398 } else {
4399 // worst-case scenario: all regs are in the upper bank
4400 subptr(rsp, 64);
4401 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4402 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4403 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4404 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4405 Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4406 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4407 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4408 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4409 addptr(rsp, 64);
4410 }
4411 }
4412
4413 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4414 int dst_enc = dst->encoding();
4415 int nds_enc = nds->encoding();
4416 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4417 Assembler::vpmullw(dst, nds, src, vector_len);
4418 } else if (dst_enc < 16) {
4419 Assembler::vpmullw(dst, dst, src, vector_len);
4420 } else if (nds_enc < 16) {
4421 // dst is in the upper bank; use nds as scratch
4422 evmovdqul(nds, dst, Assembler::AVX_512bit);
4423 Assembler::vpmullw(nds, nds, src, vector_len);
4424 evmovdqul(dst, nds, Assembler::AVX_512bit);
4425 } else {
4426 // worst-case scenario: all regs in upper bank
4427 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4428 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4429 Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4430 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4431 }
4432 }
4433
4434 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4435 int dst_enc = dst->encoding();
4436 int nds_enc = nds->encoding();
4437 int shift_enc = shift->encoding();
4438 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4439 Assembler::vpsraw(dst, nds, shift, vector_len);
4440 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4441 Assembler::vpsraw(dst, dst, shift, vector_len);
4442 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4443 // use nds as scratch to hold shift
4444 evmovdqul(nds, shift, Assembler::AVX_512bit);
4445 Assembler::vpsraw(dst, dst, nds, vector_len);
4446 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4447 // use nds as scratch for dst
4448 evmovdqul(nds, dst, Assembler::AVX_512bit);
4449 Assembler::vpsraw(nds, nds, shift, vector_len);
4450 evmovdqul(dst, nds, Assembler::AVX_512bit);
4451 } else if (dst_enc < 16) {
4452 // use nds to save a copy of xmm0 and hold shift
4453 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4454 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4455 Assembler::vpsraw(dst, dst, xmm0, vector_len);
4456 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4457 } else if (nds_enc < 16) {
4458 // use nds and dst as temps
4459 evmovdqul(nds, dst, Assembler::AVX_512bit);
4460 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4461 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4462 Assembler::vpsraw(nds, nds, xmm0, vector_len);
4463 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4464 evmovdqul(dst, nds, Assembler::AVX_512bit);
4465 } else {
4466 // worst-case scenario: all regs are in the upper bank
4467 subptr(rsp, 64);
4468 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4469 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4470 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4471 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4472 Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4473 evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4474 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4475 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4476 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4477 addptr(rsp, 64);
4478 }
4479 }
4480
4481 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4482 int dst_enc = dst->encoding();
4483 int nds_enc = nds->encoding();
4484 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4485 Assembler::vpsraw(dst, nds, shift, vector_len);
4486 } else if (dst_enc < 16) {
4487 Assembler::vpsraw(dst, dst, shift, vector_len);
4488 } else if (nds_enc < 16) {
4489 // use nds as scratch
4490 evmovdqul(nds, dst, Assembler::AVX_512bit);
4491 Assembler::vpsraw(nds, nds, shift, vector_len);
4492 evmovdqul(dst, nds, Assembler::AVX_512bit);
4493 } else {
4494 // use nds as scratch for xmm0
4495 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4496 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4497 Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4498 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4499 }
4500 }
4501
4502 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4503 int dst_enc = dst->encoding();
4504 int nds_enc = nds->encoding();
4505 int shift_enc = shift->encoding();
4506 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4507 Assembler::vpsrlw(dst, nds, shift, vector_len);
4508 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4509 Assembler::vpsrlw(dst, dst, shift, vector_len);
4510 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4511 // use nds as scratch to hold shift
4512 evmovdqul(nds, shift, Assembler::AVX_512bit);
4513 Assembler::vpsrlw(dst, dst, nds, vector_len);
4514 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4515 // use nds as scratch for dst
4516 evmovdqul(nds, dst, Assembler::AVX_512bit);
4517 Assembler::vpsrlw(nds, nds, shift, vector_len);
4518 evmovdqul(dst, nds, Assembler::AVX_512bit);
4519 } else if (dst_enc < 16) {
4520 // use nds to save a copy of xmm0 and hold shift
4521 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4522 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4523 Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4524 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4525 } else if (nds_enc < 16) {
4526 // use nds and dst as temps
4527 evmovdqul(nds, dst, Assembler::AVX_512bit);
4528 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4529 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4530 Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4531 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4532 evmovdqul(dst, nds, Assembler::AVX_512bit);
4533 } else {
4534 // worst-case scenario: all regs are in the upper bank
4535 subptr(rsp, 64);
4536 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4537 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4538 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4539 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4540 Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4541 evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4542 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4543 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4544 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4545 addptr(rsp, 64);
4546 }
4547 }
4548
4549 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4550 int dst_enc = dst->encoding();
4551 int nds_enc = nds->encoding();
4552 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4553 Assembler::vpsrlw(dst, nds, shift, vector_len);
4554 } else if (dst_enc < 16) {
4555 Assembler::vpsrlw(dst, dst, shift, vector_len);
4556 } else if (nds_enc < 16) {
4557 // use nds as scratch
4558 evmovdqul(nds, dst, Assembler::AVX_512bit);
4559 Assembler::vpsrlw(nds, nds, shift, vector_len);
4560 evmovdqul(dst, nds, Assembler::AVX_512bit);
4561 } else {
4562 // use nds as scratch for xmm0
4563 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4564 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4565 Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4566 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4567 }
4568 }
4569
4570 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4571 int dst_enc = dst->encoding();
4572 int nds_enc = nds->encoding();
4573 int shift_enc = shift->encoding();
4574 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4575 Assembler::vpsllw(dst, nds, shift, vector_len);
4576 } else if ((dst_enc < 16) && (shift_enc < 16)) {
4577 Assembler::vpsllw(dst, dst, shift, vector_len);
4578 } else if ((dst_enc < 16) && (nds_enc < 16)) {
4579 // use nds as scratch to hold shift
4580 evmovdqul(nds, shift, Assembler::AVX_512bit);
4581 Assembler::vpsllw(dst, dst, nds, vector_len);
4582 } else if ((shift_enc < 16) && (nds_enc < 16)) {
4583 // use nds as scratch for dst
4584 evmovdqul(nds, dst, Assembler::AVX_512bit);
4585 Assembler::vpsllw(nds, nds, shift, vector_len);
4586 evmovdqul(dst, nds, Assembler::AVX_512bit);
4587 } else if (dst_enc < 16) {
4588 // use nds to save a copy of xmm0 and hold shift
4589 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4590 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4591 Assembler::vpsllw(dst, dst, xmm0, vector_len);
4592 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4593 } else if (nds_enc < 16) {
4594 // use nds and dst as temps
4595 evmovdqul(nds, dst, Assembler::AVX_512bit);
4596 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4597 evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4598 Assembler::vpsllw(nds, nds, xmm0, vector_len);
4599 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4600 evmovdqul(dst, nds, Assembler::AVX_512bit);
4601 } else {
4602 // worst-case scenario: all regs are in the upper bank
4603 subptr(rsp, 64);
4604 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4605 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4606 evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4607 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4608 Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4609 evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4610 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4611 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4612 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4613 addptr(rsp, 64);
4614 }
4615 }
4616
4617 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4618 int dst_enc = dst->encoding();
4619 int nds_enc = nds->encoding();
4620 if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
4621 Assembler::vpsllw(dst, nds, shift, vector_len);
4622 } else if (dst_enc < 16) {
4623 Assembler::vpsllw(dst, dst, shift, vector_len);
4624 } else if (nds_enc < 16) {
4625 // use nds as scratch
4626 evmovdqul(nds, dst, Assembler::AVX_512bit);
4627 Assembler::vpsllw(nds, nds, shift, vector_len);
4628 evmovdqul(dst, nds, Assembler::AVX_512bit);
4629 } else {
4630 // use nds as scratch for xmm0
4631 evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4632 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4633 Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4634 evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4635 }
4636 }
4637
4638 // This instruction is used inside other macros, so we cannot control its
4639 // operands when it is emitted through those patterns.
4640 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4641 if (VM_Version::supports_avx512nobw()) {
4642 int dst_enc = dst->encoding();
4643 int src_enc = src->encoding();
4644 if (dst_enc == src_enc) {
4645 if (dst_enc < 16) {
4646 Assembler::punpcklbw(dst, src);
4647 } else {
4648 subptr(rsp, 64);
4649 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4650 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4651 Assembler::punpcklbw(xmm0, xmm0);
4652 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4653 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4654 addptr(rsp, 64);
4655 }
4656 } else {
4657 if ((src_enc < 16) && (dst_enc < 16)) {
4658 Assembler::punpcklbw(dst, src);
4659 } else if (src_enc < 16) {
4660 subptr(rsp, 64);
4661 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4662 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4663 Assembler::punpcklbw(xmm0, src);
4664 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4665 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4666 addptr(rsp, 64);
4667 } else if (dst_enc < 16) {
4668 subptr(rsp, 64);
4669 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4670 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4671 Assembler::punpcklbw(dst, xmm0);
4672 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4673 addptr(rsp, 64);
4674 } else {
4675 subptr(rsp, 64);
4676 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4677 subptr(rsp, 64);
4678 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4679 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4680 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4681 Assembler::punpcklbw(xmm0, xmm1);
4682 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4683 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4684 addptr(rsp, 64);
4685 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4686 addptr(rsp, 64);
4687 }
4688 }
4689 } else {
4690 Assembler::punpcklbw(dst, src);
4691 }
4692 }
4693
4694 // This instruction is used inside other macros, so we cannot control its
4695 // operands when it is emitted through those patterns.
4696 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4697 if (VM_Version::supports_avx512nobw()) {
4698 int dst_enc = dst->encoding();
4699 int src_enc = src->encoding();
4700 if (dst_enc == src_enc) {
4701 if (dst_enc < 16) {
4702 Assembler::pshuflw(dst, src, mode);
4703 } else {
4704 subptr(rsp, 64);
4705 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4706 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4707 Assembler::pshuflw(xmm0, xmm0, mode);
4708 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4709 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4710 addptr(rsp, 64);
4711 }
4712 } else {
4713 if ((src_enc < 16) && (dst_enc < 16)) {
4714 Assembler::pshuflw(dst, src, mode);
4715 } else if (src_enc < 16) {
4716 subptr(rsp, 64);
4717 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4718 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4719 Assembler::pshuflw(xmm0, src, mode);
4720 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4721 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4722 addptr(rsp, 64);
4723 } else if (dst_enc < 16) {
4724 subptr(rsp, 64);
4725 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4726 evmovdqul(xmm0, src, Assembler::AVX_512bit);
4727 Assembler::pshuflw(dst, xmm0, mode);
4728 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4729 addptr(rsp, 64);
4730 } else {
4731 subptr(rsp, 64);
4732 evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4733 subptr(rsp, 64);
4734 evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4735 evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4736 evmovdqul(xmm1, src, Assembler::AVX_512bit);
4737 Assembler::pshuflw(xmm0, xmm1, mode);
4738 evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4739 evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4740 addptr(rsp, 64);
4741 evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4742 addptr(rsp, 64);
4743 }
4744 }
4745 } else {
4746 Assembler::pshuflw(dst, src, mode);
4747 }
4748 }
4749
4750 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4751 if (reachable(src)) {
4752 vandpd(dst, nds, as_Address(src), vector_len);
4753 } else {
4754 lea(rscratch1, src);
4755 vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4756 }
4757 }

void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vandps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vandps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}
void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  int nds_enc = nds->encoding();
  int dst_enc = dst->encoding();
  bool dst_upper_bank = (dst_enc > 15);
  bool nds_upper_bank = (nds_enc > 15);
  if (VM_Version::supports_avx512novl() &&
      (nds_upper_bank || dst_upper_bank)) {
    if (dst_upper_bank) {
      subptr(rsp, 64);
      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
      movflt(xmm0, nds);
      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
      movflt(dst, xmm0);
      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
      addptr(rsp, 64);
    } else {
      movflt(dst, nds);
      vxorps(dst, dst, src, Assembler::AVX_128bit);
    }
  } else {
    vxorps(dst, nds, src, Assembler::AVX_128bit);
  }
}
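
// vnegatess/vnegatesd negate by XORing the sign bit with a mask constant
// that src is expected to point at (0x80000000 in the low float lane, or
// 0x8000000000000000 for the double variant below). On AVX-512 parts
// without AVX512VL, the 128-bit vxorps/vxorpd encodings cannot reach the
// upper register bank, hence the xmm0 spill dance when dst is xmm16..xmm31.
// A plausible call site (the mask address here is illustrative only):
//   vnegatess(dst, src, ExternalAddress(StubRoutines::x86::float_sign_flip()));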

void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  int nds_enc = nds->encoding();
  int dst_enc = dst->encoding();
  bool dst_upper_bank = (dst_enc > 15);
  bool nds_upper_bank = (nds_enc > 15);
  if (VM_Version::supports_avx512novl() &&
      (nds_upper_bank || dst_upper_bank)) {
    if (dst_upper_bank) {
      subptr(rsp, 64);
      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
      movdbl(xmm0, nds);
      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
      movdbl(dst, xmm0);
      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
      addptr(rsp, 64);
    } else {
      movdbl(dst, nds);
      vxorpd(dst, dst, src, Assembler::AVX_128bit);
    }
  } else {
    vxorpd(dst, nds, src, Assembler::AVX_128bit);
  }
}

void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
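
// Example for the 32-bit path above: if the low word of the counter holds
// 0xfffffff8 and we add 16, addl leaves 0x00000008 with CF set, and the
// following adcl folds that carry into the high word, keeping the 64-bit
// total exact.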

void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // If we are coming from C1, the XMM registers may be live.
  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
  if (UseAVX > 2) {
    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
  }

  if (UseSSE == 1) {
    subptr(rsp, sizeof(jdouble)*8);
    for (int n = 0; n < 8; n++) {
      movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
    }
  } else if (UseSSE >= 2) {
    if (UseAVX > 2) {
      push(rbx);
      movl(rbx, 0xffff);
      kmovwl(k1, rbx);
      pop(rbx);
    }
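    // HotSpot emits EVEX instructions with k1 as the governing mask, so k1
    // is primed with all ones here (0xffff sets all 16 mask bits) before the
    // EVEX-encoded saves below run; rbx is preserved around the load.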
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      if (UseAVX > 2) {
        // Save upper half of ZMM registers
        subptr(rsp, 32*num_xmm_regs);
        for (int n = 0; n < num_xmm_regs; n++) {
          vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
        }
      }
      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
      // Save upper half of YMM registers
      subptr(rsp, 16*num_xmm_regs);
      for (int n = 0; n < num_xmm_regs; n++) {
        vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
      }
    }
#endif
    // Save whole 128bit (16 bytes) XMM registers
    subptr(rsp, 16*num_xmm_regs);
#ifdef _LP64
    if (VM_Version::supports_evex()) {
      for (int n = 0; n < num_xmm_regs; n++) {
        vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
      }
    } else {
      for (int n = 0; n < num_xmm_regs; n++) {
        movdqu(Address(rsp, n*16), as_XMMRegister(n));
      }
    }
#else
    for (int n = 0; n < num_xmm_regs; n++) {
      movdqu(Address(rsp, n*16), as_XMMRegister(n));
    }
#endif
  }
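
  // At this point the save area below the original rsp is laid out as
  // follows, from rsp upward:
  //   16*num_xmm_regs bytes: low 128 bits of each XMM register
  //   16*num_xmm_regs bytes: upper YMM halves (COMPILER2, MaxVectorSize > 16)
  //   32*num_xmm_regs bytes: upper ZMM halves (additionally when UseAVX > 2)
  // and it is unwound in the reverse order after the runtime call.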

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to
    // trash FPU state, but we cannot trust the C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore them later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
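    // Example: with num_fpu_regs_in_use == 2, two doubles are pushed (16
    // bytes) and the offset is sizeof(jdouble)*1 == 8 -- the slot of the
    // value that was on top of the FPU stack, i.e. the incoming argument,
    // which doubles as the return-value slot after the call.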
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = nb_args-1; i >= 0; i--) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
#endif // _LP64

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble)*nb_args);
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble)*nb_args);
  }

  if (UseSSE == 1) {
    for (int n = 0; n < 8; n++) {
      movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
    }
    addptr(rsp, sizeof(jdouble)*8);
  } else if (UseSSE >= 2) {
    // Restore whole 128bit (16 bytes) XMM registers
#ifdef _LP64
    if (VM_Version::supports_evex()) {
      for (int n = 0; n < num_xmm_regs; n++) {
        vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
      }
    } else {
      for (int n = 0; n < num_xmm_regs; n++) {
        movdqu(as_XMMRegister(n), Address(rsp, n*16));
      }
    }
#else
    for (int n = 0; n < num_xmm_regs; n++) {
      movdqu(as_XMMRegister(n), Address(rsp, n*16));
    }
#endif
    addptr(rsp, 16*num_xmm_regs);

#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      // Restore upper half of YMM registers.
      for (int n = 0; n < num_xmm_regs; n++) {
        vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
      }
      addptr(rsp, 16*num_xmm_regs);
      if (UseAVX > 2) {
        for (int n = 0; n < num_xmm_regs; n++) {
          vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
        }
        addptr(rsp, 32*num_xmm_regs);
      }
    }
#endif
  }
  popa();
}

static const double pi_4 = 0.7853981633974483;
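// 0.7853981633974483 is pi/4 rounded to double precision; trigfunc below
// compares the argument's magnitude against it to choose between the fast
// in-line x87 path and a runtime call.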

void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values with fabs in (pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary, so preserve rbx
    tmp = rbx;
    push(tmp);
  }

  // ... (the remainder of trigfunc, and everything up to the vectorized
  // fill loops of MacroAssembler::generate_fill, is elided here) ...

  movdl(xtmp, value);
  if (UseAVX > 2 && UseUnalignedLoadStores) {
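    // Broadcast the 32-bit fill value into all 16 dword lanes of a ZMM
    // register, then store 64 bytes per iteration. count is tracked in
    // elements, so 16 << shift elements correspond to one 64-byte store.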
    // Fill 64-byte chunks
    Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
    evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

    subl(count, 16 << shift);
    jcc(Assembler::less, L_check_fill_32_bytes);
    align(16);

    BIND(L_fill_64_bytes_loop);
    evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
    addptr(to, 64);
    subl(count, 16 << shift);
    jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

    BIND(L_check_fill_32_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::less, L_check_fill_8_bytes);
    vmovdqu(Address(to, 0), xtmp);
    addptr(to, 32);
    subl(count, 8 << shift);

    BIND(L_check_fill_8_bytes);
  } else if (UseAVX == 2 && UseUnalignedLoadStores) {
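    // Same 64-bytes-per-iteration structure as the EVEX path above, but
    // built from two 32-byte YMM stores since AVX2 tops out at 256-bit
    // moves; vpbroadcastd replicates the value across the YMM register.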
    // Fill 64-byte chunks
    Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
    vpbroadcastd(xtmp, xtmp);

    subl(count, 16 << shift);
    jcc(Assembler::less, L_check_fill_32_bytes);
    align(16);

    BIND(L_fill_64_bytes_loop);
    vmovdqu(Address(to, 0), xtmp);
    vmovdqu(Address(to, 32), xtmp);
    addptr(to, 64);
    subl(count, 16 << shift);
    jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);