src/cpu/x86/vm/macroAssembler_x86.cpp
*** 3646,3661 ****
void MacroAssembler::movptr(Address dst, Register src) {
LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
! Assembler::movdqu(dst, as_Address(src));
} else {
lea(rscratch1, src);
! Assembler::movdqu(dst, Address(rscratch1, 0));
}
}
void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
--- 3646,3720 ----
void MacroAssembler::movptr(Address dst, Register src) {
LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
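+ // With AVX-512 but no AVX512VL there are no 128/256-bit EVEX encodings, and the
+ // legacy encodings cannot reach xmm16-xmm31; the wrappers below fall back to
+ // 512-bit EVEX moves (vextract/vinsert of the low lanes, or evmovdqul) for such registers.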
+ void MacroAssembler::movdqu(Address dst, XMMRegister src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
+ Assembler::vextractf32x4h(dst, src, 0);
+ } else {
+ Assembler::movdqu(dst, src);
+ }
+ }
+
+ void MacroAssembler::movdqu(XMMRegister dst, Address src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
+ Assembler::vinsertf32x4h(dst, src, 0);
+ } else {
+ Assembler::movdqu(dst, src);
+ }
+ }
+
+ void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+ Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
+ } else {
+ Assembler::movdqu(dst, src);
+ }
+ }
+
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
! movdqu(dst, as_Address(src));
! } else {
! lea(rscratch1, src);
! movdqu(dst, Address(rscratch1, 0));
! }
! }
!
! void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
! if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
! Assembler::vextractf64x4h(dst, src, 0);
! } else {
! Assembler::vmovdqu(dst, src);
! }
! }
!
! void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
! if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
! Assembler::vinsertf64x4h(dst, src, 0);
} else {
+ Assembler::vmovdqu(dst, src);
+ }
+ }
+
+ void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
+ Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
+ } else {
+ Assembler::vmovdqu(dst, src);
+ }
+ }
+
+ void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
+ if (reachable(src)) {
+ vmovdqu(dst, as_Address(src));
+ } else {
lea(rscratch1, src);
! vmovdqu(dst, Address(rscratch1, 0));
}
}
void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
*** 3721,3760 ****
// instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
// (e.g., MSVC can't call ps() otherwise)
call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
void MacroAssembler::pop_CPU_state() {
pop_FPU_state();
pop_IU_state();
}
void MacroAssembler::pop_FPU_state() {
#ifndef _LP64
frstor(Address(rsp, 0));
#else
- // AVX will continue to use the fxsave area.
- // EVEX needs to utilize the xsave area, which is under different
- // management.
- if(VM_Version::supports_evex()) {
- // EDX:EAX describe the XSAVE header and
- // are obtained while fetching info for XCR0 via cpuid.
- // These two registers make up 64-bits in the header for which bits
- // 62:10 are currently reserved for future implementations and unused. Bit 63
- // is unused for our implementation as we do not utilize
- // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
- // the functionality for PKRU state and MSR tracing.
- // Ergo we are primarily concerned with bits 7..0, which define
- // which ISA extensions and features are enabled for a given machine and are
- // defined in XemXcr0Eax and is used to map the XSAVE area
- // for restoring registers as described via XCR0.
- movl(rdx,VM_Version::get_xsave_header_upper_segment());
- movl(rax,VM_Version::get_xsave_header_lower_segment());
- xrstor(Address(rsp, 0));
- } else {
fxrstor(Address(rsp, 0));
- }
#endif
addptr(rsp, FPUStateSizeInWords * wordSize);
}
void MacroAssembler::pop_IU_state() {
--- 3780,3803 ----
// instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
// (e.g., MSVC can't call ps() otherwise)
call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
+ #ifdef _LP64
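+ // Offset of the XSTATE_BV field in an XSAVE area: the XSAVE header starts
+ // immediately after the 512-byte legacy FXSAVE region.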
+ #define XSTATE_BV 0x200
+ #endif
+
void MacroAssembler::pop_CPU_state() {
pop_FPU_state();
pop_IU_state();
}
void MacroAssembler::pop_FPU_state() {
#ifndef _LP64
frstor(Address(rsp, 0));
#else
fxrstor(Address(rsp, 0));
#endif
addptr(rsp, FPUStateSizeInWords * wordSize);
}
void MacroAssembler::pop_IU_state() {
*** 3768,3820 ****
void MacroAssembler::push_CPU_state() {
push_IU_state();
push_FPU_state();
}
- #ifdef _LP64
- #define XSTATE_BV 0x200
- #endif
-
void MacroAssembler::push_FPU_state() {
subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
fnsave(Address(rsp, 0));
fwait();
#else
- // AVX will continue to use the fxsave area.
- // EVEX needs to utilize the xsave area, which is under different
- // management.
- if(VM_Version::supports_evex()) {
- // Save a copy of EAX and EDX
- push(rax);
- push(rdx);
- // EDX:EAX describe the XSAVE header and
- // are obtained while fetching info for XCR0 via cpuid.
- // These two registers make up 64-bits in the header for which bits
- // 62:10 are currently reserved for future implementations and unused. Bit 63
- // is unused for our implementation as we do not utilize
- // compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
- // the functionality for PKRU state and MSR tracing.
- // Ergo we are primarily concerned with bits 7..0, which define
- // which ISA extensions and features are enabled for a given machine and are
- // defined in XemXcr0Eax and is used to program XSAVE area
- // for saving the required registers as defined in XCR0.
- int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
- int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
- movl(rdx,xcr0_edx);
- movl(rax,xcr0_eax);
- xsave(Address(rsp, wordSize*2));
- // now Apply control bits and clear bytes 8..23 in the header
- pop(rdx);
- pop(rax);
- movl(Address(rsp, XSTATE_BV), xcr0_eax);
- movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
- andq(Address(rsp, XSTATE_BV+8), 0);
- andq(Address(rsp, XSTATE_BV+16), 0);
- } else {
fxsave(Address(rsp, 0));
- }
#endif // LP64
}
void MacroAssembler::push_IU_state() {
// Push flags first because pusha kills them
--- 3811,3827 ----
*** 4002,4011 ****
--- 4009,4035 ----
lea(rscratch1, src);
Assembler::xorpd(dst, Address(rscratch1, 0));
}
}
+ void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
+ Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
+ } else {
+ Assembler::xorpd(dst, src);
+ }
+ }
+
+ void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
+ if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
+ Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
+ } else {
+ Assembler::xorps(dst, src);
+ }
+ }
+
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
// Used in sign-bit flipping with aligned address.
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
if (reachable(src)) {
Assembler::xorps(dst, as_Address(src));
*** 4045,4054 ****
--- 4069,4754 ----
lea(rscratch1, src);
vaddss(dst, nds, Address(rscratch1, 0));
}
}
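+ // Absolute value is an AND with the mask at negate_field. The cases below only
+ // shuffle operands into registers below xmm16, where vandps/vandpd have legal
+ // non-EVEX encodings, using src or xmm0 as scratch.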
+ void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if ((dst_enc < 16) && (nds_enc < 16)) {
+ vandps(dst, nds, negate_field, vector_len);
+ } else if ((src_enc < 16) && (dst_enc < 16)) {
+ movss(src, nds);
+ vandps(dst, src, negate_field, vector_len);
+ } else if (src_enc < 16) {
+ movss(src, nds);
+ vandps(src, src, negate_field, vector_len);
+ movss(dst, src);
+ } else if (dst_enc < 16) {
+ movdqu(src, xmm0);
+ movss(xmm0, nds);
+ vandps(dst, xmm0, negate_field, vector_len);
+ movdqu(xmm0, src);
+ } else if (nds_enc < 16) {
+ movdqu(src, xmm0);
+ vandps(xmm0, nds, negate_field, vector_len);
+ movss(dst, xmm0);
+ movdqu(xmm0, src);
+ } else {
+ movdqu(src, xmm0);
+ movss(xmm0, nds);
+ vandps(xmm0, xmm0, negate_field, vector_len);
+ movss(dst, xmm0);
+ movdqu(xmm0, src);
+ }
+ }
+
+ void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if ((dst_enc < 16) && (nds_enc < 16)) {
+ vandpd(dst, nds, negate_field, vector_len);
+ } else if ((src_enc < 16) && (dst_enc < 16)) {
+ movsd(src, nds);
+ vandpd(dst, src, negate_field, vector_len);
+ } else if (src_enc < 16) {
+ movsd(src, nds);
+ vandpd(src, src, negate_field, vector_len);
+ movsd(dst, src);
+ } else if (dst_enc < 16) {
+ movdqu(src, xmm0);
+ movsd(xmm0, nds);
+ vandpd(dst, xmm0, negate_field, vector_len);
+ movdqu(xmm0, src);
+ } else if (nds_enc < 16) {
+ movdqu(src, xmm0);
+ vandpd(xmm0, nds, negate_field, vector_len);
+ movsd(dst, xmm0);
+ movdqu(xmm0, src);
+ } else {
+ movdqu(src, xmm0);
+ movsd(xmm0, nds);
+ vandpd(xmm0, xmm0, negate_field, vector_len);
+ movsd(dst, xmm0);
+ movdqu(xmm0, src);
+ }
+ }
+
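+ // vpaddb/vpaddw/vpsubb/vpsubw/vpmullw, and the word shifts further down, are
+ // byte/word operations; without AVX512BW they have no 512-bit EVEX form, so
+ // operands living in xmm16-xmm31 are first moved into low registers, using
+ // nds, xmm0/xmm1 or a stack slot as scratch.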
+ void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpaddb(dst, nds, src, vector_len);
+ } else if ((dst_enc < 16) && (src_enc < 16)) {
+ Assembler::vpaddb(dst, dst, src, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for src
+ evmovdqul(nds, src, Assembler::AVX_512bit);
+ Assembler::vpaddb(dst, dst, nds, vector_len);
+ } else if ((src_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpaddb(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds as scratch for xmm0 to hold src
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::vpaddb(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpaddb(dst, nds, src, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpaddb(dst, dst, src, vector_len);
+ } else if (nds_enc < 16) {
+ // dst is in the upper bank, so use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpaddb(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs in upper bank
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpaddb(xmm0, xmm0, src, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpaddw(dst, nds, src, vector_len);
+ } else if ((dst_enc < 16) && (src_enc < 16)) {
+ Assembler::vpaddw(dst, dst, src, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for src
+ evmovdqul(nds, src, Assembler::AVX_512bit);
+ Assembler::vpaddw(dst, dst, nds, vector_len);
+ } else if ((src_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpaddw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds as scratch for xmm0 to hold src
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::vpaddw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpaddw(dst, nds, src, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpaddw(dst, dst, src, vector_len);
+ } else if (nds_enc < 16) {
+ // dst is in the upper bank, so use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpaddw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs in upper bank
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpaddw(xmm0, xmm0, src, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsubb(dst, nds, src, vector_len);
+ } else if ((dst_enc < 16) && (src_enc < 16)) {
+ Assembler::vpsubb(dst, dst, src, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for src
+ evmovdqul(nds, src, Assembler::AVX_512bit);
+ Assembler::vpsubb(dst, dst, nds, vector_len);
+ } else if ((src_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsubb(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds as scratch for xmm0 to hold src
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::vpsubb(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsubb(dst, nds, src, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpsubb(dst, dst, src, vector_len);
+ } else if (nds_enc < 16) {
+ // dst is in the upper bank, so use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsubb(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs in upper bank
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsubb(xmm0, xmm0, src, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsubw(dst, nds, src, vector_len);
+ } else if ((dst_enc < 16) && (src_enc < 16)) {
+ Assembler::vpsubw(dst, dst, src, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for src
+ evmovdqul(nds, src, Assembler::AVX_512bit);
+ Assembler::vpsubw(dst, dst, nds, vector_len);
+ } else if ((src_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsubw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds as scratch for xmm0 to hold src
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::vpsubw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsubw(dst, nds, src, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpsubw(dst, dst, src, vector_len);
+ } else if (nds_enc < 16) {
+ // dst is in the upper bank, so use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsubw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs in upper bank
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsubw(xmm0, xmm0, src, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+
+ void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int src_enc = src->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpmullw(dst, nds, src, vector_len);
+ } else if ((dst_enc < 16) && (src_enc < 16)) {
+ Assembler::vpmullw(dst, dst, src, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for src
+ evmovdqul(nds, src, Assembler::AVX_512bit);
+ Assembler::vpmullw(dst, dst, nds, vector_len);
+ } else if ((src_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch for dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpmullw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds as scratch for xmm0 to hold src
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::vpmullw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpmullw(dst, nds, src, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpmullw(dst, dst, src, vector_len);
+ } else if (nds_enc < 16) {
+ // dst is in the upper bank, so use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpmullw(nds, nds, src, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs in upper bank
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpmullw(xmm0, xmm0, src, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
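+ // The variable-shift helpers have one more XMM operand (the shift count) that
+ // may also live in the upper bank, hence the extra cases below.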
+ void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int shift_enc = shift->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsraw(dst, nds, shift, vector_len);
+ } else if ((dst_enc < 16) && (shift_enc < 16)) {
+ Assembler::vpsraw(dst, dst, shift, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds_enc as scratch with shift
+ evmovdqul(nds, shift, Assembler::AVX_512bit);
+ Assembler::vpsraw(dst, dst, nds, vector_len);
+ } else if ((shift_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch with dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsraw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds to save a copy of xmm0 and hold shift
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsraw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else if (nds_enc < 16) {
+ // use nds and dst as temporaries
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsraw(nds, nds, xmm0, vector_len);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, shift, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsraw(dst, nds, shift, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpsraw(dst, dst, shift, vector_len);
+ } else if (nds_enc < 16) {
+ // use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsraw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // use nds as scratch for xmm0
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int shift_enc = shift->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsrlw(dst, nds, shift, vector_len);
+ } else if ((dst_enc < 16) && (shift_enc < 16)) {
+ Assembler::vpsrlw(dst, dst, shift, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds_enc as scratch with shift
+ evmovdqul(nds, shift, Assembler::AVX_512bit);
+ Assembler::vpsrlw(dst, dst, nds, vector_len);
+ } else if ((shift_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch with dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsrlw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds to save a copy of xmm0 and hold shift
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsrlw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else if (nds_enc < 16) {
+ // use nds and dst as temporaries
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsrlw(nds, nds, xmm0, vector_len);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, shift, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsrlw(dst, nds, shift, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpsrlw(dst, dst, shift, vector_len);
+ } else if (nds_enc < 16) {
+ // use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsrlw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // use nds as scratch for xmm0
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ int shift_enc = shift->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsllw(dst, nds, shift, vector_len);
+ } else if ((dst_enc < 16) && (shift_enc < 16)) {
+ Assembler::vpsllw(dst, dst, shift, vector_len);
+ } else if ((dst_enc < 16) && (nds_enc < 16)) {
+ // use nds_enc as scratch with shift
+ evmovdqul(nds, shift, Assembler::AVX_512bit);
+ Assembler::vpsllw(dst, dst, nds, vector_len);
+ } else if ((shift_enc < 16) && (nds_enc < 16)) {
+ // use nds as scratch with dst
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsllw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else if (dst_enc < 16) {
+ // use nds to save a copy of xmm0 and hold shift
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsllw(dst, dst, xmm0, vector_len);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ } else if (nds_enc < 16) {
+ // use nds and dst as temporaries
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, shift, Assembler::AVX_512bit);
+ Assembler::vpsllw(nds, nds, xmm0, vector_len);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // worst case scenario, all regs are in the upper bank
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, shift, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
+ evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+
+ void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
+ int dst_enc = dst->encoding();
+ int nds_enc = nds->encoding();
+ if (VM_Version::supports_avx256only() || VM_Version::supports_avx512bw()) {
+ Assembler::vpsllw(dst, nds, shift, vector_len);
+ } else if (dst_enc < 16) {
+ Assembler::vpsllw(dst, dst, shift, vector_len);
+ } else if (nds_enc < 16) {
+ // use nds as scratch
+ evmovdqul(nds, dst, Assembler::AVX_512bit);
+ Assembler::vpsllw(nds, nds, shift, vector_len);
+ evmovdqul(dst, nds, Assembler::AVX_512bit);
+ } else {
+ // use nds as scratch for xmm0
+ evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+ }
+ }
+
+ // This instruction is used inside other macro sequences, so we cannot control
+ // its operands when it is emitted through those patterns.
+ void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
+ if (VM_Version::supports_avx512nobw()) {
+ int dst_enc = dst->encoding();
+ int src_enc = src->encoding();
+ if (dst_enc == src_enc) {
+ if (dst_enc < 16) {
+ Assembler::punpcklbw(dst, src);
+ } else {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::punpcklbw(xmm0, xmm0);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ } else {
+ if ((src_enc < 16) && (dst_enc < 16)) {
+ Assembler::punpcklbw(dst, src);
+ } else if (src_enc < 16) {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::punpcklbw(xmm0, src);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ } else if (dst_enc < 16) {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::punpcklbw(dst, xmm0);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ } else {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ Assembler::punpcklbw(xmm0, xmm1);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+ } else {
+ Assembler::punpcklbw(dst, src);
+ }
+ }
+
+ // This instruction is used inside other macro sequences, so we cannot control
+ // its operands when it is emitted through those patterns.
+ void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
+ if (VM_Version::supports_avx512nobw()) {
+ int dst_enc = dst->encoding();
+ int src_enc = src->encoding();
+ if (dst_enc == src_enc) {
+ if (dst_enc < 16) {
+ Assembler::pshuflw(dst, src, mode);
+ } else {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::pshuflw(xmm0, xmm0, mode);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ } else {
+ if ((src_enc < 16) && (dst_enc < 16)) {
+ Assembler::pshuflw(dst, src, mode);
+ } else if (src_enc < 16) {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ Assembler::pshuflw(xmm0, src, mode);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ } else if (dst_enc < 16) {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm0, src, Assembler::AVX_512bit);
+ Assembler::pshuflw(dst, xmm0, mode);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ } else {
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+ subptr(rsp, 64);
+ evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+ evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+ evmovdqul(xmm1, src, Assembler::AVX_512bit);
+ Assembler::pshuflw(xmm0, xmm1, mode);
+ evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+ evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+ addptr(rsp, 64);
+ }
+ }
+ } else {
+ Assembler::pshuflw(dst, src, mode);
+ }
+ }
+
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vandpd(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
*** 4128,4162 ****
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movflt(xmm0, nds);
! if (reachable(src)) {
! vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
movflt(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movflt(dst, nds);
! if (reachable(src)) {
! vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
}
} else {
! if (reachable(src)) {
! vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
}
}
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
int nds_enc = nds->encoding();
--- 4828,4847 ----
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movflt(xmm0, nds);
! vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
movflt(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movflt(dst, nds);
! vxorps(dst, dst, src, Assembler::AVX_128bit);
}
} else {
! vxorps(dst, nds, src, Assembler::AVX_128bit);
}
}
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
int nds_enc = nds->encoding();
*** 4167,4201 ****
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movdbl(xmm0, nds);
! if (reachable(src)) {
! vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
movdbl(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movdbl(dst, nds);
! if (reachable(src)) {
! vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
}
} else {
! if (reachable(src)) {
! vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
! } else {
! lea(rscratch1, src);
! vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
! }
}
}
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
--- 4852,4871 ----
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movdbl(xmm0, nds);
! vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
movdbl(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movdbl(dst, nds);
! vxorpd(dst, dst, src, Assembler::AVX_128bit);
}
} else {
! vxorpd(dst, nds, src, Assembler::AVX_128bit);
}
}
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
*** 4683,4702 ****
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
pusha();
// if we are coming from c1, xmm registers may be live
- int off = 0;
int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
if (UseAVX > 2) {
num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
}
if (UseSSE == 1) {
subptr(rsp, sizeof(jdouble)*8);
for (int n = 0; n < 8; n++) {
! movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
}
} else if (UseSSE >= 2) {
if (UseAVX > 2) {
push(rbx);
movl(rbx, 0xffff);
--- 5353,5371 ----
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
pusha();
// if we are coming from c1, xmm registers may be live
int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
if (UseAVX > 2) {
num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
}
if (UseSSE == 1) {
subptr(rsp, sizeof(jdouble)*8);
for (int n = 0; n < 8; n++) {
! movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
}
} else if (UseSSE >= 2) {
if (UseAVX > 2) {
push(rbx);
movl(rbx, 0xffff);
*** 4704,4744 ****
pop(rbx);
}
#ifdef COMPILER2
if (MaxVectorSize > 16) {
if(UseAVX > 2) {
! // Save upper half of ZMM registes
subptr(rsp, 32*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
}
- off = 0;
}
assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
! // Save upper half of YMM registes
subptr(rsp, 16*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
}
}
#endif
// Save whole 128bit (16 bytes) XMM registers
subptr(rsp, 16*num_xmm_regs);
- off = 0;
#ifdef _LP64
! if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(Address(rsp, off++*16), as_XMMRegister(n));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(Address(rsp, off++*16), as_XMMRegister(n));
}
#endif
}
// Preserve registers across runtime call
--- 5373,5411 ----
pop(rbx);
}
#ifdef COMPILER2
if (MaxVectorSize > 16) {
if(UseAVX > 2) {
! // Save upper half of ZMM registers
subptr(rsp, 32*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
}
}
assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
! // Save upper half of YMM registers
subptr(rsp, 16*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
}
}
#endif
// Save whole 128bit (16 bytes) XMM registers
subptr(rsp, 16*num_xmm_regs);
#ifdef _LP64
! if (VM_Version::supports_evex()) {
for (int n = 0; n < num_xmm_regs; n++) {
! vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(Address(rsp, n*16), as_XMMRegister(n));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(Address(rsp, n*16), as_XMMRegister(n));
}
#endif
}
// Preserve registers across runtime call
*** 4803,4850 ****
}
fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
addptr(rsp, sizeof(jdouble)*nb_args);
}
- off = 0;
if (UseSSE == 1) {
for (int n = 0; n < 8; n++) {
! movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
}
addptr(rsp, sizeof(jdouble)*8);
} else if (UseSSE >= 2) {
! // Restore whole 128bit (16 bytes) XMM regiters
#ifdef _LP64
! if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
}
! }
! else {
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(as_XMMRegister(n), Address(rsp, off++*16));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
}
#endif
addptr(rsp, 16*num_xmm_regs);
#ifdef COMPILER2
if (MaxVectorSize > 16) {
! // Restore upper half of YMM registes.
! off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
addptr(rsp, 16*num_xmm_regs);
if(UseAVX > 2) {
- off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
}
addptr(rsp, 32*num_xmm_regs);
}
}
#endif
--- 5470,5513 ----
}
fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
addptr(rsp, sizeof(jdouble)*nb_args);
}
if (UseSSE == 1) {
for (int n = 0; n < 8; n++) {
! movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
}
addptr(rsp, sizeof(jdouble)*8);
} else if (UseSSE >= 2) {
! // Restore whole 128bit (16 bytes) XMM registers
#ifdef _LP64
! if (VM_Version::supports_evex()) {
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
}
! } else {
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(as_XMMRegister(n), Address(rsp, n*16));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
! movdqu(as_XMMRegister(n), Address(rsp, n*16));
}
#endif
addptr(rsp, 16*num_xmm_regs);
#ifdef COMPILER2
if (MaxVectorSize > 16) {
! // Restore upper half of YMM registers.
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
}
addptr(rsp, 16*num_xmm_regs);
if(UseAVX > 2) {
for (int n = 0; n < num_xmm_regs; n++) {
! vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
}
addptr(rsp, 32*num_xmm_regs);
}
}
#endif
*** 7206,7216 ****
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
! evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
} else if (UseAVX == 2 && UseUnalignedLoadStores) {
--- 7869,7879 ----
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
! vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
BIND(L_check_fill_8_bytes);
} else if (UseAVX == 2 && UseUnalignedLoadStores) {