--- old/src/hotspot/cpu/x86/assembler_x86.cpp 2019-04-30 17:18:46.566965497 -0700 +++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2019-04-30 17:18:46.466965500 -0700 @@ -1894,6 +1894,69 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::pabsb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1C); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::pabsw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1D); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::pabsd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1E); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsb(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1C); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1D); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_evex() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1E); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evpabsq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1F); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::decl(Address dst) { // Don't use it directly. Use MacroAssembler::decrement() instead. InstructionMark im(this); @@ -3416,10 +3479,19 @@ InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x00); - emit_int8(0xC0 | encode); + emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } +void Assembler::vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x36); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) { assert(VM_Version::supports_avx2(), 
""); InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3884,6 +3956,14 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::pmovsxbw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x20); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); @@ -3905,6 +3985,15 @@ emit_int8((unsigned char) (0xC0 | encode)); } +void Assembler::vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x20); + emit_int8((unsigned char)(0xC0 | encode)); +} void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); @@ -6277,6 +6366,26 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 2, "requires AVX512"); + assert ((VM_Version::supports_avx512vl() || vector_len == 2), "requires AVX512vl"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x72); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(shift & 0xFF); +} + +void Assembler::evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 2, "requires AVX512"); + assert ((VM_Version::supports_avx512vl() || vector_len == 2), "requires AVX512vl"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xE2); + emit_int8((unsigned char)(0xC0 | encode)); +} // logical operations packed integers void Assembler::pand(XMMRegister dst, XMMRegister src) { --- old/src/hotspot/cpu/x86/assembler_x86.hpp 2019-04-30 17:18:46.998965484 -0700 +++ 
new/src/hotspot/cpu/x86/assembler_x86.hpp 2019-04-30 17:18:46.894965487 -0700 @@ -1102,6 +1102,15 @@ void cvttpd2dq(XMMRegister dst, XMMRegister src); + //Abs of packed Integer values + void pabsb(XMMRegister dst, XMMRegister src); + void pabsw(XMMRegister dst, XMMRegister src); + void pabsd(XMMRegister dst, XMMRegister src); + void vpabsb(XMMRegister dst, XMMRegister src, int vector_len); + void vpabsw(XMMRegister dst, XMMRegister src, int vector_len); + void vpabsd(XMMRegister dst, XMMRegister src, int vector_len); + void evpabsq(XMMRegister dst, XMMRegister src, int vector_len); + // Divide Scalar Double-Precision Floating-Point Values void divsd(XMMRegister dst, Address src); void divsd(XMMRegister dst, XMMRegister src); @@ -1589,6 +1598,7 @@ // Pemutation of 64bit words void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void vpermq(XMMRegister dst, XMMRegister src, int imm8); + void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1668,6 +1678,10 @@ void evpmovdb(Address dst, XMMRegister src, int vector_len); + // Sign extend moves + void pmovsxbw(XMMRegister dst, XMMRegister src); + void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len); + // Multiply add void pmaddwd(XMMRegister dst, XMMRegister src); void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -2094,6 +2108,8 @@ void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister src, 
XMMRegister shift, int vector_len); // And packed integers void pand(XMMRegister dst, XMMRegister src); --- old/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2019-04-30 17:18:47.302965475 -0700 +++ new/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2019-04-30 17:18:47.194965479 -0700 @@ -1003,25 +1003,25 @@ } } -void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-masking with aligned address. assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::andpd(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::andpd(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::andpd(dst, Address(scratch_reg, 0)); } } -void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-masking with aligned address. 
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::andps(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::andps(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::andps(dst, Address(scratch_reg, 0)); } } @@ -3340,13 +3340,13 @@ Assembler::vmovdqu(dst, src); } -void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) { if (reachable(src)) { vmovdqu(dst, as_Address(src)); } else { - lea(rscratch1, src); - vmovdqu(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + vmovdqu(dst, Address(scratch_reg, 0)); } } @@ -3698,14 +3698,14 @@ } } -void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-bit flipping with aligned address. assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::xorpd(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::xorpd(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::xorpd(dst, Address(scratch_reg, 0)); } } @@ -3726,14 +3726,14 @@ } } -void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-bit flipping with aligned address. 
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::xorps(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::xorps(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::xorps(dst, Address(scratch_reg, 0)); } } @@ -3799,12 +3799,12 @@ Assembler::vpaddw(dst, nds, src, vector_len); } -void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { Assembler::vpand(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len); } } @@ -3873,6 +3873,22 @@ Assembler::vpsraw(dst, nds, shift, vector_len); } +void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { + assert(UseAVX > 2,""); + if (!VM_Version::supports_avx512vl() && vector_len < 2) { + vector_len = 2; + } + Assembler::evpsraq(dst, nds, shift, vector_len); +} + +void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { + assert(UseAVX > 2,""); + if (!VM_Version::supports_avx512vl() && vector_len < 2) { + vector_len = 2; + } + Assembler::evpsraq(dst, nds, shift, vector_len); +} + void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); Assembler::vpsrlw(dst, nds, shift, vector_len); @@ -3913,21 +3929,21 @@ Assembler::pshuflw(dst, src, mode); } -void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vandpd(XMMRegister dst, 
XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vandpd(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vandpd(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + vandpd(dst, nds, Address(scratch_reg, 0), vector_len); } } -void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vandps(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vandps(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + vandps(dst, nds, Address(scratch_reg, 0), vector_len); } } @@ -3995,21 +4011,35 @@ vxorpd(dst, nds, src, Assembler::AVX_128bit); } -void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vxorpd(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vxorpd(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + vxorpd(dst, nds, Address(scratch_reg, 0), vector_len); } } -void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vxorps(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vxorps(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + vxorps(dst, nds, Address(scratch_reg, 0), vector_len); + } +} + +void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { + if (UseAVX > 1 || (vector_len < 1)) { + if (reachable(src)) { + 
Assembler::vpxor(dst, nds, as_Address(src), vector_len); + } else { + lea(scratch_reg, src); + Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len); + } + } + else { + MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg); } } --- old/src/hotspot/cpu/x86/macroAssembler_x86.hpp 2019-04-30 17:18:47.662965465 -0700 +++ new/src/hotspot/cpu/x86/macroAssembler_x86.hpp 2019-04-30 17:18:47.550965468 -0700 @@ -877,12 +877,12 @@ // Floating void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); } - void andpd(XMMRegister dst, AddressLiteral src); + void andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void andpd(XMMRegister dst, XMMRegister src) { Assembler::andpd(dst, src); } void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); } void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); } - void andps(XMMRegister dst, AddressLiteral src); + void andps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); } void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); } @@ -1066,8 +1066,8 @@ // these are private because users should be doing movflt/movdbl - void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); } void movss(XMMRegister dst, XMMRegister src) { Assembler::movss(dst, src); } + void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); } void movss(XMMRegister dst, Address src) { Assembler::movss(dst, src); } void movss(XMMRegister dst, AddressLiteral src); @@ -1105,7 +1105,7 @@ void vmovdqu(Address dst, XMMRegister src); void vmovdqu(XMMRegister dst, Address src); void vmovdqu(XMMRegister dst, XMMRegister src); - void vmovdqu(XMMRegister dst, AddressLiteral src); + void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void evmovdquq(XMMRegister dst, Address src, int vector_len) { 
Assembler::evmovdquq(dst, src, vector_len); } void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } @@ -1183,12 +1183,12 @@ // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorpd(XMMRegister dst, Address src) { Assembler::xorpd(dst, src); } - void xorpd(XMMRegister dst, AddressLiteral src); + void xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values void xorps(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); } - void xorps(XMMRegister dst, AddressLiteral src); + void xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); // Shuffle Bytes void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); } @@ -1215,7 +1215,7 @@ void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); } void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); } - void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len); void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); } @@ -1241,6 +1241,9 @@ void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); + void evpsraq(XMMRegister dst, 
XMMRegister nds, int shift, int vector_len); + void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len); @@ -1260,11 +1263,11 @@ void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } - void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } - void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); } @@ -1297,11 +1300,11 @@ void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } - void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } void vxorps(XMMRegister dst, XMMRegister nds, 
Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } - void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2 @@ -1315,6 +1318,7 @@ else Assembler::vxorpd(dst, nds, src, vector_len); } + void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); // Simple version for AVX2 256bit vectors void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); } --- old/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp 2019-04-30 17:18:48.058965453 -0700 +++ new/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp 2019-04-30 17:18:47.954965456 -0700 @@ -602,7 +602,59 @@ return start; } + //--------------------------------------------------------------------------------------------------- + address generate_vector_mask(const char *stub_name, int32_t mask) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + for (int i = 0; i < 16; i++) { + __ emit_data(mask, relocInfo::none, 0); + } + + return start; + } + + address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + for (int i = 0; i < 8; i++) { + __ emit_data(masklo, relocInfo::none, 0); + __ emit_data(maskhi, relocInfo::none, 0); + } + + return start; + } + + //---------------------------------------------------------------------------------------------------- + + address generate_vector_byte_perm_mask(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + 
address start = __ pc(); + + __ emit_data(0x00000001, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000003, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000005, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000007, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000002, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000004, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000006, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + + return start; + } //---------------------------------------------------------------------------------------------------- // Non-destructive plausibility checks for oops @@ -3823,6 +3875,14 @@ //------------------------------------------------------------------------------------------------------------------------ // entry points that are platform specific + StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF); + StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000); + StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF); + StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000); + StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff); + StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); + StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000); + // support for 
verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); --- old/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2019-04-30 17:18:48.470965441 -0700 +++ new/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2019-04-30 17:18:48.370965444 -0700 @@ -979,6 +979,40 @@ return start; } + address generate_vector_mask(const char *stub_name, int64_t mask) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + + return start; + } + + address generate_vector_byte_perm_mask(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data64(0x0000000000000001, relocInfo::none); + __ emit_data64(0x0000000000000003, relocInfo::none); + __ emit_data64(0x0000000000000005, relocInfo::none); + __ emit_data64(0x0000000000000007, relocInfo::none); + __ emit_data64(0x0000000000000000, relocInfo::none); + __ emit_data64(0x0000000000000002, relocInfo::none); + __ emit_data64(0x0000000000000004, relocInfo::none); + __ emit_data64(0x0000000000000006, relocInfo::none); + + return start; + } + // Non-destructive plausibility checks for oops // // Arguments: @@ -5871,6 +5905,13 @@ StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); + StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 
0x7FFFFFFF7FFFFFFF); + StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000); + StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF); + StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000); + StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff); + StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); + StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000); // support for verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); --- old/src/hotspot/cpu/x86/stubRoutines_x86.cpp 2019-04-30 17:18:48.790965431 -0700 +++ new/src/hotspot/cpu/x86/stubRoutines_x86.cpp 2019-04-30 17:18:48.690965434 -0700 @@ -43,6 +43,13 @@ address StubRoutines::x86::_upper_word_mask_addr = NULL; address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; address StubRoutines::x86::_k256_adr = NULL; +address StubRoutines::x86::_vector_short_to_byte_mask = NULL; +address StubRoutines::x86::_vector_float_sign_mask = NULL; +address StubRoutines::x86::_vector_float_sign_flip = NULL; +address StubRoutines::x86::_vector_double_sign_mask = NULL; +address StubRoutines::x86::_vector_double_sign_flip = NULL; +address StubRoutines::x86::_vector_byte_perm_mask = NULL; +address StubRoutines::x86::_vector_long_sign_mask = NULL; #ifdef _LP64 address StubRoutines::x86::_k256_W_adr = NULL; address StubRoutines::x86::_k512_W_addr = NULL; --- old/src/hotspot/cpu/x86/stubRoutines_x86.hpp 2019-04-30 17:18:49.074965423 -0700 +++ new/src/hotspot/cpu/x86/stubRoutines_x86.hpp 2019-04-30 17:18:48.970965426 -0700 @@ -102,6 +102,7 @@ static address double_sign_flip() { return _double_sign_flip; } + #else // !LP64 
private: @@ -139,6 +140,13 @@ //k256 table for sha256 static juint _k256[]; static address _k256_adr; + static address _vector_short_to_byte_mask; + static address _vector_float_sign_mask; + static address _vector_float_sign_flip; + static address _vector_double_sign_mask; + static address _vector_double_sign_flip; + static address _vector_byte_perm_mask; + static address _vector_long_sign_mask; #ifdef _LP64 static juint _k256_W[]; static address _k256_W_adr; @@ -212,6 +220,33 @@ static address upper_word_mask_addr() { return _upper_word_mask_addr; } static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; } static address k256_addr() { return _k256_adr; } + + static address vector_short_to_byte_mask() { + return _vector_short_to_byte_mask; + } + static address vector_float_sign_mask() { + return _vector_float_sign_mask; + } + + static address vector_float_sign_flip() { + return _vector_float_sign_flip; + } + + static address vector_double_sign_mask() { + return _vector_double_sign_mask; + } + + static address vector_double_sign_flip() { + return _vector_double_sign_flip; + } + + static address vector_byte_perm_mask() { + return _vector_byte_perm_mask; + } + + static address vector_long_sign_mask() { + return _vector_long_sign_mask; + } #ifdef _LP64 static address k256_W_addr() { return _k256_W_adr; } static address k512_W_addr() { return _k512_W_addr; } --- old/src/hotspot/cpu/x86/x86.ad 2019-04-30 17:18:49.358965415 -0700 +++ new/src/hotspot/cpu/x86/x86.ad 2019-04-30 17:18:49.254965418 -0700 @@ -1372,14 +1372,240 @@ static address double_signmask() { return (address)double_signmask_pool; } static address double_signflip() { return (address)double_signflip_pool; } #endif + static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); } + static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); } + static address vector_float_signflip() { return 
StubRoutines::x86::vector_float_sign_flip(); } + static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); } + static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); } + static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); } + static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); } + +//============================================================================= + + +typedef void (MacroAssembler::*XX_Inst)(XMMRegister, XMMRegister); +typedef void (MacroAssembler::*XAR_Inst)(XMMRegister, AddressLiteral, Register); +typedef void (MacroAssembler::*XXI_Inst)(XMMRegister, XMMRegister, int); +typedef void (MacroAssembler::*XXAIR_Inst)(XMMRegister, XMMRegister, AddressLiteral, int, Register); +typedef void (MacroAssembler::*XXXI_Inst)(XMMRegister, XMMRegister, XMMRegister, int); + +XX_Inst get_xx_inst(int opcode) { + XX_Inst inst; + switch(opcode) { + case Op_RShiftVB: + case Op_RShiftVS: + return &MacroAssembler::psraw; + case Op_LShiftVB: + case Op_LShiftVS: + return &MacroAssembler::psllw; + case Op_URShiftVB: + case Op_URShiftVS: + return &MacroAssembler::psrlw; + case Op_RShiftVI: + return &MacroAssembler::psrad; + case Op_LShiftVI: + return &MacroAssembler::pslld; + case Op_URShiftVI: + return &MacroAssembler::psrld; + case Op_LShiftVL: + return &MacroAssembler::psllq; + case Op_RShiftVL: + case Op_URShiftVL: + return &MacroAssembler::psrlq; + default: + return NULL; + } +} + +XAR_Inst get_xar_inst(int opcode) { + XAR_Inst inst; + switch(opcode) { + case Op_AbsVF: + return &MacroAssembler::andps; + case Op_AbsVD: + return &MacroAssembler::andpd; + case Op_NegVF: + return &MacroAssembler::xorps; + case Op_NegVD: + return &MacroAssembler::xorpd; + default: + return NULL; + } +} + +XXAIR_Inst get_xxair_inst(int opcode) { + XXAIR_Inst inst; + switch(opcode) { + case Op_AbsVF: + return &MacroAssembler::vandps; + case 
Op_AbsVD: + return &MacroAssembler::vandpd; + case Op_NegVF: + return &MacroAssembler::vxorps; + case Op_NegVD: + return &MacroAssembler::vxorpd; + default: + return NULL; + } +} + +XXXI_Inst get_xxxi_inst(int opcode) { + XXXI_Inst inst; + switch(opcode) { + case Op_RShiftVB: + case Op_RShiftVS: + return &MacroAssembler::vpsraw; + case Op_LShiftVB: + case Op_LShiftVS: + return &MacroAssembler::vpsllw; + case Op_URShiftVB: + case Op_URShiftVS: + return &MacroAssembler::vpsrlw; + case Op_RShiftVI: + return &MacroAssembler::vpsrad; + case Op_LShiftVI: + return &MacroAssembler::vpslld; + case Op_URShiftVI: + return &MacroAssembler::vpsrld; + case Op_RShiftVL: + return &MacroAssembler::evpsraq; + case Op_LShiftVL: + return &MacroAssembler::vpsllq; + case Op_URShiftVL: + return &MacroAssembler::vpsrlq; + default: + return NULL; + } +} + +XX_Inst get_extend_inst(bool sign) { + XX_Inst inst; + if (sign) + inst = &MacroAssembler::pmovsxbw; + else + inst = &MacroAssembler::pmovzxbw; + return inst; +} + +XXI_Inst get_avx_extend_inst(bool sign) { + XXI_Inst inst; + if (sign) + inst = &MacroAssembler::vpmovsxbw; + else + inst = &MacroAssembler::vpmovzxbw; + return inst; +} + +AddressLiteral get_mask(int opcode) { + switch(opcode) { + case Op_AbsVF: + return ExternalAddress(vector_float_signmask()); + case Op_AbsVD: + return ExternalAddress(vector_double_signmask()); + case Op_NegVF: + return ExternalAddress(vector_float_signflip()); + case Op_NegVD: + return ExternalAddress(vector_double_signflip()); + default: + return ExternalAddress(vector_double_signflip()); + } +} +// need a scratch register to load mask TBD +void emit_vshift4Bor8B_code(MacroAssembler& _masm, int opcode, XMMRegister dst, + XMMRegister src, XMMRegister shift, + XMMRegister tmp, Register scratch) { + XX_Inst extendinst = get_extend_inst(opcode == Op_URShiftVB ? 
false : true); + XX_Inst shiftinst = get_xx_inst(opcode); + + (_masm.*extendinst)(tmp, src); + (_masm.*shiftinst)(tmp, shift); + __ movdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch); + __ pand(dst, tmp); + __ packuswb(dst, dst); +} +// need a scratch register to load mask TBD +void emit_vshift16B_code(MacroAssembler& _masm, int opcode, XMMRegister dst, + XMMRegister src, XMMRegister shift, + XMMRegister tmp1, XMMRegister tmp2, Register scratch) { + XX_Inst extendinst = get_extend_inst(opcode == Op_URShiftVB ? false : true); + XX_Inst shiftinst = get_xx_inst(opcode); + + (_masm.*extendinst)(tmp1, src); + (_masm.*shiftinst)(tmp1, shift); + __ pshufd(tmp2, src, 0xE); + (_masm.*extendinst)(tmp2, tmp2); + (_masm.*shiftinst)(tmp2, shift); + __ movdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch); + __ pand(tmp2, dst); + __ pand(dst, tmp1); + __ packuswb(dst, tmp2); +} + + +void emit_vshift16B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst, + XMMRegister src, XMMRegister shift, + XMMRegister tmp, Register scratch) { + XXI_Inst extendinst = get_avx_extend_inst(opcode == Op_URShiftVB ? false : true); + XXXI_Inst shiftinst = get_xxxi_inst(opcode); + + int vector_len = 1; + (_masm.*extendinst)(tmp, src, vector_len); + (_masm.*shiftinst)(tmp, tmp, shift, vector_len); + __ vpand(tmp, tmp, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch); + __ vextracti128_high(dst, tmp); + __ vpackuswb(dst, tmp, dst, 0); +} + +void emit_vshift32B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst, + XMMRegister src, XMMRegister shift, + XMMRegister tmp, Register scratch) { + XXI_Inst extendinst = get_avx_extend_inst(opcode == Op_URShiftVB ? 
false : true); + XXXI_Inst shiftinst = get_xxxi_inst(opcode); + + int vector_len = 1; + __ vextracti128_high(tmp, src); + (_masm.*extendinst)(tmp, tmp, vector_len); + (_masm.*extendinst)(dst, src, vector_len); + (_masm.*shiftinst)(tmp, tmp, shift, vector_len); + (_masm.*shiftinst)(dst, dst, shift, vector_len); + __ vpand(tmp, tmp, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch); + __ vpand(dst, dst, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch); + __ vpackuswb(dst, dst, tmp, vector_len); + __ vpermq(dst, dst, 0xD8, vector_len); +} + +void emit_vshift64B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst, + XMMRegister src, XMMRegister shift, + XMMRegister tmp1, XMMRegister tmp2, Register scratch) { + XXI_Inst extendinst = get_avx_extend_inst(opcode == Op_URShiftVB ? false : true); + XXXI_Inst shiftinst = get_xxxi_inst(opcode); + + int vector_len = 2; + __ vextracti64x4(tmp1, src, 1); + (_masm.*extendinst)(tmp1, tmp1, vector_len); + (_masm.*extendinst)(tmp2, src, vector_len); + (_masm.*shiftinst)(tmp1, tmp1, shift, vector_len); + (_masm.*shiftinst)(tmp2, tmp2, shift, vector_len); + __ vmovdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch); + __ vpbroadcastd(dst, dst, vector_len); + __ vpand(tmp1, tmp1, dst, vector_len); + __ vpand(tmp2, tmp2, dst, vector_len); + __ vpackuswb(dst, tmp1, tmp2, vector_len); + __ evmovdquq(tmp2, ExternalAddress(vector_byte_perm_mask()), vector_len, scratch); + __ vpermq(dst, tmp2, dst, vector_len); +} +//============================================================================= const bool Matcher::match_rule_supported(int opcode) { if (!has_match_rule(opcode)) return false; bool ret_value = true; switch (opcode) { + case Op_AbsVL: + if (UseAVX < 3) + ret_value = false; case Op_PopCountI: case Op_PopCountL: if (!UsePopCountInstruction) @@ -1402,6 +1628,9 @@ if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here ret_value = false; break; + case Op_AbsVB: + 
case Op_AbsVS: + case Op_AbsVI: case Op_AddReductionVI: if (UseSSE < 3) // requires at least SSE3 ret_value = false; @@ -1447,9 +1676,19 @@ ret_value = false; break; case Op_MulAddVS2VI: + case Op_RShiftVL: + case Op_AbsVD: + case Op_NegVD: if (UseSSE < 2) ret_value = false; break; + case Op_MulVB: + case Op_LShiftVB: + case Op_RShiftVB: + case Op_URShiftVB: + if (UseSSE < 4) + ret_value = false; + break; #ifdef _LP64 case Op_MaxD: case Op_MaxF: @@ -1470,24 +1709,42 @@ bool ret_value = match_rule_supported(opcode); if (ret_value) { switch (opcode) { + case Op_AbsVB: case Op_AddVB: case Op_SubVB: if ((vlen == 64) && (VM_Version::supports_avx512bw() == false)) ret_value = false; break; - case Op_URShiftVS: - case Op_RShiftVS: - case Op_LShiftVS: - case Op_MulVS: + case Op_AbsVS: case Op_AddVS: case Op_SubVS: + case Op_MulVS: + case Op_LShiftVS: + case Op_RShiftVS: + case Op_URShiftVS: if ((vlen == 32) && (VM_Version::supports_avx512bw() == false)) ret_value = false; break; + case Op_MulVB: + case Op_LShiftVB: + case Op_RShiftVB: + case Op_URShiftVB: + if ((vlen == 32 && UseAVX < 2) || + ((vlen == 64) && (VM_Version::supports_avx512bw() == false))) + ret_value = false; + break; + case Op_NegVF: + if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) + ret_value = false; + break; case Op_CMoveVF: if (vlen != 8) ret_value = false; break; + case Op_NegVD: + if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) + ret_value = false; + break; case Op_CMoveVD: if (vlen != 4) ret_value = false; @@ -7302,6 +7559,186 @@ // --------------------------------- MUL -------------------------------------- +// Byte vector mul +instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 4); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovsxbw $tmp,$src1\n\t" + "pmovsxbw $dst,$src2\n\t" + "pmullw $tmp,$dst\n\t" + "movdqu 
$dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\t! mul packed4B" %} + ins_encode %{ + __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 8); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovsxbw $tmp,$src1\n\t" + "pmovsxbw $dst,$src2\n\t" + "pmullw $tmp,$dst\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\t! mul packed8B" %} + ins_encode %{ + __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 16); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"pmovsxbw $tmp1,$src1\n\t" + "pmovsxbw $tmp2,$src2\n\t" + "pmullw $tmp1,$tmp2\n\t" + "pshufd $tmp2,$src1,0xEE\n\t" + "pshufd $dst,$src2,0xEE\n\t" + "pmovsxbw $tmp2,$tmp2\n\t" + "pmovsxbw $dst,$dst\n\t" + "pmullw $tmp2,$dst\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $tmp2,$dst\n\t" + "pand $dst,$tmp1\n\t" + "packuswb $dst,$tmp2\t! 
mul packed16B" %} + ins_encode %{ + __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE); + __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE); + __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($tmp2$$XMMRegister, $dst$$XMMRegister); + __ pand($dst$$XMMRegister, $tmp1$$XMMRegister); + __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vpmovsxbw $tmp,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp,$tmp,$dst\n\t" + "vmovdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpand $dst,$dst,$tmp\n\t" + "vextracti128_high $tmp,$dst\n\t" + "vpackuswb $dst,$dst,$dst\n\t! 
mul packed16B" %} + ins_encode %{ + int vector_len = 1; + __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti128_high $tmp1,$src1\n\t" + "vextracti128_high $dst,$src2\n\t" + "vpmovsxbw $tmp1,$tmp1\n\t" + "vpmovsxbw $dst,$dst\n\t" + "vpmullw $tmp1,$tmp1,$dst\n\t" + "vpmovsxbw $tmp2,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp2,$tmp2,$dst\n\t" + "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst, $dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $dst,$dst,$tmp2\n\t" + "vpackuswb $dst,$dst,$tmp1\n\t" + "vpermq $dst, $dst, 0xD8\t! 
mul packed32B" %} + ins_encode %{ + int vector_len = 1; + __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister); + __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti64x4_high $tmp1,$src1\n\t" + "vextracti64x4_high $dst,$src2\n\t" + "vpmovsxbw $tmp1,$tmp1\n\t" + "vpmovsxbw $dst,$dst\n\t" + "vpmullw $tmp1,$tmp1,$dst\n\t" + "vpmovsxbw $tmp2,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp2,$tmp2,$dst\n\t" + "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst, $dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $tmp2,$tmp2,$dst\n\t" + "vpackuswb $dst,$tmp1,$tmp2\n\t" + "evmovdquq $tmp2,[0x0604020007050301]\n\t" + "vpermq $dst,$tmp2,$dst,0x01\t! 
mul packed64B" %} + + ins_encode %{ + int vector_len = 2; + __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister); + __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register); + __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + + %} + ins_pipe( pipe_slow ); +%} + // Shorts/Chars vector mul instruct vmul2S(vecS dst, vecS src) %{ predicate(UseAVX == 0 && n->as_Vector()->length() == 2); @@ -8024,20 +8461,6 @@ ins_pipe( pipe_slow ); %} -// ------------------------------ Shift --------------------------------------- - -// Left and right shift count vectors are the same on x86 -// (only lowest bits of xmm reg are used for count). -instruct vshiftcnt(vecS dst, rRegI cnt) %{ - match(Set dst (LShiftCntV cnt)); - match(Set dst (RShiftCntV cnt)); - format %{ "movd $dst,$cnt\t! 
load shift count" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $cnt$$Register); - %} - ins_pipe( pipe_slow ); -%} - // --------------------------------- Sqrt -------------------------------------- // Floating point vector sqrt @@ -8195,1141 +8618,479 @@ ins_pipe( pipe_slow ); %} -// ------------------------------ LeftShift ----------------------------------- - -// Shorts/Chars vector left shift -instruct vsll2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed2S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed2S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} +// ------------------------------ Shift --------------------------------------- -instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} +// Left and right shift count vectors are the same on x86 +// (only lowest bits of xmm reg are used for count). +instruct vshiftcnt(vecS dst, rRegI cnt) %{ + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "movdl $dst,$cnt\t! load shift count" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + __ movdl($dst$$XMMRegister, $cnt$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed2S" %} +instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{ + match(Set dst cnt); + effect(TEMP tmp); + format %{ "movl $tmp,$cnt\t" + "movdl $dst,$tmp\t! load shift count" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + __ movl($tmp$$Register, $cnt$$constant); + __ movdl($dst$$XMMRegister, $tmp$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed4S" %} +// Byte vector shift +instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovxbw $tmp,$src\n\t" + "shiftop $tmp,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\n\t ! packed4B shift" %} ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); + emit_vshift4Bor8B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! 
left shift packed4S" %} +instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovxbw $tmp,$src\n\t" + "shiftop $tmp,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\n\t ! packed8B shift" %} ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); + emit_vshift4Bor8B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} +instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{ + predicate(UseSSE > 3 && UseAVX <= 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"pmovxbw $tmp1,$src\n\t" + "shiftop $tmp1,$shift\n\t" + "pshufd $tmp2,$src\n\t" + "pmovxbw $tmp2,$tmp2\n\t" + "shiftop $tmp2,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $tmp2,$dst\n\t" + "pand $dst,$tmp1\n\t" + "packuswb $dst,$tmp2\n\t! 
packed16B shift" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + emit_vshift16B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} +instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vpmovxbw $tmp,$src\n\t" + "shiftop $tmp,$tmp,$shift\n\t" + "vpand $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t" + "vextracti128_high $dst,$tmp\n\t" + "vpackuswb $dst,$tmp,$dst\n\t! packed16B shift" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + emit_vshift16B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! 
left shift packed8S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); +instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vextracti128_high $tmp,$src\n\t" + "vpmovxbw $tmp,$tmp\n\t" + "vpmovxbw $dst,$src\n\t" + "shiftop $tmp,$tmp,$shift\n\t" + "shiftop $dst,$dst,$shift\n\t" + "vpand $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t" + "vpand $dst,$dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpackuswb $dst,$dst,$tmp\n\t" + "vpermq $dst,$dst,0xD8\n\t! packed32B shift" %} + ins_encode %{ + emit_vshift32B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed8S" %} +instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti64x4 $tmp1,$src\n\t" + "vpmovxbw $tmp1,$tmp1\n\t" + "vpmovxbw $tmp2,$src\n\t" + "shiftop $tmp1,$tmp1,$shift\n\t" + "shiftop $tmp2,$tmp2,$shift\n\t" + "vmovdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst,$dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $tmp2,$tmp2,$dst\n\t" + "vpackuswb $dst,$tmp1,$tmp2\n\t" + "evmovdquq $tmp2, [0x0604020007050301]\n\t" + "vpermq $dst,$tmp2,$dst\n\t! 
packed64B shift" %} ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); + emit_vshift64B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $scratch$$Register); %} ins_pipe( pipe_slow ); %} -instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); +// Shorts vector logical right shift produces incorrect Java result +// for negative data because Java code converts a short value into int with +// sign extension before a shift. But char vectors are fine since chars are +// unsigned values. +// Shorts/Chars vector shift +instruct vshist2S(vecS dst, vecS src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed8S" %} + match(Set dst (RShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed2S" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movflt($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); +instruct vshift4S(vecD dst, vecD src, vecS shift) %{ + predicate(n->as_Vector()->length() == 4); match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed8S" %} + match(Set dst (RShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed4S" %} ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); +instruct vshift8S(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 8); match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} + match(Set dst (RShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + format %{ "shiftop $dst,$src,$shift\t! 
shift packed8S" %} ins_encode %{ - int vector_len = 1; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ +instruct vshift16S(vecY dst, vecY src, vecS shift) %{ predicate(UseAVX > 1 && n->as_Vector()->length() == 16); match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} + match(Set dst (RShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed16S" %} ins_encode %{ int vector_len = 1; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{ +instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{ predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} + match(Set dst (RShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + format %{ "shiftop $dst,$src,$shift\t! 
shift packed32S" %} ins_encode %{ int vector_len = 2; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} +// Integers vector shift +instruct vshift2I(vecD dst, vecD src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed2I" %} ins_encode %{ - int vector_len = 2; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -// Integers vector left shift -instruct vsll2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed2I" %} +instruct vshift4I(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + format %{ "shiftop $dst,$src,$shift\t! 
shift packed4I" %} ins_encode %{ - __ pslld($dst$$XMMRegister, $shift$$XMMRegister); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -instruct vsll2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed2I" %} +instruct vshift8I(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed8I" %} ins_encode %{ - __ pslld($dst$$XMMRegister, (int)$shift$$constant); + int vector_len = 1; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); +instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} + match(Set dst (RShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + format %{ "shiftop $dst,$src,$shift\t! 
shift packed16I" %} ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + int vector_len = 2; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} +// Longs vector shift +instruct vshift2L(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed2L" %} ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + if (UseAVX == 0) { + XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode()); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } %} ins_pipe( pipe_slow ); %} -instruct vsll4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed4I" %} +instruct vshift4L(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed4L" %} ins_encode %{ - __ pslld($dst$$XMMRegister, $shift$$XMMRegister); + int vector_len = 1; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed4I" %} +instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVL src shift)); + match(Set dst (RShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "shiftop $dst,$src,$shift\t! shift packed8L" %} ins_encode %{ - __ pslld($dst$$XMMRegister, (int)$shift$$constant); + int vector_len = 2; + XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode()); + (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} +// -------------------ArithmeticRightShift ----------------------------------- +// Long vector arithmetic right shift +instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{ + predicate(UseSSE >= 2 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{ "movdqu $dst,$src\n\t" + "psrlq $dst,$shift\n\t" + "movdqu $tmp,[0x8000000000000000]\n\t" + "psrlq $tmp,$shift\n\t" + "pxor $dst,$tmp\n\t" + "psubq $dst,$tmp\t! 
arithmetic right shift packed2L" %} ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ psrlq($dst$$XMMRegister, $shift$$XMMRegister); + __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); + __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister); + __ pxor($dst$$XMMRegister, $tmp$$XMMRegister); + __ psubq($dst$$XMMRegister, $tmp$$XMMRegister); %} ins_pipe( pipe_slow ); %} -instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} +instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + format %{ "evpsraq $dst,$src,$shift\t! arithmetic right shift packed2L" %} ins_encode %{ int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} +instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVL src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{ "vpsrlq $dst,$src,$shift\n\t" + "vmovdqu $tmp,[0x8000000000000000]\n\t" + "vpsrlq $tmp,$tmp,$shift\n\t" + "vpxor $dst,$dst,$tmp\n\t" + "vpsubq $dst,$dst,$tmp\t! 
arithmetic right shift packed4L" %} ins_encode %{ int vector_len = 1; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); + __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} +instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVL src shift)); + format %{ "evpsraq $dst,$src,$shift\t! arithmetic right shift packed4L" %} ins_encode %{ int vector_len = 1; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} +// --------------------------------- AND -------------------------------------- + +instruct vand4B(vecS dst, vecS src) %{ + predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV dst src)); + format %{ "pand $dst,$src\t! 
and vectors (4 bytes)" %} ins_encode %{ - int vector_len = 2; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + __ pand($dst$$XMMRegister, $src$$XMMRegister); %} ins_pipe( pipe_slow ); %} -instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} +instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! and vectors (4 bytes)" %} ins_encode %{ - int vector_len = 2; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -// Longs vector left shift -instruct vsll2L(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL dst shift)); - format %{ "psllq $dst,$shift\t! left shift packed2L" %} +instruct vand4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! and vectors (4 bytes)" %} ins_encode %{ - __ psllq($dst$$XMMRegister, $shift$$XMMRegister); + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vsll2L_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL dst shift)); - format %{ "psllq $dst,$shift\t! left shift packed2L" %} +instruct vand8B(vecD dst, vecD src) %{ + predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV dst src)); + format %{ "pand $dst,$src\t! 
and vectors (8 bytes)" %} ins_encode %{ - __ psllq($dst$$XMMRegister, (int)$shift$$constant); + __ pand($dst$$XMMRegister, $src$$XMMRegister); %} ins_pipe( pipe_slow ); %} -instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// ----------------------- LogicalRightShift ----------------------------------- - -// Shorts vector logical right shift produces incorrect Java result -// for negative data because java code convert short value into int with -// sign extension before a shift. But char vectors are fine since chars are -// unsigned values. - -instruct vsrl2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed2S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed2S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed4S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed4S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! 
logical right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Integers vector logical right shift -instruct vsrl2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed2I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! 
logical right shift packed2I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed4I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed4I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Longs vector logical right shift -instruct vsrl2L(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL dst shift)); - format %{ "psrlq $dst,$shift\t! logical right shift packed2L" %} - ins_encode %{ - __ psrlq($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL dst shift)); - format %{ "psrlq $dst,$shift\t! logical right shift packed2L" %} - ins_encode %{ - __ psrlq($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! 
logical right shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// ------------------- ArithmeticRightShift ----------------------------------- - -// Shorts/Chars vector arithmetic right shift -instruct vsra2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! 
arithmetic right shift packed2S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Integers vector arithmetic right shift -instruct vsra2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! 
arithmetic right shift packed4I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! 
arithmetic right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// There are no longs vector arithmetic right shift instructions. - - -// --------------------------------- AND -------------------------------------- - -instruct vand4B(vecS dst, vecS src) %{ - predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4); - match(Set dst (AndV dst src)); - format %{ "pand $dst,$src\t! and vectors (4 bytes)" %} - ins_encode %{ - __ pand($dst$$XMMRegister, $src$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{ - predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); - match(Set dst (AndV src1 src2)); - format %{ "vpand $dst,$src1,$src2\t! 
and vectors (4 bytes)" %} - ins_encode %{ - int vector_len = 0; - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vand4B_mem(vecS dst, vecS src, memory mem) %{ - predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); - match(Set dst (AndV src (LoadVector mem))); - format %{ "vpand $dst,$src,$mem\t! and vectors (4 bytes)" %} - ins_encode %{ - int vector_len = 0; - __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vand8B(vecD dst, vecD src) %{ - predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8); - match(Set dst (AndV dst src)); - format %{ "pand $dst,$src\t! and vectors (8 bytes)" %} - ins_encode %{ - __ pand($dst$$XMMRegister, $src$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{ - predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); - match(Set dst (AndV src1 src2)); - format %{ "vpand $dst,$src1,$src2\t! and vectors (8 bytes)" %} +instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV src1 src2)); + format %{ "vpand $dst,$src1,$src2\t! 
and vectors (8 bytes)" %} ins_encode %{ int vector_len = 0; __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); @@ -9707,6 +9468,305 @@ %} ins_pipe( pipe_slow ); %} + +// --------------------------------- ABS -------------------------------------- +// a = |a| +instruct vabs4B_reg(vecS dst, vecS src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8B_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16B_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs32B_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (AbsVB src)); + format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %} + ins_encode %{ + int vector_len = 1; + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs64B_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (AbsVB src)); + format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2S_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVS src)); + format %{ 
"pabsw $dst,$src\t# $dst = |$src| abs packed2S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4S_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVS src)); + format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8S_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVS src)); + format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16S_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (AbsVS src)); + format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %} + ins_encode %{ + int vector_len = 1; + __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs32S_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (AbsVS src)); + format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2I_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVI src)); + format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %} + ins_encode %{ + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4I_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVI src)); + format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %} + ins_encode %{ + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( 
pipe_slow ); +%} + +instruct vabs8I_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AbsVI src)); + format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %} + ins_encode %{ + int vector_len = 1; + __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16I_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVI src)); + format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2L_reg(vecX dst, vecX src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %} + ins_encode %{ + int vector_len = 0; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4L_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %} + ins_encode %{ + int vector_len = 1; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8L_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %} + ins_encode %{ + int vector_len = 2; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- ABSNEG -------------------------------------- + +instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{ + predicate(UseSSE >= 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVD src)); + match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ 
"and(xor)pd $dst,$src,[mask]\t# absneg packed2D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XAR_Inst opinst = get_xar_inst(opcode); + AddressLiteral adr = get_mask(opcode); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AbsVD src)); + match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ "vand(xor)pd $dst,$src,[mask]\t# absneg packed4D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XXAIR_Inst opinst = get_xxair_inst(opcode); + AddressLiteral adr = get_mask(opcode); + int vector_len = 1; + (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVD src)); + match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ "vand(xor)pd $dst,$src,[mask]\t# absneg packed8D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XXAIR_Inst opinst = get_xxair_inst(opcode); + AddressLiteral adr = get_mask(opcode); + int vector_len = 2; + (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{ + predicate(UseSSE > 0 && n->as_Vector()->length() == 2); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "and(xor)ps $dst,$src,[mask]\t# absneg packed2F" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XAR_Inst opinst = get_xar_inst(opcode); + AddressLiteral adr = get_mask(opcode); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ 
movdqu($dst$$XMMRegister, $src$$XMMRegister); + (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg4F(vecX dst, rRegI scratch) %{ + predicate(UseSSE > 0 && n->as_Vector()->length() == 4); + match(Set dst (AbsVF dst)); + match(Set dst (NegVF dst)); + effect(TEMP scratch); + format %{ "vand(xor)ps $dst,[mask]\t# absneg packed4F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XAR_Inst opinst = get_xar_inst(opcode); + AddressLiteral adr = get_mask(opcode); + (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "vand(xor)ps $dst,$src,[mask]\t# absneg packed8F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XXAIR_Inst opinst = get_xxair_inst(opcode); + AddressLiteral adr = get_mask(opcode); + int vector_len = 1; + (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "vand(xor)ps $dst,$src,[mask]\t# absneg packed16F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + XXAIR_Inst opinst = get_xxair_inst(opcode); + AddressLiteral adr = get_mask(opcode); + int vector_len = 2; + (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} // --------------------------------- FMA -------------------------------------- --- old/src/hotspot/cpu/x86/x86_32.ad 2019-04-30 17:18:49.906965398 -0700 +++ 
new/src/hotspot/cpu/x86/x86_32.ad 2019-04-30 17:18:49.806965401 -0700 @@ -8949,6 +8949,28 @@ ins_pipe(ialu_reg_reg_alu0); %} +// Integer Absolute Instructions +instruct absI_rReg(rRegI dst, rRegI src, rRegI tmp, eFlagsReg cr) +%{ + match(Set dst (AbsI src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movl $tmp, $src\n\t" + "sarl $tmp, 31\n\t" + "movl $dst, $src\n\t" + "xorl $dst, $tmp\n\t" + "subl $dst, $tmp\n" + %} + ins_encode %{ + __ movl($tmp$$Register, $src$$Register); + __ sarl($tmp$$Register, 31); + __ movl($dst$$Register, $src$$Register); + __ xorl($dst$$Register, $tmp$$Register); + __ subl($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + //----------Long Instructions------------------------------------------------ // Add Long Register with Register instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ --- old/src/hotspot/cpu/x86/x86_64.ad 2019-04-30 17:18:50.338965386 -0700 +++ new/src/hotspot/cpu/x86/x86_64.ad 2019-04-30 17:18:50.242965388 -0700 @@ -8181,6 +8181,52 @@ ins_pipe( pipe_cmpxchg ); %} +//----------Abs Instructions------------------------------------------- + +// Integer Absolute Instructions +instruct absI_rReg(rRegI dst, rRegI src, rRegI tmp, rFlagsReg cr) +%{ + match(Set dst (AbsI src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movl $tmp, $src\n\t" + "sarl $tmp, 31\n\t" + "movl $dst, $src\n\t" + "xorl $dst, $tmp\n\t" + "subl $dst, $tmp\n" + %} + ins_encode %{ + __ movl($tmp$$Register, $src$$Register); + __ sarl($tmp$$Register, 31); + __ movl($dst$$Register, $src$$Register); + __ xorl($dst$$Register, $tmp$$Register); + __ subl($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + +// Long Absolute Instructions +instruct absL_rReg(rRegL dst, rRegL src, rRegL tmp, rFlagsReg cr) +%{ + match(Set dst (AbsL src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movq $tmp, $src\n\t" + "sarq $tmp, 63\n\t" + "movq $dst, $src\n\t" + "xorq $dst, $tmp\n\t" + "subq $dst, $tmp\n" + %} + 
ins_encode %{ + __ movq($tmp$$Register, $src$$Register); + __ sarq($tmp$$Register, 63); + __ movq($dst$$Register, $src$$Register); + __ xorq($dst$$Register, $tmp$$Register); + __ subq($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + //----------Subtraction Instructions------------------------------------------- // Integer Subtraction Instructions --- old/src/hotspot/share/adlc/formssel.cpp 2019-04-30 17:18:50.770965373 -0700 +++ new/src/hotspot/share/adlc/formssel.cpp 2019-04-30 17:18:50.670965376 -0700 @@ -3808,7 +3808,7 @@ "MaxI","MinI","MaxF","MinF","MaxD","MinD", "MaxV", "MinV", "MulI","MulL","MulF","MulD", - "MulVS","MulVI","MulVL","MulVF","MulVD", + "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD", "OrI","OrL", "OrV", "XorI","XorL", @@ -4175,10 +4175,10 @@ static const char *vector_list[] = { "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD", "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD", - "MulVS","MulVI","MulVL","MulVF","MulVD", + "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD", "CMoveVD", "CMoveVF", "DivVF","DivVD", - "AbsVF","AbsVD", + "AbsVB","AbsVS","AbsVI","AbsVL","AbsVF","AbsVD", "NegVF","NegVD", "SqrtVD","SqrtVF", "AndV" ,"XorV" ,"OrV", --- old/src/hotspot/share/classfile/vmSymbols.cpp 2019-04-30 17:18:51.178965361 -0700 +++ new/src/hotspot/share/classfile/vmSymbols.cpp 2019-04-30 17:18:51.078965364 -0700 @@ -363,6 +363,9 @@ case vmIntrinsics::_isInstance: case vmIntrinsics::_currentThread: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: @@ -404,6 +407,9 @@ case vmIntrinsics::_longBitsToDouble: case vmIntrinsics::_currentThread: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: @@ -567,6 +573,9 @@ case vmIntrinsics::_doubleToRawLongBits: case 
vmIntrinsics::_longBitsToDouble: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: --- old/src/hotspot/share/classfile/vmSymbols.hpp 2019-04-30 17:18:51.554965350 -0700 +++ new/src/hotspot/share/classfile/vmSymbols.hpp 2019-04-30 17:18:51.458965352 -0700 @@ -473,6 +473,7 @@ template(float_int_signature, "(F)I") \ template(double_long_signature, "(D)J") \ template(double_double_signature, "(D)D") \ + template(float_float_signature, "(F)F") \ template(int_float_signature, "(I)F") \ template(long_int_signature, "(J)I") \ template(long_long_signature, "(J)J") \ @@ -772,6 +773,9 @@ do_name(fma_name, "fma") \ \ do_intrinsic(_dabs, java_lang_Math, abs_name, double_double_signature, F_S) \ + do_intrinsic(_fabs, java_lang_Math, abs_name, float_float_signature, F_S) \ + do_intrinsic(_iabs, java_lang_Math, abs_name, int_int_signature, F_S) \ + do_intrinsic(_labs, java_lang_Math, abs_name, long_long_signature, F_S) \ do_intrinsic(_dsin, java_lang_Math, sin_name, double_double_signature, F_S) \ do_intrinsic(_dcos, java_lang_Math, cos_name, double_double_signature, F_S) \ do_intrinsic(_dtan, java_lang_Math, tan_name, double_double_signature, F_S) \ --- old/src/hotspot/share/opto/c2compiler.cpp 2019-04-30 17:18:51.946965338 -0700 +++ new/src/hotspot/share/opto/c2compiler.cpp 2019-04-30 17:18:51.846965341 -0700 @@ -460,6 +460,9 @@ case vmIntrinsics::_dcos: case vmIntrinsics::_dtan: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_datan2: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dexp: --- old/src/hotspot/share/opto/classes.hpp 2019-04-30 17:18:52.322965327 -0700 +++ new/src/hotspot/share/opto/classes.hpp 2019-04-30 17:18:52.222965330 -0700 @@ -30,6 +30,7 @@ macro(AbsD) macro(AbsF) macro(AbsI) +macro(AbsL) macro(AddD) macro(AddF) macro(AddI) @@ -335,6 +336,7 @@ 
macro(SubVL) macro(SubVF) macro(SubVD) +macro(MulVB) macro(MulVS) macro(MulVI) macro(MulReductionVI) @@ -349,6 +351,10 @@ macro(FmaVF) macro(DivVF) macro(DivVD) +macro(AbsVB) +macro(AbsVS) +macro(AbsVI) +macro(AbsVL) macro(AbsVF) macro(AbsVD) macro(NegVF) --- old/src/hotspot/share/opto/library_call.cpp 2019-04-30 17:18:52.798965313 -0700 +++ new/src/hotspot/share/opto/library_call.cpp 2019-04-30 17:18:52.690965316 -0700 @@ -227,6 +227,7 @@ bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName); bool inline_math_native(vmIntrinsics::ID id); bool inline_math(vmIntrinsics::ID id); + bool inline_double_math(vmIntrinsics::ID id); template bool inline_math_overflow(Node* arg1, Node* arg2); void inline_math_mathExact(Node* math, Node* test); @@ -533,6 +534,9 @@ case vmIntrinsics::_dcos: case vmIntrinsics::_dtan: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_datan2: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dexp: @@ -1793,7 +1797,7 @@ // public static double Math.sqrt(double) // public static double Math.log(double) // public static double Math.log10(double) -bool LibraryCallKit::inline_math(vmIntrinsics::ID id) { +bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) { Node* arg = round_double_node(argument(0)); Node* n = NULL; switch (id) { @@ -1805,6 +1809,23 @@ return true; } +//------------------------------inline_math----------------------------------- +// public static float Math.abs(float) +// public static int Math.abs(int) +// public static long Math.abs(long) +bool LibraryCallKit::inline_math(vmIntrinsics::ID id) { + Node* arg = argument(0); + Node* n = NULL; + switch (id) { + case vmIntrinsics::_fabs: n = new AbsFNode( arg); break; + case vmIntrinsics::_iabs: n = new AbsINode( arg); break; + case vmIntrinsics::_labs: n = new AbsLNode( arg); break; + default: fatal_unexpected_iid(id); break; + } + set_result(_gvn.transform(n)); + return 
true; +} + //------------------------------runtime_math----------------------------- bool LibraryCallKit::runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName) { assert(call_type == OptoRuntime::Math_DD_D_Type() || call_type == OptoRuntime::Math_D_D_Type(), @@ -1855,8 +1876,11 @@ runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog10), "LOG10"); // These intrinsics are supported on all hardware - case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false; - case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_math(id) : false; + case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_double_math(id) : false; + case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_double_math(id) : false; + case vmIntrinsics::_fabs: return Matcher::match_rule_supported(Op_AbsF) ? inline_math(id) : false; + case vmIntrinsics::_iabs: return Matcher::match_rule_supported(Op_AbsI) ? inline_math(id) : false; + case vmIntrinsics::_labs: return Matcher::match_rule_supported(Op_AbsL) ? inline_math(id) : false; case vmIntrinsics::_dexp: return StubRoutines::dexp() != NULL ? --- old/src/hotspot/share/opto/subnode.hpp 2019-04-30 17:18:53.314965297 -0700 +++ new/src/hotspot/share/opto/subnode.hpp 2019-04-30 17:18:53.214965300 -0700 @@ -350,6 +350,17 @@ virtual uint ideal_reg() const { return Op_RegI; } }; +//------------------------------AbsLNode--------------------------------------- +// Absolute value of a long. Since a naive graph involves control flow, we +// "match" it in the ideal world (so the control flow can be removed).
+class AbsLNode : public AbsNode { +public: + AbsLNode( Node *in1 ) : AbsNode(in1) {} + virtual int Opcode() const; + const Type *bottom_type() const { return TypeLong::LONG; } + virtual uint ideal_reg() const { return Op_RegL; } +}; + //------------------------------AbsFNode--------------------------------------- // Absolute value a float, a common float-point idiom with a cheap hardware // implemention on most chips. Since a naive graph involves control flow, we --- old/src/hotspot/share/opto/superword.cpp 2019-04-30 17:18:53.694965286 -0700 +++ new/src/hotspot/share/opto/superword.cpp 2019-04-30 17:18:53.594965289 -0700 @@ -2453,6 +2453,7 @@ } } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || + opc == Op_AbsI || opc == Op_AbsL || opc == Op_NegF || opc == Op_NegD || opc == Op_PopCountI) { assert(n->req() == 2, "only one input expected"); --- old/src/hotspot/share/opto/vectornode.cpp 2019-04-30 17:18:54.006965277 -0700 +++ new/src/hotspot/share/opto/vectornode.cpp 2019-04-30 17:18:53.906965280 -0700 @@ -70,8 +70,8 @@ return Op_SubVD; case Op_MulI: switch (bt) { - case T_BOOLEAN: - case T_BYTE: return 0; // Unimplemented + case T_BOOLEAN:return 0; + case T_BYTE: return Op_MulVB; case T_CHAR: case T_SHORT: return Op_MulVS; case T_INT: return Op_MulVI; @@ -104,6 +104,18 @@ case Op_DivD: assert(bt == T_DOUBLE, "must be"); return Op_DivVD; + case Op_AbsI: + switch (bt) { + case T_BOOLEAN: + case T_CHAR: return 0; // abs does not make sense for unsigned + case T_BYTE: return Op_AbsVB; + case T_SHORT: return Op_AbsVS; + case T_INT: return Op_AbsVI; + default: ShouldNotReachHere(); return 0; + } + case Op_AbsL: + assert(bt == T_LONG, "must be"); + return Op_AbsVL; case Op_AbsF: assert(bt == T_FLOAT, "must be"); return Op_AbsVF; @@ -350,6 +362,7 @@ case Op_SubVF: return new SubVFNode(n1, n2, vt); case Op_SubVD: return new SubVDNode(n1, n2, vt); + case Op_MulVB: return new MulVBNode(n1, n2, vt); case Op_MulVS: return new MulVSNode(n1, n2, 
vt); case Op_MulVI: return new MulVINode(n1, n2, vt); case Op_MulVL: return new MulVLNode(n1, n2, vt); @@ -359,6 +372,10 @@ case Op_DivVF: return new DivVFNode(n1, n2, vt); case Op_DivVD: return new DivVDNode(n1, n2, vt); + case Op_AbsVB: return new AbsVBNode(n1, vt); + case Op_AbsVS: return new AbsVSNode(n1, vt); + case Op_AbsVI: return new AbsVINode(n1, vt); + case Op_AbsVL: return new AbsVLNode(n1, vt); case Op_AbsVF: return new AbsVFNode(n1, vt); case Op_AbsVD: return new AbsVDNode(n1, vt); --- old/src/hotspot/share/opto/vectornode.hpp 2019-04-30 17:18:54.286965269 -0700 +++ new/src/hotspot/share/opto/vectornode.hpp 2019-04-30 17:18:54.190965271 -0700 @@ -224,6 +224,14 @@ virtual int Opcode() const; }; +//------------------------------MulVBNode-------------------------------------- +// Vector multiply byte +class MulVBNode : public VectorNode { + public: + MulVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + //------------------------------MulVSNode-------------------------------------- // Vector multiply short class MulVSNode : public VectorNode { @@ -360,6 +368,38 @@ virtual int Opcode() const; }; +//------------------------------AbsVBNode-------------------------------------- +// Vector Abs byte +class AbsVBNode : public VectorNode { +public: + AbsVBNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AbsVSNode-------------------------------------- +// Vector Abs short +class AbsVSNode : public VectorNode { +public: + AbsVSNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AbsVINode-------------------------------------- +// Vector Abs int +class AbsVINode : public VectorNode { +public: + AbsVINode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + 
+//------------------------------AbsVLNode-------------------------------------- +// Vector Abs long +class AbsVLNode : public VectorNode { +public: + AbsVLNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + //------------------------------AbsVFNode-------------------------------------- // Vector Abs float class AbsVFNode : public VectorNode { --- old/src/hotspot/share/runtime/vmStructs.cpp 2019-04-30 17:18:54.678965257 -0700 +++ new/src/hotspot/share/runtime/vmStructs.cpp 2019-04-30 17:18:54.570965260 -0700 @@ -1758,6 +1758,10 @@ declare_c2_type(ReverseBytesLNode, Node) \ declare_c2_type(ReductionNode, Node) \ declare_c2_type(VectorNode, Node) \ + declare_c2_type(AbsVBNode, VectorNode) \ + declare_c2_type(AbsVSNode, VectorNode) \ + declare_c2_type(AbsVINode, VectorNode) \ + declare_c2_type(AbsVLNode, VectorNode) \ declare_c2_type(AddVBNode, VectorNode) \ declare_c2_type(AddVSNode, VectorNode) \ declare_c2_type(AddVINode, VectorNode) \ @@ -1774,6 +1778,7 @@ declare_c2_type(SubVLNode, VectorNode) \ declare_c2_type(SubVFNode, VectorNode) \ declare_c2_type(SubVDNode, VectorNode) \ + declare_c2_type(MulVBNode, VectorNode) \ declare_c2_type(MulVSNode, VectorNode) \ declare_c2_type(MulVLNode, VectorNode) \ declare_c2_type(MulReductionVLNode, ReductionNode) \ @@ -1782,6 +1787,8 @@ declare_c2_type(MulVFNode, VectorNode) \ declare_c2_type(MulReductionVFNode, ReductionNode) \ declare_c2_type(MulVDNode, VectorNode) \ + declare_c2_type(NegVFNode, VectorNode) \ + declare_c2_type(NegVDNode, VectorNode) \ declare_c2_type(FmaVDNode, VectorNode) \ declare_c2_type(FmaVFNode, VectorNode) \ declare_c2_type(CMoveVFNode, VectorNode) \ --- old/src/java.base/share/classes/java/lang/Math.java 2019-04-30 17:18:55.166965243 -0700 +++ new/src/java.base/share/classes/java/lang/Math.java 2019-04-30 17:18:55.062965246 -0700 @@ -1353,6 +1353,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the 
argument. */ + @HotSpotIntrinsicCandidate public static int abs(int a) { return (a < 0) ? -a : a; } @@ -1370,6 +1371,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the argument. */ + @HotSpotIntrinsicCandidate public static long abs(long a) { return (a < 0) ? -a : a; } @@ -1394,6 +1396,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the argument. */ + @HotSpotIntrinsicCandidate public static float abs(float a) { return (a <= 0.0F) ? 0.0F - a : a; } --- old/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java 2019-04-30 17:18:55.562965231 -0700 +++ new/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java 2019-04-30 17:18:55.458965234 -0700 @@ -86,6 +86,7 @@ test_divc_n(a0, a1); test_divv(a0, a1, -VALUE); test_diva(a0, a1, a3); + test_negc(a0, a1); } // Test and verify results System.out.println("Verification"); @@ -339,6 +340,16 @@ for (int i=12; i 0) @@ -469,6 +481,13 @@ end = System.currentTimeMillis(); System.out.println("test_diva_n: " + (end - start)); + start = System.currentTimeMillis(); + for (int i=0; i