--- old/src/cpu/x86/vm/assembler_x86.cpp	2015-06-02 20:15:00.734684000 -0700
+++ new/src/cpu/x86/vm/assembler_x86.cpp	2015-06-02 20:15:00.413266700 -0700
@@ -1347,7 +1347,9 @@
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, false, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -1355,7 +1357,9 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(dst, src1, src2, false);
+  vex_prefix(src2, src1->encoding(), dst->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, false,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
 
@@ -1382,7 +1386,9 @@
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
+  int encode = vex_prefix_and_encode(rbx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, false, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -1390,14 +1396,18 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rbx, dst, src, false);
+  vex_prefix(src, dst->encoding(), rbx->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, false,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
+  int encode = vex_prefix_and_encode(rdx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, false, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -1412,7 +1422,9 @@
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
+  int encode = vex_prefix_and_encode(rcx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, false, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -1420,7 +1432,9 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rcx, dst, src, false);
+  vex_prefix(src, dst->encoding(), rcx->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, false,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
 
@@ -3099,15 +3113,16 @@
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                      false, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -3119,7 +3134,7 @@
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
@@ -3128,7 +3143,7 @@
   assert(VM_Version::supports_avx(), "");
   int vector_len = AVX_256bit;
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
-                                     vector_len, VEX_OPCODE_0F_38);
+                                     vector_len, VEX_OPCODE_0F_38, true, false);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -4972,7 +4987,51 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-// duplicate 4-bytes integer data from src into 8 locations in dest
+// duplicate 1-byte integer data from src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x78);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_8bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x78);
+  emit_operand(dst, src);
+}
+
+// duplicate 2-byte integer data from src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x79);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_16bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x79);
+  emit_operand(dst, src);
+}
+
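+// Note: in the Address forms above, tuple_type and input_size_in_bits are
+// presumed to describe the memory operand so that EVEX disp8*N compression
+// can scale the displacement by the element size.
+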
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
 void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
@@ -4981,6 +5040,121 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x58);
+  emit_operand(dst, src);
+}
+
+// duplicate 8-byte integer data from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x59);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x59);
+  emit_operand(dst, src);
+}
+
+// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x18);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_32bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x18);
+  emit_operand(dst, src);
+}
+
+// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  tuple_type = EVEX_T1S;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+  emit_int8(0x19);
+  emit_operand(dst, src);
+}
+
+// duplicate 1-byte integer data from src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7A);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 2-byte integer data from src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7B);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, false, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 8-byte integer data from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
+  emit_int8(0x7C);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
 
@@ -5591,7 +5765,7 @@
 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc,
                            VexSimdPrefix pre, VexOpcode opc,
                            bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
-  bool vex_r = (xreg_enc >= 8);
+  bool vex_r = ((xreg_enc & 8) == 8);
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
   avx_vector_len = vector_len;
 
@@ -5619,8 +5793,8 @@
 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                                      VexSimdPrefix pre, VexOpcode opc,
                                      bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
-  bool vex_r = (dst_enc >= 8);
-  bool vex_b = (src_enc >= 8);
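+  // With EVEX there are 32 xmm registers, so an encoding >= 8 no longer
+  // implies that bit 3 is set; test bit 3 explicitly (the higher encoding
+  // bit is carried separately in the EVEX prefix).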
+  bool vex_r = ((dst_enc & 8) == 8);
+  bool vex_b = ((src_enc & 8) == 8);
   bool vex_x = false;
   avx_vector_len = vector_len;
 
@@ -6265,19 +6439,19 @@
 
 void Assembler::andnq(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, true, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::andnq(Register dst, Register src1, Address src2) {
-  if (VM_Version::supports_evex()) {
-    tuple_type = EVEX_T1S;
-    input_size_in_bits = EVEX_64bit;
-  }
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(dst, src1, src2);
+  vex_prefix(src2, src1->encoding(), dst->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, true,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
 
@@ -6304,7 +6478,9 @@
 void Assembler::blsiq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+  int encode = vex_prefix_and_encode(rbx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, true, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6312,14 +6488,18 @@
 void Assembler::blsiq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rbx, dst, src);
+  vex_prefix(src, dst->encoding(), rbx->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, true,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+  int encode = vex_prefix_and_encode(rdx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, true, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6327,14 +6507,18 @@
 void Assembler::blsmskq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rdx, dst, src);
+  vex_prefix(src, dst->encoding(), rdx->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, true,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrq(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+  int encode = vex_prefix_and_encode(rcx->encoding(), dst->encoding(), src->encoding(),
+                                     VEX_SIMD_NONE, VEX_OPCODE_0F_38, true, AVX_128bit,
+                                     true, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -6342,7 +6526,9 @@
 void Assembler::blsrq(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38_q(rcx, dst, src);
+  vex_prefix(src, dst->encoding(), rcx->encoding(),
+             VEX_SIMD_NONE, VEX_OPCODE_0F_38, true,
+             AVX_128bit, true, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
--- old/src/cpu/x86/vm/assembler_x86.hpp	2015-06-02 20:15:27.949016700 -0700
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2015-06-02 20:15:27.302176900 -0700
@@ -2021,8 +2021,25 @@
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
-  // duplicate 4-bytes integer data from src into vector_len locations in dest
+  // duplicate n-byte integer data from src into vector_len locations in dest
+  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
   void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+
+  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
+  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
 
   // Carry-Less Multiplication Quadword
   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
--- old/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	2015-06-02 20:16:52.892299600 -0700
+++ new/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	2015-06-02 20:16:52.166357100 -0700
@@ -365,22 +365,22 @@
   map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
   map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
   if (UseAVX > 2) {
-    map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
-    map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
+    map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
   }
 }
 
@@ -466,7 +466,7 @@
     __ vinsertf64x4h(xmm29, Address(rsp, 928));
     __ vinsertf64x4h(xmm30, Address(rsp, 960));
     __ vinsertf64x4h(xmm31, Address(rsp, 992));
-    __ subptr(rsp, 1024);
+    __ addptr(rsp, 1024);
   }
 }
 #else
--- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-06-02 20:17:34.225963600 -0700
+++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp	2015-06-02 20:17:33.928577500 -0700
@@ -382,8 +382,15 @@
 
     // restore regs belonging to calling function
 #ifdef _WIN64
-    for (int i = 15; i >= 6; i--) {
-      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    if (UseAVX > 2) {
+      for (int i = 6; i <= 31; i++) {
+        __ movdqu(as_XMMRegister(i), xmm_save(i));
+      }
+    } else {
+      for (int i = 6; i <= 15; i++) {
+        __ movdqu(as_XMMRegister(i), xmm_save(i));
+      }
     }
 #endif
     __ movptr(r15, r15_save);
--- old/src/cpu/x86/vm/x86.ad	2015-06-02 20:17:36.426821000 -0700
+++ new/src/cpu/x86/vm/x86.ad	2015-06-02 20:17:36.121424500 -0700
@@ -2894,45 +2894,32 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate byte scalar to be vector
-instruct Repl4B(vecS dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateB src));
-  format %{ "movd $dst,$src\n\t"
-            "punpcklbw $dst,$dst\n\t"
-            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
+// ====================LEGACY REPLICATE=======================================
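+
+// These forms are predicated on !VM_Version::supports_avx512vl() (and, for
+// byte/short, !supports_avx512bw()), so they match only when the EVEX
+// broadcast forms further below are not available.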
replicate16B" %} ins_encode %{ __ movdl($dst$$XMMRegister, $src$$Register); __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); %} ins_pipe( pipe_slow ); %} -instruct Repl16B(vecX dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 16); - match(Set dst (ReplicateB src)); - format %{ "movd $dst,$src\n\t" - "punpcklbw $dst,$dst\n\t" +instruct Repl16B_mem(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "punpcklbw $dst,$mem\n\t" "pshuflw $dst,$dst,0x00\n\t" "punpcklqdq $dst,$dst\t! replicate16B" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ punpcklbw($dst$$XMMRegister, $mem$$Address); __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); %} @@ -2940,7 +2927,7 @@ %} instruct Repl32B(vecY dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 32); + predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); match(Set dst (ReplicateB src)); format %{ "movd $dst,$src\n\t" "punpcklbw $dst,$dst\n\t" @@ -2957,49 +2944,24 @@ ins_pipe( pipe_slow ); %} -instruct Repl64B(vecZ dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 64); - match(Set dst (ReplicateB src)); - format %{ "movd $dst,$src\n\t" - "punpcklbw $dst,$dst\n\t" +instruct Repl32B_mem(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "punpcklbw $dst,$mem\n\t" "pshuflw $dst,$dst,0x00\n\t" "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %} + "vinserti128h $dst,$dst,$dst\t! replicate32B" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ punpcklbw($dst$$XMMRegister, $mem$$Address); __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -// Replicate byte scalar immediate to be vector by loading from const table. -instruct Repl4B_imm(vecS dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateB con)); - format %{ "movdl $dst,[$constantaddress]\t! replicate4B($con)" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1))); - %} - ins_pipe( pipe_slow ); -%} - -instruct Repl8B_imm(vecD dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); - match(Set dst (ReplicateB con)); - format %{ "movq $dst,[$constantaddress]\t! 
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct Repl16B_imm(vecX dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
@@ -3011,7 +2973,7 @@
 %}
 
 instruct Repl32B_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw());
   match(Set dst (ReplicateB con));
   format %{ "movq $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
@@ -3024,45 +2986,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl64B_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB con));
-  format %{ "movq $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate byte scalar zero to be vector
-instruct Repl4B_zero(vecS dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateB zero));
-  format %{ "pxor $dst,$dst\t! replicate4B zero" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8B_zero(vecD dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
-  match(Set dst (ReplicateB zero));
-  format %{ "pxor $dst,$dst\t! replicate8B zero" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 instruct Repl16B_zero(vecX dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && UseAVX < 3);
   match(Set dst (ReplicateB zero));
   format %{ "pxor $dst,$dst\t! replicate16B zero" %}
   ins_encode %{
@@ -3072,7 +2997,7 @@
 %}
 
 instruct Repl32B_zero(vecY dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 32);
+  predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && UseAVX < 3);
   match(Set dst (ReplicateB zero));
   format %{ "vpxor $dst,$dst,$dst\t! replicate32B zero" %}
   ins_encode %{
@@ -3083,45 +3008,8 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl64B_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 64);
-  match(Set dst (ReplicateB zero));
-  format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate char/short (2 byte) scalar to be vector
-instruct Repl2S(vecS dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateS src));
-  format %{ "movd $dst,$src\n\t"
-            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
replicate2S" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4S(vecD dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateS src)); - format %{ "movd $dst,$src\n\t" - "pshuflw $dst,$dst,0x00\t! replicate4S" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - %} - ins_pipe( fpu_reg_reg ); -%} - instruct Repl8S(vecX dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); match(Set dst (ReplicateS src)); format %{ "movd $dst,$src\n\t" "pshuflw $dst,$dst,0x00\n\t" @@ -3135,7 +3023,7 @@ %} instruct Repl16S(vecY dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 16); + predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); match(Set dst (ReplicateS src)); format %{ "movd $dst,$src\n\t" "pshuflw $dst,$dst,0x00\n\t" @@ -3150,47 +3038,8 @@ ins_pipe( pipe_slow ); %} -instruct Repl32S(vecZ dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 32); - match(Set dst (ReplicateS src)); - format %{ "movd $dst,$src\n\t" - "pshuflw $dst,$dst,0x00\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table. -instruct Repl2S_imm(vecS dst, immI con) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateS con)); - format %{ "movdl $dst,[$constantaddress]\t! replicate2S($con)" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2))); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4S_imm(vecD dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateS con)); - format %{ "movq $dst,[$constantaddress]\t! replicate4S($con)" %} - ins_encode %{ - __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); - %} - ins_pipe( fpu_reg_reg ); -%} - instruct Repl8S_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw()); match(Set dst (ReplicateS con)); format %{ "movq $dst,[$constantaddress]\n\t" "punpcklqdq $dst,$dst\t! 
@@ -3202,7 +3051,7 @@
 %}
 
 instruct Repl16S_imm(vecY dst, immI con) %{
-  predicate(n->as_Vector()->length() == 16);
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl() && !VM_Version::supports_avx512bw());
   match(Set dst (ReplicateS con));
   format %{ "movq $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
@@ -3215,45 +3064,8 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct Repl32S_imm(vecZ dst, immI con) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS con));
-  format %{ "movq $dst,[$constantaddress]\n\t"
-            "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
-            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
-    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate char/short (2 byte) scalar zero to be vector
-instruct Repl2S_zero(vecS dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateS zero));
-  format %{ "pxor $dst,$dst\t! replicate2S zero" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4S_zero(vecD dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateS zero));
-  format %{ "pxor $dst,$dst\t! replicate4S zero" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 instruct Repl8S_zero(vecX dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 8);
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
   match(Set dst (ReplicateS zero));
   format %{ "pxor $dst,$dst\t! replicate8S zero" %}
   ins_encode %{
@@ -3263,7 +3075,7 @@
 %}
 
 instruct Repl16S_zero(vecY dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 16);
+  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && UseAVX < 3);
   match(Set dst (ReplicateS zero));
   format %{ "vpxor $dst,$dst,$dst\t! replicate16S zero" %}
   ins_encode %{
@@ -3274,45 +3086,30 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct Repl32S_zero(vecZ dst, immI0 zero) %{
-  predicate(n->as_Vector()->length() == 32);
-  match(Set dst (ReplicateS zero));
-  format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %}
-  ins_encode %{
-    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
-    int vector_len = 2;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate integer (4 byte) scalar to be vector
-instruct Repl2I(vecD dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 2);
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI src));
   format %{ "movd $dst,$src\n\t"
-            "pshufd $dst,$dst,0x00\t! replicate2I" %}
+            "pshufd $dst,$dst,0x00\t! replicate4I" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
   %}
-  ins_pipe( fpu_reg_reg );
+  ins_pipe( pipe_slow );
 %}
 
-instruct Repl4I(vecX dst, rRegI src) %{
-  predicate(n->as_Vector()->length() == 4);
-  match(Set dst (ReplicateI src));
-  format %{ "movd $dst,$src\n\t"
-            "pshufd $dst,$dst,0x00\t! replicate4I" %}
replicate4I" %} +instruct Repl4I_mem(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "pshufd $dst,$mem,0x00\t! replicate4I" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00); %} ins_pipe( pipe_slow ); %} instruct Repl8I(vecY dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl()); match(Set dst (ReplicateI src)); format %{ "movd $dst,$src\n\t" "pshufd $dst,$dst,0x00\n\t" @@ -3325,35 +3122,20 @@ ins_pipe( pipe_slow ); %} -instruct Repl16I(vecZ dst, rRegI src) %{ - predicate(n->as_Vector()->length() == 16); - match(Set dst (ReplicateI src)); - format %{ "movd $dst,$src\n\t" - "pshufd $dst,$dst,0x00\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %} +instruct Repl8I_mem(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "pshufd $dst,$mem,0x00\n\t" + "vinserti128h $dst,$dst,$dst\t! replicate8I" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00); __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); %} ins_pipe( pipe_slow ); %} -// Replicate integer (4 byte) scalar immediate to be vector by loading from const table. -instruct Repl2I_imm(vecD dst, immI con) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateI con)); - format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %} - ins_encode %{ - __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); - %} - ins_pipe( fpu_reg_reg ); -%} - instruct Repl4I_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); + predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl()); match(Set dst (ReplicateI con)); format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t" "punpcklqdq $dst,$dst" %} @@ -3365,7 +3147,7 @@ %} instruct Repl8I_imm(vecY dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl()); match(Set dst (ReplicateI con)); format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t" "punpcklqdq $dst,$dst\n\t" @@ -3378,15 +3160,101 @@ ins_pipe( pipe_slow ); %} -instruct Repl16I_imm(vecZ dst, immI con) %{ - predicate(n->as_Vector()->length() == 16); - match(Set dst (ReplicateI con)); - format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\n\t" - "vinserti64x4h $dst k0,$dst,$dst" %} - ins_encode %{ - __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); +instruct Repl4I_zero(vecX dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && UseAVX < 3); + match(Set dst (ReplicateI zero)); + format %{ "pxor $dst,$dst\t! 
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8I_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
+  match(Set dst (ReplicateI zero));
+  format %{ "vpxor $dst,$dst,$dst\t! replicate8I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  format %{ "movdq $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl $dst,$src.lo\n\t"
+            "movdl $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL con));
+  format %{ "movq $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_mem(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
@@ -3394,96 +3262,819 @@
   ins_pipe( pipe_slow );
 %}
 
-// Integer could be loaded into xmm register directly from memory.
-instruct Repl2I_mem(vecD dst, memory mem) %{
-  predicate(n->as_Vector()->length() == 2);
-  match(Set dst (ReplicateI (LoadI mem)));
-  format %{ "movd $dst,$mem\n\t"
-            "pshufd $dst,$dst,0x00\t! replicate2I" %}
+instruct Repl8F(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
   ins_encode %{
-    __ movdl($dst$$XMMRegister, $mem$$Address);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
   %}
-  ins_pipe( fpu_reg_reg );
+  ins_pipe( pipe_slow );
 %}
 
-instruct Repl4I_mem(vecX dst, memory mem) %{
+instruct Repl8F_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd $dst,$mem,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+  match(Set dst (ReplicateD (LoadD mem)));
+  format %{ "pshufd $dst,$mem,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ====================GENERIC REPLICATE==========================================
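+
+// The forms below are predicated only on vector length, so they apply to
+// the SSE, AVX and EVEX configurations alike.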
replicate4B" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $mem$$Address); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + __ movdl($dst$$XMMRegister, $src$$Register); + __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); %} ins_pipe( pipe_slow ); %} -instruct Repl8I_mem(vecY dst, memory mem) %{ +instruct Repl4B_mem(vecS dst, memory mem) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "punpcklbw $dst,$mem\n\t" + "pshuflw $dst,$dst,0x00\t! replicate4B" %} + ins_encode %{ + __ punpcklbw($dst$$XMMRegister, $mem$$Address); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8B(vecD dst, rRegI src) %{ predicate(n->as_Vector()->length() == 8); - match(Set dst (ReplicateI (LoadI mem))); - format %{ "movd $dst,$mem\n\t" - "pshufd $dst,$dst,0x00\n\t" - "vinserti128h $dst,$dst,$dst\t! replicate8I" %} + match(Set dst (ReplicateB src)); + format %{ "movd $dst,$src\n\t" + "punpcklbw $dst,$dst\n\t" + "pshuflw $dst,$dst,0x00\t! replicate8B" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $mem$$Address); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ movdl($dst$$XMMRegister, $src$$Register); + __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); %} ins_pipe( pipe_slow ); %} -instruct Repl16I_mem(vecZ dst, memory mem) %{ - predicate(n->as_Vector()->length() == 16); - match(Set dst (ReplicateI (LoadI mem))); - format %{ "movd $dst,$mem\n\t" - "pshufd $dst,$dst,0x00\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %} +instruct Repl8B_mem(vecD dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "punpcklbw $dst,$mem\n\t" + "pshuflw $dst,$dst,0x00\t! replicate8B" %} ins_encode %{ - __ movdl($dst$$XMMRegister, $mem$$Address); - __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ punpcklbw($dst$$XMMRegister, $mem$$Address); + __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00); %} ins_pipe( pipe_slow ); %} -// Replicate integer (4 byte) scalar zero to be vector -instruct Repl2I_zero(vecD dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateI zero)); - format %{ "pxor $dst,$dst\t! replicate2I" %} +// Replicate byte scalar immediate to be vector by loading from const table. +instruct Repl4B_imm(vecS dst, immI con) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateB con)); + format %{ "movdl $dst,[$constantaddress]\t! replicate4B($con)" %} ins_encode %{ - __ pxor($dst$$XMMRegister, $dst$$XMMRegister); + __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1))); %} - ins_pipe( fpu_reg_reg ); + ins_pipe( pipe_slow ); %} -instruct Repl4I_zero(vecX dst, immI0 zero) %{ +instruct Repl8B_imm(vecD dst, immI con) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateB con)); + format %{ "movq $dst,[$constantaddress]\t! 
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar zero to be vector
+instruct Repl4B_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor $dst,$dst\t! replicate4B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8B_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor $dst,$dst\t! replicate8B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char/short (2 byte) scalar to be vector
+instruct Repl2S(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS src));
+  format %{ "movd $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  format %{ "movd $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS (LoadS mem)));
+  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2S_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS con));
+  format %{ "movdl $dst,[$constantaddress]\t! replicate2S($con)" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  format %{ "movq $dst,[$constantaddress]\t! replicate4S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char/short (2 byte) scalar zero to be vector
+instruct Repl2S_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor $dst,$dst\t! replicate2S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor $dst,$dst\t! replicate4S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar to be vector
+instruct Repl2I(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  format %{ "movd $dst,$src\n\t"
+            "pshufd $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Integer could be loaded into xmm register directly from memory.
+instruct Repl2I_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "movd $dst,$mem\n\t"
+            "pshufd $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar zero to be vector
+instruct Repl2I_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor $dst,$dst\t! replicate2I" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl2L(vecX dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl $dst,$src.lo\n\t"
+            "movdl $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L"%}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2L_imm(vecX dst, immL con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movq $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar zero to be vector
+instruct Repl2L_zero(vecX dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL zero));
+  format %{ "pxor $dst,$dst\t! replicate2L zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4L_zero(vecY dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL zero));
+  format %{ "vpxor $dst,$dst,$dst\t! replicate4L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate float (4 byte) scalar to be vector
+instruct Repl2F(vecD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd $dst,$dst,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl2F_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd $dst,$mem,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F(vecX dst, regF src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd $dst,$dst,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF (LoadF mem)));
+  format %{ "pshufd $dst,$mem,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate float (4 byte) scalar zero to be vector
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate double (8 bytes) scalar to be vector
+instruct Repl2D(vecX dst, regD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd $dst,$src,0x44\t! replicate2D" %}
replicate2D" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl2D_mem(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateD (LoadD mem))); + format %{ "pshufd $dst,$mem,0x44\t! replicate2D" %} + ins_encode %{ + __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44); + %} + ins_pipe( pipe_slow ); +%} + +// Replicate double (8 byte) scalar zero to be vector +instruct Repl2D_zero(vecX dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateD zero)); + format %{ "xorpd $dst,$dst\t! replicate2D zero" %} + ins_encode %{ + __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4D_zero(vecY dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateD zero)); + format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} + ins_encode %{ + int vector_len = 1; + __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +// ====================EVEX REPLICATE============================================= + +// Note: some of the legacy forms are applicable to EVEX + +instruct Repl16B_evex(vecX dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB src)); + format %{ "vpbroadcastb $dst,$src\t! replicate16B" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16B_mem_evex(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "vpbroadcastb $dst,$mem\t! replicate16B" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32B_evex(vecY dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB src)); + format %{ "vpbroadcastb $dst,$src\t! replicate32B" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32B_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "vpbroadcastb $dst,$mem\t! replicate32B" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl64B_evex(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 64 && UseAVX > 2); + match(Set dst (ReplicateB src)); + format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl64B_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB (LoadB mem))); + format %{ "vpbroadcastb $dst,$mem\t! 
replicate64B" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16B_imm_evex(vecX dst, immI con) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastb $dst,$dst\t! replicate16B" %} + ins_encode %{ + int vector_len = 0; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); + __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32B_imm_evex(vecY dst, immI con) %{ + predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateB con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastb $dst,$dst\t! replicate32B" %} + ins_encode %{ + int vector_len = 1; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); + __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl64B_imm_evex(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 64 && UseAVX > 2); + match(Set dst (ReplicateB con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastb $dst,$dst\t! upper replicate64B" %} + ins_encode %{ + int vector_len = 2; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); + __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); + ins_pipe( pipe_slow ); +%} + +instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 64 && UseAVX > 2); + match(Set dst (ReplicateB zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it). + int vector_len = 2; + __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl8S_evex(vecX dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS src)); + format %{ "vpbroadcastw $dst,$src\t! replicate8S" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8S_mem_evex(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS (LoadS mem))); + format %{ "vpbroadcastw $dst,$mem\t! replicate8S" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16S_evex(vecY dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS src)); + format %{ "vpbroadcastw $dst,$src\t! 
replicate16S" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16S_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS (LoadS mem))); + format %{ "vpbroadcastw $dst,$mem\t! replicate16S" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32S_evex(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + match(Set dst (ReplicateS src)); + format %{ "vpbroadcastw $dst,$src\t! replicate32S" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32S_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + match(Set dst (ReplicateS (LoadS mem))); + format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8S_imm_evex(vecX dst, immI con) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastw $dst,$dst\t! replicate8S" %} + ins_encode %{ + int vector_len = 0; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); + __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16S_imm_evex(vecY dst, immI con) %{ + predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vl() && VM_Version::supports_avx512bw()); + match(Set dst (ReplicateS con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastw $dst,$dst\t! replicate16S" %} + ins_encode %{ + int vector_len = 1; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); + __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32S_imm_evex(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + match(Set dst (ReplicateS con)); + format %{ "movq $dst,[$constantaddress]\n\t" + "vpbroadcastw $dst,$dst\t! replicate32S" %} + ins_encode %{ + int vector_len = 2; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); + __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 32 && UseAVX > 2); + match(Set dst (ReplicateS zero)); + format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %} + ins_encode %{ + // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it). + int vector_len = 2; + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( fpu_reg_reg ); +%} + +instruct Repl4I_evex(vecX dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI src)); + format %{ "vpbroadcastd $dst,$src\t! 
replicate4I" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl4I_mem_evex(vecX dst, memory mem) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "vpbroadcastd $dst,$mem\t! replicate4I" %} + ins_encode %{ + int vector_len = 0; + __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8I_evex(vecY dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI src)); + format %{ "vpbroadcastd $dst,$src\t! replicate8I" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8I_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "vpbroadcastd $dst,$mem\t! replicate8I" %} + ins_encode %{ + int vector_len = 1; + __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16I_evex(vecZ dst, rRegI src) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); + match(Set dst (ReplicateI src)); + format %{ "vpbroadcastd $dst,$src\t! replicate16I" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16I_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); + match(Set dst (ReplicateI (LoadI mem))); + format %{ "vpbroadcastd $dst,$mem\t! replicate16I" %} + ins_encode %{ + int vector_len = 2; + __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl4I_imm_evex(vecX dst, immI con) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI con)); + format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t" + "vpbroadcastd $dst,$dst\t! replicate4I" %} + ins_encode %{ + int vector_len = 0; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); + __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl8I_imm_evex(vecY dst, immI con) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateI con)); + format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t" + "vpbroadcastd $dst,$dst\t! replicate8I" %} + ins_encode %{ + int vector_len = 1; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); + __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16I_imm_evex(vecZ dst, immI con) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); + match(Set dst (ReplicateI con)); + format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t" + "vpbroadcastd $dst,$dst\t! 
replicate16I" %} + ins_encode %{ + int vector_len = 2; + __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); + __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); match(Set dst (ReplicateI zero)); format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %} ins_encode %{ @@ -3496,216 +4087,115 @@ // Replicate long (8 byte) scalar to be vector #ifdef _LP64 -instruct Repl2L(vecX dst, rRegL src) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateL src)); - format %{ "movdq $dst,$src\n\t" - "punpcklqdq $dst,$dst\t! replicate2L" %} - ins_encode %{ - __ movdq($dst$$XMMRegister, $src$$Register); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct Repl4L(vecY dst, rRegL src) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl4L_evex(vecY dst, rRegL src) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateL src)); - format %{ "movdq $dst,$src\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! replicate4L" %} + format %{ "vpbroadcastq $dst,$src\t! replicate4L" %} ins_encode %{ - __ movdq($dst$$XMMRegister, $src$$Register); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 1; + __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8L(vecZ dst, rRegL src) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8L_evex(vecZ dst, rRegL src) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateL src)); - format %{ "movdq $dst,$src\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %} + format %{ "vpbroadcastq $dst,$src\t! replicate8L" %} ins_encode %{ - __ movdq($dst$$XMMRegister, $src$$Register); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 2; + __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len); %} ins_pipe( pipe_slow ); %} #else // _LP64 -instruct Repl2L(vecX dst, eRegL src, regD tmp) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateL src)); - effect(TEMP dst, USE src, TEMP tmp); - format %{ "movdl $dst,$src.lo\n\t" - "movdl $tmp,$src.hi\n\t" - "punpckldq $dst,$tmp\n\t" - "punpcklqdq $dst,$dst\t! replicate2L"%} - ins_encode %{ - __ movdl($dst$$XMMRegister, $src$$Register); - __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); - __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct Repl4L(vecY dst, eRegL src, regD tmp) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateL src)); effect(TEMP dst, USE src, TEMP tmp); format %{ "movdl $dst,$src.lo\n\t" "movdl $tmp,$src.hi\n\t" "punpckldq $dst,$tmp\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! replicate4L" %} + "vpbroadcastq $dst,$dst\t! 
replicate4L" %} ins_encode %{ + int vector_len = 1; __ movdl($dst$$XMMRegister, $src$$Register); __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateL src)); effect(TEMP dst, USE src, TEMP tmp); format %{ "movdl $dst,$src.lo\n\t" "movdl $tmp,$src.hi\n\t" "punpckldq $dst,$tmp\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %} + "vpbroadcastq $dst,$dst\t! replicate8L" %} ins_encode %{ + int vector_len = 2; __ movdl($dst$$XMMRegister, $src$$Register); __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} #endif // _LP64 -// Replicate long (8 byte) scalar immediate to be vector by loading from const table. -instruct Repl2L_imm(vecX dst, immL con) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateL con)); - format %{ "movq $dst,[$constantaddress]\n\t" - "punpcklqdq $dst,$dst\t! replicate2L($con)" %} - ins_encode %{ - __ movq($dst$$XMMRegister, $constantaddress($con)); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct Repl4L_imm(vecY dst, immL con) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl4L_imm_evex(vecY dst, immL con) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateL con)); format %{ "movq $dst,[$constantaddress]\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %} + "vpbroadcastq $dst,$dst\t! replicate4L" %} ins_encode %{ + int vector_len = 1; __ movq($dst$$XMMRegister, $constantaddress($con)); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8L_imm(vecZ dst, immL con) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8L_imm_evex(vecZ dst, immL con) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateL con)); format %{ "movq $dst,[$constantaddress]\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %} + "vpbroadcastq $dst,$dst\t! 
replicate8L" %} ins_encode %{ + int vector_len = 2; __ movq($dst$$XMMRegister, $constantaddress($con)); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -// Long could be loaded into xmm register directly from memory. -instruct Repl2L_mem(vecX dst, memory mem) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateL (LoadL mem))); - format %{ "movq $dst,$mem\n\t" - "punpcklqdq $dst,$dst\t! replicate2L" %} - ins_encode %{ - __ movq($dst$$XMMRegister, $mem$$Address); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); + __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl4L_mem(vecY dst, memory mem) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl4L_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateL (LoadL mem))); - format %{ "movq $dst,$mem\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! replicate4L" %} + format %{ "vpbroadcastd $dst,$mem\t! replicate4L" %} ins_encode %{ - __ movq($dst$$XMMRegister, $mem$$Address); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 1; + __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8L_mem(vecZ dst, memory mem) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8L_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateL (LoadL mem))); - format %{ "movq $dst,$mem\n\t" - "punpcklqdq $dst,$dst\n\t" - "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t" - "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %} + format %{ "vpbroadcastd $dst,$mem\t! replicate8L" %} ins_encode %{ - __ movq($dst$$XMMRegister, $mem$$Address); - __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 2; + __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -// Replicate long (8 byte) scalar zero to be vector -instruct Repl2L_zero(vecX dst, immL0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateL zero)); - format %{ "pxor $dst,$dst\t! replicate2L zero" %} - ins_encode %{ - __ pxor($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4L_zero(vecY dst, immL0 zero) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateL zero)); - format %{ "vpxor $dst,$dst,$dst\t! replicate4L zero" %} - ins_encode %{ - // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it). - int vector_len = 1; - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl8L_zero(vecZ dst, immL0 zero) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateL zero)); format %{ "vpxor $dst k0,$dst,$dst\t! 
replicate8L zero" %} ins_encode %{ @@ -3716,87 +4206,52 @@ ins_pipe( fpu_reg_reg ); %} -// Replicate float (4 byte) scalar to be vector -instruct Repl2F(vecD dst, regF src) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateF src)); - format %{ "pshufd $dst,$dst,0x00\t! replicate2F" %} - ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4F(vecX dst, regF src) %{ - predicate(n->as_Vector()->length() == 4); +instruct Repl8F_evex(vecY dst, regF src) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateF src)); - format %{ "pshufd $dst,$dst,0x00\t! replicate4F" %} + format %{ "vbroadcastss $dst,$src\t! replicate8F" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); + int vector_len = 1; + __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8F(vecY dst, regF src) %{ - predicate(n->as_Vector()->length() == 8); - match(Set dst (ReplicateF src)); - format %{ "pshufd $dst,$src,0x00\n\t" - "vinsertf128h $dst,$dst,$dst\t! replicate8F" %} +instruct Repl8F_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateF (LoadF mem))); + format %{ "vbroadcastss $dst,$mem\t! replicate8F" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); - __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 1; + __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl16F(vecZ dst, regF src) %{ - predicate(n->as_Vector()->length() == 16); +instruct Repl16F_evex(vecZ dst, regF src) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); match(Set dst (ReplicateF src)); - format %{ "pshufd $dst,$src,0x00\n\t" - "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t" - "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %} + format %{ "vbroadcastss $dst,$src\t! replicate16F" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); - __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 2; + __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -// Replicate float (4 byte) scalar zero to be vector -instruct Repl2F_zero(vecD dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateF zero)); - format %{ "xorps $dst,$dst\t! replicate2F zero" %} - ins_encode %{ - __ xorps($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4F_zero(vecX dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateF zero)); - format %{ "xorps $dst,$dst\t! replicate4F zero" %} - ins_encode %{ - __ xorps($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl8F_zero(vecY dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 8); - match(Set dst (ReplicateF zero)); - format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %} +instruct Repl16F_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); + match(Set dst (ReplicateF (LoadF mem))); + format %{ "vbroadcastss $dst,$mem\t! 
replicate16F" %} ins_encode %{ - int vector_len = 1; - __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + int vector_len = 2; + __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len); %} - ins_pipe( fpu_reg_reg ); + ins_pipe( pipe_slow ); %} -instruct Repl16F_zero(vecZ dst, immF0 zero) %{ - predicate(n->as_Vector()->length() == 16); +instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{ + predicate(n->as_Vector()->length() == 16 && UseAVX > 2); match(Set dst (ReplicateF zero)); format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %} ins_encode %{ @@ -3806,67 +4261,52 @@ ins_pipe( fpu_reg_reg ); %} -// Replicate double (8 bytes) scalar to be vector -instruct Repl2D(vecX dst, regD src) %{ - predicate(n->as_Vector()->length() == 2); +instruct Repl4D_evex(vecY dst, regD src) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); match(Set dst (ReplicateD src)); - format %{ "pshufd $dst,$src,0x44\t! replicate2D" %} + format %{ "vbroadcastsd $dst,$src\t! replicate4D" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); + int vector_len = 1; + __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl4D(vecY dst, regD src) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateD src)); - format %{ "pshufd $dst,$src,0x44\n\t" - "vinsertf128h $dst,$dst,$dst\t! replicate4D" %} +instruct Repl4D_mem_evex(vecY dst, memory mem) %{ + predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl()); + match(Set dst (ReplicateD (LoadD mem))); + format %{ "vbroadcastsd $dst,$mem\t! replicate4D" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); - __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 1; + __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -instruct Repl8D(vecZ dst, regD src) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8D_evex(vecZ dst, regD src) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateD src)); - format %{ "pshufd $dst,$src,0x44\n\t" - "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t" - "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %} + format %{ "vbroadcastsd $dst,$src\t! replicate8D" %} ins_encode %{ - __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); - __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); - __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); + int vector_len = 2; + __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -// Replicate double (8 byte) scalar zero to be vector -instruct Repl2D_zero(vecX dst, immD0 zero) %{ - predicate(n->as_Vector()->length() == 2); - match(Set dst (ReplicateD zero)); - format %{ "xorpd $dst,$dst\t! replicate2D zero" %} - ins_encode %{ - __ xorpd($dst$$XMMRegister, $dst$$XMMRegister); - %} - ins_pipe( fpu_reg_reg ); -%} - -instruct Repl4D_zero(vecY dst, immD0 zero) %{ - predicate(n->as_Vector()->length() == 4); - match(Set dst (ReplicateD zero)); - format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %} +instruct Repl8D_mem_evex(vecZ dst, memory mem) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); + match(Set dst (ReplicateD (LoadD mem))); + format %{ "vbroadcastsd $dst,$mem\t! 
replicate8D" %} ins_encode %{ - int vector_len = 1; - __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len); + int vector_len = 2; + __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len); %} - ins_pipe( fpu_reg_reg ); + ins_pipe( pipe_slow ); %} -instruct Repl8D_zero(vecZ dst, immD0 zero) %{ - predicate(n->as_Vector()->length() == 8); +instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{ + predicate(n->as_Vector()->length() == 8 && UseAVX > 2); match(Set dst (ReplicateD zero)); format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %} ins_encode %{ @@ -4963,11 +5403,22 @@ instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{ predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (AddVB src1 src2)); - format %{ "vpaddb $dst,$src1,$src2\t! add packed4B" %} + match(Set dst (AddVB src1 src2)); + format %{ "vpaddb $dst,$src1,$src2\t! add packed4B" %} + ins_encode %{ + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVB src (LoadVector mem))); + format %{ "vpaddb $dst,$src,$mem\t! add packed4B" %} ins_encode %{ int vector_len = 0; - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -4993,6 +5444,17 @@ ins_pipe( pipe_slow ); %} +instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AddVB src (LoadVector mem))); + format %{ "vpaddb $dst,$src,$mem\t! add packed8B" %} + ins_encode %{ + int vector_len = 0; + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vadd16B(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 16); match(Set dst (AddVB dst src)); @@ -5091,6 +5553,17 @@ ins_pipe( pipe_slow ); %} +instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVS src (LoadVector mem))); + format %{ "vpaddw $dst,$src,$mem\t! add packed2S" %} + ins_encode %{ + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vadd4S(vecD dst, vecD src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVS dst src)); @@ -5112,6 +5585,17 @@ ins_pipe( pipe_slow ); %} +instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AddVS src (LoadVector mem))); + format %{ "vpaddw $dst,$src,$mem\t! add packed4S" %} + ins_encode %{ + int vector_len = 0; + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vadd8S(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (AddVS dst src)); @@ -5210,6 +5694,17 @@ ins_pipe( pipe_slow ); %} +instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVI src (LoadVector mem))); + format %{ "vpaddd $dst,$src,$mem\t! 
add packed2I" %} + ins_encode %{ + int vector_len = 0; + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vadd4I(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVI dst src)); @@ -5385,6 +5880,17 @@ ins_pipe( pipe_slow ); %} +instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (AddVF src (LoadVector mem))); + format %{ "vaddps $dst,$src,$mem\t! add packed2F" %} + ins_encode %{ + int vector_len = 0; + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vadd4F(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (AddVF dst src)); @@ -5562,6 +6068,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVB src (LoadVector mem))); + format %{ "vpsubb $dst,$src,$mem\t! sub packed4B" %} + ins_encode %{ + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub8B(vecD dst, vecD src) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (SubVB dst src)); @@ -5583,6 +6100,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (SubVB src (LoadVector mem))); + format %{ "vpsubb $dst,$src,$mem\t! sub packed8B" %} + ins_encode %{ + int vector_len = 0; + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub16B(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 16); match(Set dst (SubVB dst src)); @@ -5681,6 +6209,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVS src (LoadVector mem))); + format %{ "vpsubw $dst,$src,$mem\t! sub packed2S" %} + ins_encode %{ + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub4S(vecD dst, vecD src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (SubVS dst src)); @@ -5702,6 +6241,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (SubVS src (LoadVector mem))); + format %{ "vpsubw $dst,$src,$mem\t! sub packed4S" %} + ins_encode %{ + int vector_len = 0; + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub8S(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (SubVS dst src)); @@ -5800,6 +6350,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVI src (LoadVector mem))); + format %{ "vpsubd $dst,$src,$mem\t! 
sub packed2I" %} + ins_encode %{ + int vector_len = 0; + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub4I(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (SubVI dst src)); @@ -5975,6 +6536,17 @@ ins_pipe( pipe_slow ); %} +instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (SubVF src (LoadVector mem))); + format %{ "vsubps $dst,$src,$mem\t! sub packed2F" %} + ins_encode %{ + int vector_len = 0; + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vsub4F(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (SubVF dst src)); @@ -6152,6 +6724,17 @@ ins_pipe( pipe_slow ); %} +instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVS src (LoadVector mem))); + format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %} + ins_encode %{ + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vmul4S(vecD dst, vecD src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (MulVS dst src)); @@ -6173,6 +6756,17 @@ ins_pipe( pipe_slow ); %} +instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulVS src (LoadVector mem))); + format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %} + ins_encode %{ + int vector_len = 0; + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vmul8S(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 8); match(Set dst (MulVS dst src)); @@ -6271,13 +6865,13 @@ ins_pipe( pipe_slow ); %} -instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq()); - match(Set dst (MulVL src1 src2)); - format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %} +instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVI src (LoadVector mem))); + format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %} ins_encode %{ int vector_len = 0; - __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} @@ -6314,6 +6908,28 @@ ins_pipe( pipe_slow ); %} +instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %} + ins_encode %{ + int vector_len = 0; + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src (LoadVector mem))); + format %{ "vpmullq $dst,$src,$mem\t! 
mul packed2L" %} + ins_encode %{ + int vector_len = 0; + __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{ predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq()); match(Set dst (MulVL src1 src2)); @@ -6336,34 +6952,34 @@ ins_pipe( pipe_slow ); %} -instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (MulVI src1 src2)); - format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %} +instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %} ins_encode %{ - int vector_len = 1; - __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vector_len = 2; + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{ +instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{ predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq()); - match(Set dst (MulVL src1 src2)); - format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %} + match(Set dst (MulVL src (LoadVector mem))); + format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %} ins_encode %{ int vector_len = 2; - __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); %} ins_pipe( pipe_slow ); %} -instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); +instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); match(Set dst (MulVI src1 src2)); - format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %} + format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %} ins_encode %{ - int vector_len = 2; + int vector_len = 1; __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); @@ -6380,13 +6996,13 @@ ins_pipe( pipe_slow ); %} -instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq()); - match(Set dst (MulVL src (LoadVector mem))); - format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %} +instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulVI src1 src2)); + format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %} ins_encode %{ int vector_len = 2; - __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -6424,6 +7040,17 @@ ins_pipe( pipe_slow ); %} +instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulVF src (LoadVector mem))); + format %{ "vmulps $dst,$src,$mem\t! 
mul packed2F" %} + ins_encode %{ + int vector_len = 0; + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vmul4F(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (MulVF dst src)); @@ -6601,6 +7228,17 @@ ins_pipe( pipe_slow ); %} +instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (DivVF src (LoadVector mem))); + format %{ "vdivps $dst,$src,$mem\t! div packed2F" %} + ins_encode %{ + int vector_len = 0; + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vdiv4F(vecX dst, vecX src) %{ predicate(n->as_Vector()->length() == 4); match(Set dst (DivVF dst src)); @@ -7878,6 +8516,17 @@ ins_pipe( pipe_slow ); %} +instruct vand4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! and vectors (4 bytes)" %} + ins_encode %{ + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vand8B(vecD dst, vecD src) %{ predicate(n->as_Vector()->length_in_bytes() == 8); match(Set dst (AndV dst src)); @@ -7899,6 +8548,17 @@ ins_pipe( pipe_slow ); %} +instruct vand8B_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV src (LoadVector mem))); + format %{ "vpand $dst,$src,$mem\t! and vectors (8 bytes)" %} + ins_encode %{ + int vector_len = 0; + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vand16B(vecX dst, vecX src) %{ predicate(n->as_Vector()->length_in_bytes() == 16); match(Set dst (AndV dst src)); @@ -7998,6 +8658,17 @@ ins_pipe( pipe_slow ); %} +instruct vor4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (OrV src (LoadVector mem))); + format %{ "vpor $dst,$src,$mem\t! or vectors (4 bytes)" %} + ins_encode %{ + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vor8B(vecD dst, vecD src) %{ predicate(n->as_Vector()->length_in_bytes() == 8); match(Set dst (OrV dst src)); @@ -8019,6 +8690,17 @@ ins_pipe( pipe_slow ); %} +instruct vor8B_mem(vecD dst, vecD src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (OrV src (LoadVector mem))); + format %{ "vpor $dst,$src,$mem\t! or vectors (8 bytes)" %} + ins_encode %{ + int vector_len = 0; + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vor16B(vecX dst, vecX src) %{ predicate(n->as_Vector()->length_in_bytes() == 16); match(Set dst (OrV dst src)); @@ -8118,6 +8800,17 @@ ins_pipe( pipe_slow ); %} +instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{ + predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4); + match(Set dst (XorV src (LoadVector mem))); + format %{ "vpxor $dst,$src,$mem\t! 
+  format %{ "vpxor $dst,$src,$mem\t! xor vectors (4 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vxor8B(vecD dst, vecD src) %{
   predicate(n->as_Vector()->length_in_bytes() == 8);
   match(Set dst (XorV dst src));
@@ -8138,6 +8831,17 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor $dst,$src,$mem\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
 
 instruct vxor16B(vecX dst, vecX src) %{
   predicate(n->as_Vector()->length_in_bytes() == 16);