--- old/src/hotspot/cpu/x86/assembler_x86.cpp 2020-04-02 18:03:43.216854097 -0700 +++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2020-04-02 18:03:43.044854097 -0700 @@ -984,6 +984,8 @@ case 0x61: // pcmpestri r, r/a, #8 case 0x70: // pshufd r, r/a, #8 case 0x73: // psrldq r, #8 + case 0x1f: // evpcmpd/evpcmpq + case 0x3f: // evpcmpb/evpcmpw tail_size = 1; // the imm8 break; default: @@ -1209,6 +1211,11 @@ emit_int8(imm8); } +void Assembler::addw(Register dst, Register src) { + (void)prefix_and_encode(dst->encoding(), src->encoding()); + emit_arith(0x03, 0xC0, dst, src); +} + void Assembler::addw(Address dst, int imm16) { InstructionMark im(this); emit_int8(0x66); @@ -1415,6 +1422,11 @@ emit_int16((unsigned char)0xDD, (0xC0 | encode)); } +void Assembler::andw(Register dst, Register src) { + (void)prefix_and_encode(dst->encoding(), src->encoding()); + emit_arith(0x23, 0xC0, dst, src); +} + void Assembler::andl(Address dst, int32_t imm32) { InstructionMark im(this); prefix(dst); @@ -1783,6 +1795,13 @@ emit_int16((unsigned char)0xE6, (0xC0 | encode)); } +void Assembler::vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xE6, (0xC0 | encode)); +} + void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -1790,6 +1809,13 @@ emit_int16(0x5B, (0xC0 | encode)); } +void Assembler::vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? 
VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5B, (0xC0 | encode)); +} + void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); @@ -1912,18 +1938,18 @@ } void Assembler::vpabsb(XMMRegister dst, XMMRegister src, int vector_len) { - assert(vector_len == AVX_128bit? VM_Version::supports_avx() : - vector_len == AVX_256bit? VM_Version::supports_avx2() : - vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, ""); + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false, "not supported"); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int16(0x1C, (0xC0 | encode)); } void Assembler::vpabsw(XMMRegister dst, XMMRegister src, int vector_len) { - assert(vector_len == AVX_128bit? VM_Version::supports_avx() : - vector_len == AVX_256bit? VM_Version::supports_avx2() : - vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, ""); + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + vector_len == AVX_512bit ? 
VM_Version::supports_avx512bw() : false, ""); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int16(0x1D, (0xC0 | encode)); @@ -1946,6 +1972,85 @@ emit_int16(0x1F, (0xC0 | encode)); } +void Assembler::vcvtps2pd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5A, (0xC0 | encode)); +} + +void Assembler::vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + attributes.set_rex_vex_w_reverted(); + emit_int16(0x5A, (0xC0 | encode)); +} + +void Assembler::evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5B, (0xC0 | encode)); +} + +void Assembler::evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), 
""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xE6, (0xC0 | encode)); +} + +void Assembler::evpmovwb(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x30, (0xC0 | encode)); +} + +void Assembler::evpmovdw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x33, (0xC0 | encode)); +} + +void Assembler::evpmovdb(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x31, (0xC0 | encode)); +} + +void Assembler::evpmovqd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = 
vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x35, (0xC0 | encode)); +} + +void Assembler::evpmovqb(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x32, (0xC0 | encode)); +} + +void Assembler::evpmovqw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x34, (0xC0 | encode)); +} + void Assembler::decl(Address dst) { // Don't use it directly. Use MacroAssembler::decrement() instead. InstructionMark im(this); @@ -2543,28 +2648,34 @@ } // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64) -void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { +void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } int prefix = (_legacy_mode_bw) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); emit_int16(0x6F, (0xC0 | encode)); } -void Assembler::evmovdqub(XMMRegister dst, Address src, int vector_len) { +void Assembler::evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3; attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } -void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) { +void Assembler::evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); @@ -2572,132 +2683,202 @@ int prefix = (_legacy_mode_bw) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); emit_operand(src, dst); } -void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len) { +void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } -void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) { +void Assembler::evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } int prefix = (_legacy_mode_bw) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } -void Assembler::evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len) { +void Assembler::evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } -void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) { +void Assembler::evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } int prefix = (_legacy_mode_bw) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); emit_operand(src, dst); } -void Assembler::evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len) { +void Assembler::evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); - attributes.reset_is_clear_context(); attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); emit_operand(src, dst); } void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) { + // Unmasked instruction + evmovdqul(dst, k0, src, /*merge*/ false, vector_len); +} + +void Assembler::evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int16(0x6F, (0xC0 | encode)); } void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) { + // Unmasked instruction + 
evmovdqul(dst, k0, src, /*merge*/ false, vector_len); +} + +void Assembler::evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true , /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false , /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } void Assembler::evmovdqul(Address dst, XMMRegister src, int vector_len) { + // Unmasked instruction + evmovdqul(dst, k0, src, /*merge*/ true, vector_len); +} + +void Assembler::evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); - attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); emit_operand(src, dst); } void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { + // Unmasked 
instruction + if (dst->encoding() == src->encoding()) return; + evmovdquq(dst, k0, src, /*merge*/ false, vector_len); +} + +void Assembler::evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int16(0x6F, (0xC0 | encode)); } void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) { + // Unmasked instruction + evmovdquq(dst, k0, src, /*merge*/ false, vector_len); +} + +void Assembler::evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_operand(dst, src); } void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) { + // Unmasked instruction + evmovdquq(dst, k0, src, /*merge*/ true, vector_len); +} + +void Assembler::evmovdquq(Address 
dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); - attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } attributes.set_is_evex_instruction(); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); @@ -2775,6 +2956,29 @@ emit_operand(src, dst); } +void Assembler::movq(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_rex_vex_w_reverted(); + int encode = simd_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xD6, (0xC0 | encode)); +} + +void Assembler::movq(Register dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + // swap src/dst to get correct prefix + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x7E, (0xC0 | encode)); +} + +void Assembler::movq(XMMRegister dst, Register src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ 
false); + int encode = simd_prefix_and_encode(dst, xnoreg, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6E, (0xC0 | encode)); +} + void Assembler::movsbl(Register dst, Address src) { // movsxb InstructionMark im(this); prefix(src, dst); @@ -3274,6 +3478,11 @@ emit_int16((unsigned char)0xF7, (0xD0 | encode)); } +void Assembler::orw(Register dst, Register src) { + (void)prefix_and_encode(dst->encoding(), src->encoding()); + emit_arith(0x0B, 0xC0, dst, src); +} + void Assembler::orl(Address dst, int32_t imm32) { InstructionMark im(this); prefix(dst); @@ -3312,6 +3521,34 @@ emit_int8(imm8); } +void Assembler::packsswb(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x63, (0xC0 | encode)); +} + +void Assembler::vpacksswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "some form of AVX must be enabled"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x63, (0xC0 | encode)); +} + +void Assembler::packssdw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse2(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6B, (0xC0 | encode)); +} + +void Assembler::vpackssdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, 
"some form of AVX must be enabled"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6B, (0xC0 | encode)); +} + void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); @@ -3337,21 +3574,74 @@ emit_int16(0x67, (0xC0 | encode)); } +void Assembler::packusdw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x2B, (0xC0 | encode)); +} + +void Assembler::vpackusdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "some form of AVX must be enabled"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x2B, (0xC0 | encode)); +} + void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { assert(VM_Version::supports_avx2(), ""); + assert(vector_len != AVX_128bit, ""); + // VEX.256.66.0F3A.W1 00 /r ib InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x00, (0xC0 | encode), imm8); } void Assembler::vpermq(XMMRegister dst, XMMRegister nds, XMMRegister 
src, int vector_len) { - assert(UseAVX > 2, "requires AVX512F"); + assert(vector_len == AVX_256bit ? VM_Version::supports_avx512vl() : + vector_len == AVX_512bit ? VM_Version::supports_evex() : false, "not supported"); InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int16(0x36, (0xC0 | encode)); } +void Assembler::vpermb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx512_vbmi(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0x8D, (0xC0 | encode)); +} + +void Assembler::vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx512vlbw() : + vector_len == AVX_256bit ? VM_Version::supports_avx512vlbw() : + vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : false, "not supported"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0x8D, (0xC0 | encode)); +} + +void Assembler::vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len <= AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_evex(), ""); + // VEX.NDS.256.66.0F38.W0 36 /r + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x36, (0xC0 | encode)); +} + +void Assembler::vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex(), ""); + // VEX.NDS.256.66.0F38.W0 36 /r + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x36); + emit_operand(dst, src); +} + void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) { assert(VM_Version::supports_avx2(), ""); InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3366,6 +3656,28 @@ emit_int24(0x06, (0xC0 | encode), imm8); } +void Assembler::vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x04, (0xC0 | encode), imm8); +} + +void Assembler::vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { + assert(vector_len <= AVX_256bit ? 
VM_Version::supports_avx() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ VM_Version::supports_evex(),/* legacy_mode */ false,/* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_rex_vex_w_reverted(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x05, (0xC0 | encode), imm8); +} + +void Assembler::vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len) { + assert(vector_len <= AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x01, (0xC0 | encode), imm8); +} + void Assembler::evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -3374,7 +3686,6 @@ emit_int16(0x76, (0xC0 | encode)); } - void Assembler::pause() { emit_int16((unsigned char)0xF3, (unsigned char)0x90); } @@ -3408,9 +3719,18 @@ emit_int16(0x74, (0xC0 | encode)); } +void Assembler::vpcmpCCbwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len) { + assert(vector_len == AVX_128bit ? 
VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(cond_encoding, (0xC0 | encode)); +} + // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_avx(), ""); + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int16(0x74, (0xC0 | encode)); @@ -3497,7 +3817,7 @@ void Assembler::evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_reg_mask */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.reset_is_clear_context(); attributes.set_embedded_opmask_register_specifier(mask); @@ -3517,7 +3837,8 @@ // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void 
Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_avx(), ""); + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int16(0x75, (0xC0 | encode)); @@ -3554,29 +3875,32 @@ // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_avx(), ""); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + assert(vector_len == AVX_128bit ? 
VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int16(0x76, (0xC0 | encode)); } // In this context, kdst is written the mask used to process the equal components -void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) { +void Assembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int16(0x76, (0xC0 | encode)); } -void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len) { +void Assembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); - 
attributes.reset_is_clear_context(); attributes.set_is_evex_instruction(); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); int dst_enc = kdst->encoding(); vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x76); @@ -3591,6 +3915,13 @@ emit_int16(0x29, (0xC0 | encode)); } +void Assembler::vpcmpCCq(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(cond_encoding, (0xC0 | encode)); +} + // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx(), ""); @@ -3623,6 +3954,31 @@ emit_operand(as_Register(dst_enc), src); } +void Assembler::evpmovd2m(KRegister kdst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(kdst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x39, (0xC0 | encode)); +} + +void Assembler::evpmovq2m(KRegister kdst, XMMRegister src, int vector_len) { + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), ""); + assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* 
no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(kdst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x39, (0xC0 | encode)); +} + +void Assembler::pcmpgtq(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x37, (0xC0 | encode)); +} + void Assembler::pmovmskb(Register dst, XMMRegister src) { assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3639,14 +3995,14 @@ void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x16, (0xC0 | encode), imm8); } void Assembler::pextrd(Address dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); 
emit_int8(0x16); @@ -3656,14 +4012,14 @@ void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x16, (0xC0 | encode), imm8); } void Assembler::pextrq(Address dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit); simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x16); @@ -3673,14 +4029,14 @@ void Assembler::pextrw(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse2(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int24((unsigned char)0xC5, (0xC0 | encode), imm8); } void Assembler::pextrw(Address dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, 
/* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit); simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x15); @@ -3688,9 +4044,16 @@ emit_int8(imm8); } +void Assembler::pextrb(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x14, (0xC0 | encode), imm8); +} + void Assembler::pextrb(Address dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit); simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x14); @@ -3700,14 +4063,14 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x22, (0xC0 | encode), imm8); } void 
Assembler::pinsrd(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x22); @@ -3715,16 +4078,23 @@ emit_int8(imm8); } +void Assembler::vpinsrd(XMMRegister dst, XMMRegister nds, Register src, int imm8) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x22, (0xC0 | encode), imm8); +} + void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x22, (0xC0 | encode), imm8); } void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ 
false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x22); @@ -3732,16 +4102,23 @@ emit_int8(imm8); } +void Assembler::vpinsrq(XMMRegister dst, XMMRegister nds, Register src, int imm8) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x22, (0xC0 | encode), imm8); +} + void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse2(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int24((unsigned char)0xC4, (0xC0 | encode), imm8); } void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse2(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xC4); @@ -3749,9 +4126,16 @@ emit_int8(imm8); } +void Assembler::vpinsrw(XMMRegister dst, XMMRegister nds, Register src, int imm8) { + 
assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC4, (0xC0 | encode), imm8); +} + void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x20); @@ -3759,6 +4143,34 @@ emit_int8(imm8); } +void Assembler::pinsrb(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x20, (0xC0 | encode), imm8); +} + +void Assembler::vpinsrb(XMMRegister dst, XMMRegister nds, Register src, int imm8) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x20, (0xC0 | encode), imm8); +} + +void Assembler::insertps(XMMRegister dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + 
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x21, (0xC0 | encode), imm8); +} + +void Assembler::vinsertps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x21, (0xC0 | encode), imm8); +} + void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); InstructionMark im(this); @@ -3783,6 +4195,41 @@ emit_int16(0x20, (0xC0 | encode)); } +void Assembler::pmovzxdq(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x35, (0xC0 | encode)); +} + +void Assembler::pmovsxbd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x21, (0xC0 | encode)); +} + +void Assembler::pmovzxbd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, 
&attributes); + emit_int16(0x31, (0xC0 | encode)); +} + +void Assembler::pmovsxbq(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x22, (0xC0 | encode)); +} + +void Assembler::pmovsxwd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x23, (0xC0 | encode)); +} + void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); @@ -3816,7 +4263,7 @@ assert(VM_Version::supports_avx512vlbw(), ""); assert(dst != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); @@ -3824,6 +4271,86 @@ emit_int8(0x30); emit_operand(dst, src); } + +void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F.W0 DB /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + 
attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xDB, (0xC0 | encode)); +} + +void Assembler::vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len > AVX_128bit ? VM_Version::supports_avx2() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x35, (0xC0 | encode)); +} + +void Assembler::vpmovzxbd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len > AVX_128bit ? VM_Version::supports_avx2() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x31, (0xC0 | encode)); +} + +void Assembler::vpmovzxbq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len > AVX_128bit ? VM_Version::supports_avx2() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x32, (0xC0 | encode)); +} + +void Assembler::vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : + VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x21, (0xC0 | encode)); +} + +void Assembler::vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x22, (0xC0 | encode)); +} + +void Assembler::vpmovsxwd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x23, (0xC0 | encode)); +} + +void Assembler::vpmovsxwq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x24, (0xC0 | encode)); +} + +void Assembler::vpmovsxdq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? 
VM_Version::supports_avx() : + vector_len == AVX_256bit ? VM_Version::supports_avx2() : + VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x25, (0xC0 | encode)); +} + void Assembler::evpmovwb(Address dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); assert(src != xnoreg, "sanity"); @@ -4050,6 +4577,14 @@ emit_int8(mode & 0xFF); } +void Assembler::pshufhw(XMMRegister dst, XMMRegister src, int mode) { + assert(isByte(mode), "invalid value"); + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes); + emit_int24(0x70, (0xC0 | encode), mode & 0xFF); +} + void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); @@ -4080,6 +4615,35 @@ emit_int24(0x43, (0xC0 | encode), imm8 & 0xFF); } +void Assembler::pshufpd(XMMRegister dst, XMMRegister src, int imm8) { + assert(isByte(imm8), "invalid value"); + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); +} + +void Assembler::vpshufpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { + InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ 
true); + attributes.set_rex_vex_w_reverted(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); +} + +void Assembler::pshufps(XMMRegister dst, XMMRegister src, int imm8) { + assert(isByte(imm8), "invalid value"); + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); +} + +void Assembler::vpshufps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC6, (0xC0 | encode), imm8 & 0xFF); +} + void Assembler::psrldq(XMMRegister dst, int shift) { // Shift left 128 bit value in dst XMMRegister by shift number of bytes. 
NOT_LP64(assert(VM_Version::supports_sse2(), "")); @@ -4151,6 +4715,13 @@ emit_int16(0x17, (0xC0 | encode)); } +void Assembler::vptest(XMMRegister dst, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x17, (0xC0 | encode)); +} + void Assembler::punpcklbw(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); @@ -4819,6 +5390,11 @@ emit_operand(dst, src); } +void Assembler::xorw(Register dst, Register src) { + (void)prefix_and_encode(dst->encoding(), src->encoding()); + emit_arith(0x33, 0xC0, dst, src); +} + // AVX 3-operands scalar float-point arithmetic instructions void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) { @@ -5732,6 +6308,13 @@ emit_int16(0x40, (0xC0 | encode)); } +void Assembler::pmuludq(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse2(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF4, (0xC0 | encode)); +} + void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); @@ -5754,6 +6337,13 @@ emit_int16(0x40, (0xC0 | encode)); } +void Assembler::vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + InstructionAttr 
attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF4, (0xC0 | encode)); +} + void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); @@ -5785,95 +6375,256 @@ emit_operand(dst, src); } -// Shift packed integers left by specified number of bits. -void Assembler::psllw(XMMRegister dst, int shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); +// Min, max +void Assembler::pminsb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); - // XMM6 is for /6 encoding: 66 0F 71 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x71, (0xC0 | encode), shift & 0xFF); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x38, (0xC0 | encode)); } -void Assembler::pslld(XMMRegister dst, int shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - // XMM6 is for /6 encoding: 66 0F 72 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x72, (0xC0 | encode), shift & 0xFF); +void Assembler::vpminsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_avx512bw()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x38, (0xC0 | encode)); } -void Assembler::psllq(XMMRegister dst, int shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - // XMM6 is for /6 encoding: 66 0F 73 /6 ib - int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x73, (0xC0 | encode), shift & 0xFF); +void Assembler::pminsw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse2(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEA, (0xC0 | encode)); } -void Assembler::psllw(XMMRegister dst, XMMRegister shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xF1, (0xC0 | encode)); +void Assembler::vpminsw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_avx512bw()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEA, (0xC0 | encode)); } -void Assembler::pslld(XMMRegister dst, XMMRegister shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); +void Assembler::pminsd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xF2, (0xC0 | encode)); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x39, (0xC0 | encode)); } -void Assembler::psllq(XMMRegister dst, XMMRegister shift) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - attributes.set_rex_vex_w_reverted(); - int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xF3, (0xC0 | encode)); +void Assembler::vpminsd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_evex()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x39, (0xC0 | encode)); } -void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); - // XMM6 is for /6 encoding: 66 0F 71 /6 ib - int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x71, (0xC0 | encode), shift & 0xFF); +void Assembler::vpminsq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x39, (0xC0 | encode)); } -void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); - NOT_LP64(assert(VM_Version::supports_sse2(), "")); +void Assembler::minps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5D, (0xC0 | encode)); +} +void Assembler::vminps(XMMRegister dst, XMMRegister nds, XMMRegister src, int 
vector_len) { + assert(vector_len >= AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - // XMM6 is for /6 encoding: 66 0F 72 /6 ib - int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x72, (0xC0 | encode), shift & 0xFF); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5D, (0xC0 | encode)); } -void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - attributes.set_rex_vex_w_reverted(); - // XMM6 is for /6 encoding: 66 0F 73 /6 ib - int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int24(0x73, (0xC0 | encode), shift & 0xFF); +void Assembler::minpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x5D, (0xC0 | encode)); } - -void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - 
emit_int16((unsigned char)0xF1, (0xC0 | encode)); +void Assembler::vminpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len >= AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x5D, (0xC0 | encode)); } -void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xF2, (0xC0 | encode)); +void Assembler::pmaxsb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x3C, (0xC0 | encode)); } -void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { - assert(UseAVX > 0, "requires some form of AVX"); +void Assembler::vpmaxsb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_avx512bw()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x3C, (0xC0 | encode)); +} + +void Assembler::pmaxsw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse2(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEE, (0xC0 | encode)); +} + +void Assembler::vpmaxsw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_avx512bw()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEE, (0xC0 | encode)); +} + +void Assembler::pmaxsd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x3D, (0xC0 | encode)); +} + +void Assembler::vpmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? 
VM_Version::supports_avx2() : VM_Version::supports_evex()), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x3D, (0xC0 | encode)); +} + +void Assembler::vpmaxsq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x3D, (0xC0 | encode)); +} + +void Assembler::maxps(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5F, (0xC0 | encode)); +} + +void Assembler::vmaxps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len >= AVX_512bit ? 
VM_Version::supports_evex() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16(0x5F, (0xC0 | encode)); +} + +void Assembler::maxpd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); // nds is dst, matching minps/minpd/maxps; xnoreg is only used for unary forms such as vcvtdq2pd + emit_int16(0x5F, (0xC0 | encode)); +} + +void Assembler::vmaxpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len >= AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* vex_w */true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x5F, (0xC0 | encode)); +} + +// Shift packed integers left by specified number of bits. 
+void Assembler::psllw(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM6 is for /6 encoding: 66 0F 71 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x71, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::pslld(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM6 is for /6 encoding: 66 0F 72 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x72, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::psllq(XMMRegister dst, int shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM6 is for /6 encoding: 66 0F 73 /6 ib + int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x73, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::psllw(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF1, (0xC0 | encode)); +} + +void Assembler::pslld(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = 
simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF2, (0xC0 | encode)); +} + +void Assembler::psllq(XMMRegister dst, XMMRegister shift) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_rex_vex_w_reverted(); + int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF3, (0xC0 | encode)); +} + +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM6 is for /6 encoding: 66 0F 71 /6 ib + int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x71, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + // XMM6 is for /6 encoding: 66 0F 72 /6 ib + int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x72, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_rex_vex_w_reverted(); + // XMM6 
is for /6 encoding: 66 0F 73 /6 ib + int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24(0x73, (0xC0 | encode), shift & 0xFF); +} + +void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF1, (0xC0 | encode)); +} + +void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xF2, (0xC0 | encode)); +} + +void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 0, "requires some form of AVX"); InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_rex_vex_w_reverted(); int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -6106,13 +6857,67 @@ emit_int16((unsigned char)0xDB, (0xC0 | encode)); } +//Variable Shift packed integers logically left. 
+void Assembler::vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 1, "requires AVX2"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x47, (0xC0 | encode)); // opcode 0x47; vpsllvq below differs only in vex_w +} + +void Assembler::vpsllvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 1, "requires AVX2"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x47, (0xC0 | encode)); +} + +// Variable shift packed integers logically right. +void Assembler::vpsrlvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 1, "requires AVX2"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x45, (0xC0 | encode)); // opcode 0x45; vpsrlvq below differs only in vex_w +} + +void Assembler::vpsrlvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 1, "requires AVX2"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x45, (0xC0 | encode)); +} + +// Variable shift packed integers arithmetically right. 
+void Assembler::vpsravd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 1, "requires AVX2"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x46, (0xC0 | encode)); +} + +void Assembler::evpsravw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x11, (0xC0 | encode)); +} + +void Assembler::evpsravq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 2, "requires AVX512"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x46, (0xC0 | encode)); +} + void Assembler::vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(VM_Version::supports_avx512_vbmi2(), "requires vbmi2"); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); - emit_int8(0x71); - 
emit_int8((0xC0 | encode)); + emit_int16(0x71, (0xC0 | encode)); } void Assembler::vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { @@ -6138,7 +6943,6 @@ emit_int16((unsigned char)0xDF, (0xC0 | encode)); } - void Assembler::por(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -6171,6 +6975,35 @@ } +void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F.W0 EB /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEB, (0xC0 | encode)); +} + +void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F.W0 EB /r + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xEB); + emit_operand(dst, src); +} + void Assembler::pxor(XMMRegister dst, 
XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -6195,13 +7028,33 @@ emit_operand(dst, src); } +void Assembler::vpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires some form of EVEX"); + InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_rex_vex_w_reverted(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEF, (0xC0 | encode)); +} + +void Assembler::evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F.W0 EF /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0xEF, (0xC0 | encode)); +} + void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), "requires EVEX support"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int8((unsigned char)0xEF); - emit_int8((0xC0 | encode)); + emit_int16((unsigned char)0xEF, 
(0xC0 | encode)); } void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { @@ -6216,7 +7069,6 @@ emit_operand(dst, src); } - // vinserti forms void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) { @@ -6776,12 +7628,67 @@ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int16(0x7C, (0xC0 | encode)); } + +void Assembler::vpgatherdd(XMMRegister dst, Address src, XMMRegister mask, int vector_len) { + assert(VM_Version::supports_avx2(), ""); + assert(vector_len == Assembler::AVX_128bit || vector_len == Assembler::AVX_256bit, ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x90); + emit_operand(dst, src); +} + +void Assembler::vpgatherdq(XMMRegister dst, Address src, XMMRegister mask, int vector_len) { + assert(VM_Version::supports_avx2(), ""); + assert(vector_len == Assembler::AVX_128bit || vector_len == Assembler::AVX_256bit, ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x90); + emit_operand(dst, src); +} + +void Assembler::vgatherdpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len) { + 
assert(VM_Version::supports_avx2(), ""); + assert(vector_len == Assembler::AVX_128bit || vector_len == Assembler::AVX_256bit, ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x92); + emit_operand(dst, src); +} + +void Assembler::vgatherdps(XMMRegister dst, Address src, XMMRegister mask, int vector_len) { + assert(VM_Version::supports_avx2(), ""); + assert(vector_len == Assembler::AVX_128bit || vector_len == Assembler::AVX_256bit, ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ true); + vex_prefix(src, mask->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x92); + emit_operand(dst, src); +} void Assembler::evpgatherdd(XMMRegister dst, KRegister mask, Address src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + assert(mask != k0, "instruction will #UD if mask is in k0"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); - attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit); + 
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); attributes.reset_is_clear_context(); attributes.set_embedded_opmask_register_specifier(mask); attributes.set_is_evex_instruction(); @@ -6790,19 +7697,129 @@ emit_int8((unsigned char)0x90); emit_operand(dst, src); } -// Carry-Less Multiplication Quadword -void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { - assert(VM_Version::supports_clmul(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); - emit_int24(0x44, (0xC0 | encode), (unsigned char)mask); -} -// Carry-Less Multiplication Quadword -void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) { - assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), ""); - InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); +void Assembler::evpgatherdq(XMMRegister dst, KRegister mask, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + 
// swap src<->dst for encoding + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x90); + emit_operand(dst, src); +} + +void Assembler::evgatherdpd(XMMRegister dst, KRegister mask, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + // swap src<->dst for encoding + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x92); + emit_operand(dst, src); +} + +void Assembler::evgatherdps(XMMRegister dst, KRegister mask, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(dst != xnoreg, "sanity"); + assert(src.isxmmindex(),"expected to be xmm index"); + assert(dst != src.xmmindex(), "instruction will #UD if dst and index are the same"); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + // swap src<->dst for encoding + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, 
VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x92); + emit_operand(dst, src); +} + +void Assembler::evpscatterdd(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xA0); + emit_operand(src, dst); +} + +void Assembler::evpscatterdq(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xA0); + emit_operand(src, dst); +} + +void Assembler::evscatterdps(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ 
EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xA2); + emit_operand(src, dst); +} + +void Assembler::evscatterdpd(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(mask != k0, "instruction will #UD if mask is in k0"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xA2); + emit_operand(src, dst); +} +// Carry-Less Multiplication Quadword +void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { + assert(VM_Version::supports_clmul(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x44, (0xC0 | encode), (unsigned char)mask); +} + +// Carry-Less Multiplication Quadword +void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) { + assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); 
emit_int24(0x44, (0xC0 | encode), (unsigned char)mask); } @@ -7387,7 +8404,8 @@ // fourth EVEX.L'L for vector length : 0 is 128, 1 is 256, 2 is 512, currently we do not support 1024 byte4 |= ((_attributes->get_vector_len())& 0x3) << 5; // last is EVEX.z for zero/merge actions - if (_attributes->is_no_reg_mask() == false) { + if (_attributes->is_no_reg_mask() == false && + _attributes->get_embedded_opmask_register_specifier() != 0) { byte4 |= (_attributes->is_clear_context() ? EVEX_Z : 0); } @@ -7555,7 +8573,7 @@ emit_int16(0x5D, (0xC0 | encode)); } -void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) { +void Assembler::vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) { assert(VM_Version::supports_avx(), ""); assert(vector_len <= AVX_256bit, ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); @@ -7563,8 +8581,8 @@ emit_int24((unsigned char)0xC2, (0xC0 | encode), (0xF & cop)); } -void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { - assert(VM_Version::supports_avx(), ""); +void Assembler::vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { + assert(UseAVX > 0 && (vector_len == AVX_128bit || vector_len == AVX_256bit), ""); assert(vector_len <= AVX_256bit, ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); @@ -7572,28 +8590,330 @@ emit_int24(0x4B, (0xC0 | encode), (0xF0 & src2_enc << 4)); } -void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) { - assert(VM_Version::supports_avx(), ""); +void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, 
XMMRegister src, int imm8, int vector_len) { + assert(VM_Version::supports_avx2(), ""); assert(vector_len <= AVX_256bit, ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); - emit_int24((unsigned char)0xC2, (0xC0 | encode), (0xF & cop)); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x02, (0xC0 | encode), (unsigned char)imm8); } -void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { +void Assembler::vcmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int comparison, int vector_len) { assert(VM_Version::supports_avx(), ""); assert(vector_len <= AVX_256bit, ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC2, (0xC0 | encode), (unsigned char)comparison); +} + +void Assembler::evcmpps(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + ComparisonPredicateFP comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.0F.W0 C2 /r ib + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC2, (0xC0 | encode), comparison); +} + +void Assembler::evcmppd(KRegister kdst, 
KRegister mask, XMMRegister nds, XMMRegister src, + ComparisonPredicateFP comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F.W1 C2 /r ib + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int24((unsigned char)0xC2, (0xC0 | encode), comparison); +} + +void Assembler::blendvps(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + assert(UseAVX <= 0, "sse encoding is inconsistent with avx encoding"); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x14, (0xC0 | encode)); +} + +void Assembler::blendvpd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + assert(UseAVX <= 0, "sse encoding is inconsistent with avx encoding"); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x15, (0xC0 | encode)); +} + +void Assembler::pblendvb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + assert(UseAVX <= 0, "sse encoding is inconsistent with avx encoding"); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x10, 
(0xC0 | encode)); +} + +void Assembler::vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { + assert(UseAVX > 0 && (vector_len == AVX_128bit || vector_len == AVX_256bit), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); int src2_enc = src2->encoding(); emit_int24(0x4A, (0xC0 | encode), (0xF0 & src2_enc << 4)); } -void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { - assert(VM_Version::supports_avx2(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); +void Assembler::vblendps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); - emit_int24(0x02, (0xC0 | encode), (unsigned char)imm8); + emit_int24(0x0C, (0xC0 | encode), imm8); +} + +void Assembler::vpcmpgtb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? 
VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x64, (0xC0 | encode)); +} + +void Assembler::vpcmpgtw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x65, (0xC0 | encode)); +} + +void Assembler::vpcmpgtd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x66, (0xC0 | encode)); +} + +void Assembler::vpcmpgtq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? 
VM_Version::supports_avx() : VM_Version::supports_avx2(), ""); + assert(vector_len <= AVX_256bit, "evex encoding is different - has k register as dest"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x37, (0xC0 | encode)); +} + +void Assembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W0 1F /r ib + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x1F, (0xC0 | encode), comparison); +} + +void Assembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W0 1F /r ib + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, 
VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x1F); + emit_operand(as_Register(dst_enc), src); + emit_int8((unsigned char)comparison); +} + +void Assembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W1 1F /r ib + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x1F, (0xC0 | encode), comparison); +} + +void Assembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W1 1F /r ib + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x1F); + emit_operand(as_Register(dst_enc), src); + emit_int8((unsigned char)comparison); +} + +void Assembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { + 
assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W0 3F /r ib + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x3F, (0xC0 | encode), comparison); +} + +void Assembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W0 3F /r ib + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x3F); + emit_operand(as_Register(dst_enc), src); + emit_int8((unsigned char)comparison); +} + +void Assembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: 
EVEX.NDS.XXX.66.0F3A.W1 3F /r ib + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x3F, (0xC0 | encode), comparison); +} + +void Assembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + assert(comparison >= Assembler::eq && comparison <= Assembler::_true, ""); + // Encoding: EVEX.NDS.XXX.66.0F3A.W1 3F /r ib + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.reset_is_clear_context(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x3F); + emit_operand(as_Register(dst_enc), src); + emit_int8((unsigned char)comparison); +} + +void Assembler::vpblendvb(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len) { + assert(VM_Version::supports_avx(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + int mask_enc = mask->encoding(); + emit_int24(0x4C, (0xC0 | encode), 0xF0 & mask_enc << 
4); +} + +void Assembler::evblendmpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F38.W1 65 /r + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x65, (0xC0 | encode)); +} + +void Assembler::evblendmps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + // Encoding: EVEX.NDS.XXX.66.0F38.W0 65 /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x65, (0xC0 | encode)); +} + +void Assembler::evpblendmb (XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + // Encoding: EVEX.NDS.512.66.0F38.W0 66 /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), 
src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x66, (0xC0 | encode)); +} + +void Assembler::evpblendmw (XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_avx512bw(), ""); + // Encoding: EVEX.NDS.512.66.0F38.W1 66 /r + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x66, (0xC0 | encode)); +} + +void Assembler::evpblendmd (XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + //Encoding: EVEX.NDS.512.66.0F38.W0 64 /r + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x64, (0xC0 | encode)); +} + +void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), ""); + //Encoding: EVEX.NDS.512.66.0F38.W1 64 /r + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + 
attributes.reset_is_clear_context(); + } + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16(0x64, (0xC0 | encode)); } void Assembler::shlxl(Register dst, Register src1, Register src2) { --- old/src/hotspot/cpu/x86/assembler_x86.hpp 2020-04-02 18:03:43.880854098 -0700 +++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2020-04-02 18:03:43.716854098 -0700 @@ -620,6 +620,7 @@ #endif }; + // Comparison predicates for integral types & FP types when using SSE enum ComparisonPredicate { eq = 0, lt = 1, @@ -631,6 +632,51 @@ _true = 7 }; + // Comparison predicates for FP types when using AVX + // O means ordered. U is unordered. When using ordered, any NaN comparison is false. Otherwise, it is true. + // S means signaling. Q means non-signaling. When signaling is true, instruction signals #IA on NaN. + enum ComparisonPredicateFP { + EQ_OQ = 0, + LT_OS = 1, + LE_OS = 2, + UNORD_Q = 3, + NEQ_UQ = 4, + NLT_US = 5, + NLE_US = 6, + ORD_Q = 7, + EQ_UQ = 8, + NGE_US = 9, + NGT_US = 0xA, + FALSE_OQ = 0XB, + NEQ_OQ = 0xC, + GE_OS = 0xD, + GT_OS = 0xE, + TRUE_UQ = 0xF, + EQ_OS = 0x10, + LT_OQ = 0x11, + LE_OQ = 0x12, + UNORD_S = 0x13, + NEQ_US = 0x14, + NLT_UQ = 0x15, + NLE_UQ = 0x16, + ORD_S = 0x17, + EQ_US = 0x18, + NGE_UQ = 0x19, + NGT_UQ = 0x1A, + FALSE_OS = 0x1B, + NEQ_OS = 0x1C, + GE_OQ = 0x1D, + GT_OQ = 0x1E, + TRUE_US =0x1F + }; + + enum Width { + B = 0, + W = 1, + D = 2, + Q = 3 + }; + //---< calculate length of instruction >--- // As instruction size can't be found out easily on x86/x64, // we just use '4' for len and maxlen. 
@@ -950,6 +996,7 @@ void adcq(Register dst, Register src); void addb(Address dst, int imm8); + void addw(Register dst, Register src); void addw(Address dst, int imm16); void addl(Address dst, int32_t imm32); @@ -1000,6 +1047,8 @@ void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void andw(Register dst, Register src); + void andl(Address dst, int32_t imm32); void andl(Register dst, int32_t imm32); void andl(Register dst, Address src); @@ -1125,9 +1174,11 @@ // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value void cvtdq2pd(XMMRegister dst, XMMRegister src); + void vcvtdq2pd(XMMRegister dst, XMMRegister src, int vector_len); // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value void cvtdq2ps(XMMRegister dst, XMMRegister src); + void vcvtdq2ps(XMMRegister dst, XMMRegister src, int vector_len); // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value void cvtss2sd(XMMRegister dst, XMMRegister src); @@ -1143,8 +1194,25 @@ void cvttss2sil(Register dst, XMMRegister src); void cvttss2siq(Register dst, XMMRegister src); + // Convert vector double to int void cvttpd2dq(XMMRegister dst, XMMRegister src); + // Convert vector float and double + void vcvtps2pd(XMMRegister dst, XMMRegister src, int vector_len); + void vcvtpd2ps(XMMRegister dst, XMMRegister src, int vector_len); + + // Convert vector long to vector FP + void evcvtqq2ps(XMMRegister dst, XMMRegister src, int vector_len); + void evcvtqq2pd(XMMRegister dst, XMMRegister src, int vector_len); + + // Evex casts with truncation + void evpmovwb(XMMRegister dst, XMMRegister src, int vector_len); + void evpmovdw(XMMRegister dst, XMMRegister src, int vector_len); + void evpmovdb(XMMRegister dst, XMMRegister src, int vector_len); + void evpmovqd(XMMRegister dst, XMMRegister src, int 
vector_len); + void evpmovqb(XMMRegister dst, XMMRegister src, int vector_len); + void evpmovqw(XMMRegister dst, XMMRegister src, int vector_len); + //Abs of packed Integer values void pabsb(XMMRegister dst, XMMRegister src); void pabsw(XMMRegister dst, XMMRegister src); @@ -1504,20 +1572,26 @@ void vmovdqu(XMMRegister dst, XMMRegister src); // Move Unaligned 512bit Vector - void evmovdqub(Address dst, XMMRegister src, int vector_len); - void evmovdqub(XMMRegister dst, Address src, int vector_len); - void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len); - void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len); - void evmovdquw(Address dst, XMMRegister src, int vector_len); - void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len); - void evmovdquw(XMMRegister dst, Address src, int vector_len); - void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len); + void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len); + void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len); + void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len); + void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len); + void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len); + void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len); + void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len); + void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len); void evmovdqul(Address dst, XMMRegister src, int vector_len); void evmovdqul(XMMRegister dst, Address src, int vector_len); void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len); + void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len); + void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len); + void 
evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len); void evmovdquq(Address dst, XMMRegister src, int vector_len); void evmovdquq(XMMRegister dst, Address src, int vector_len); void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len); + void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len); + void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len); + void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len); // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); @@ -1549,6 +1623,9 @@ // Move Quadword void movq(Address dst, XMMRegister src); void movq(XMMRegister dst, Address src); + void movq(XMMRegister dst, XMMRegister src); + void movq(Register dst, XMMRegister src); + void movq(XMMRegister dst, Register src); void movsbl(Register dst, Address src); void movsbl(Register dst, Register src); @@ -1629,6 +1706,8 @@ void btrq(Address dst, int imm8); #endif + void orw(Register dst, Register src); + void orl(Address dst, int32_t imm32); void orl(Register dst, int32_t imm32); void orl(Register dst, Address src); @@ -1642,17 +1721,32 @@ void orq(Register dst, Address src); void orq(Register dst, Register src); + // Pack with signed saturation + void packsswb(XMMRegister dst, XMMRegister src); + void vpacksswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void packssdw(XMMRegister dst, XMMRegister src); + void vpackssdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + // Pack with unsigned saturation void packuswb(XMMRegister dst, XMMRegister src); void packuswb(XMMRegister dst, Address src); + void packusdw(XMMRegister dst, XMMRegister src); void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpackusdw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - // Pemutation of 64bit words 
+ // Permutations void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void vpermq(XMMRegister dst, XMMRegister src, int imm8); void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpermb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpermw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpermd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); + void vpermilps(XMMRegister dst, XMMRegister src, int imm8, int vector_len); + void vpermilpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len); + void vpermpd(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void pause(); @@ -1665,11 +1759,14 @@ void pcmpestri(XMMRegister xmm1, Address src, int imm8); void pcmpeqb(XMMRegister dst, XMMRegister src); + void vpcmpCCbwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len); + void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len); void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len); + void vpcmpgtb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len); void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len); @@ -1682,16 +1779,22 @@ void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqw(KRegister 
kdst, XMMRegister nds, Address src, int vector_len); + void vpcmpgtw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void pcmpeqd(XMMRegister dst, XMMRegister src); void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); - void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len); + void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len); void pcmpeqq(XMMRegister dst, XMMRegister src); + void vpcmpCCq(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, int vector_len); void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len); + void pcmpgtq(XMMRegister dst, XMMRegister src); + void vpcmpgtq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void pmovmskb(Register dst, XMMRegister src); void vpmovmskb(Register dst, XMMRegister src); @@ -1700,6 +1803,7 @@ void pextrq(Register dst, XMMRegister src, int imm8); void pextrd(Address dst, XMMRegister src, int imm8); void pextrq(Address dst, XMMRegister src, int imm8); + void pextrb(Register dst, XMMRegister src, int imm8); void pextrb(Address dst, XMMRegister src, int imm8); // SSE 2 extract void pextrw(Register dst, XMMRegister src, int imm8); @@ -1708,21 +1812,46 @@ // SSE 4.1 insert void pinsrd(XMMRegister dst, Register src, int imm8); void pinsrq(XMMRegister dst, Register src, int imm8); + void pinsrb(XMMRegister dst, Register src, int imm8); void pinsrd(XMMRegister dst, Address src, int imm8); void pinsrq(XMMRegister dst, Address src, int imm8); void pinsrb(XMMRegister dst, Address src, int imm8); + void 
insertps(XMMRegister dst, XMMRegister src, int imm8); // SSE 2 insert void pinsrw(XMMRegister dst, Register src, int imm8); void pinsrw(XMMRegister dst, Address src, int imm8); - // SSE4.1 packed move + // AVX insert + void vpinsrd(XMMRegister dst, XMMRegister nds, Register src, int imm8); + void vpinsrb(XMMRegister dst, XMMRegister nds, Register src, int imm8); + void vpinsrq(XMMRegister dst, XMMRegister nds, Register src, int imm8); + void vpinsrw(XMMRegister dst, XMMRegister nds, Register src, int imm8); + void vinsertps(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); + + // Zero extend moves void pmovzxbw(XMMRegister dst, XMMRegister src); void pmovzxbw(XMMRegister dst, Address src); - + void pmovzxbd(XMMRegister dst, XMMRegister src); void vpmovzxbw( XMMRegister dst, Address src, int vector_len); + void pmovzxdq(XMMRegister dst, XMMRegister src); void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovzxdq(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovzxbd(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovzxbq(XMMRegister dst, XMMRegister src, int vector_len); void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len); + // Sign extend moves + void pmovsxbd(XMMRegister dst, XMMRegister src); + void pmovsxbq(XMMRegister dst, XMMRegister src); + void pmovsxbw(XMMRegister dst, XMMRegister src); + void pmovsxwd(XMMRegister dst, XMMRegister src); + void vpmovsxbd(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovsxbq(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovsxwd(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovsxwq(XMMRegister dst, XMMRegister src, int vector_len); + void vpmovsxdq(XMMRegister dst, XMMRegister src, int vector_len); + void evpmovwb(Address dst, XMMRegister src, int vector_len); void evpmovwb(Address dst, KRegister mask, XMMRegister src, int 
vector_len); @@ -1730,10 +1859,6 @@ void evpmovdb(Address dst, XMMRegister src, int vector_len); - // Sign extend moves - void pmovsxbw(XMMRegister dst, XMMRegister src); - void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len); - // Multiply add void pmaddwd(XMMRegister dst, XMMRegister src); void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1777,10 +1902,17 @@ void pshufd(XMMRegister dst, Address src, int mode); void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len); - // Shuffle Packed Low Words + // Shuffle Packed High/Low Words + void pshufhw(XMMRegister dst, XMMRegister src, int mode); void pshuflw(XMMRegister dst, XMMRegister src, int mode); void pshuflw(XMMRegister dst, Address src, int mode); + //shuffle floats and doubles + void pshufps(XMMRegister, XMMRegister, int); + void pshufpd(XMMRegister, XMMRegister, int); + void vpshufps(XMMRegister, XMMRegister, XMMRegister, int, int); + void vpshufpd(XMMRegister, XMMRegister, XMMRegister, int, int); + // Shuffle packed values at 128 bit granularity void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); @@ -1796,6 +1928,9 @@ void vptest(XMMRegister dst, XMMRegister src); void vptest(XMMRegister dst, Address src); + // Vector compare + void vptest(XMMRegister dst, XMMRegister src, int vector_len); + // Interleave Low Bytes void punpcklbw(XMMRegister dst, XMMRegister src); void punpcklbw(XMMRegister dst, Address src); @@ -1858,6 +1993,7 @@ void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); void pblendw(XMMRegister dst, XMMRegister src, int imm8); + void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len); void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8); void sha1nexte(XMMRegister dst, XMMRegister src); @@ -1976,6 +2112,7 @@ void xorl(Register dst, Register src); void xorb(Register dst, Address src); + void xorw(Register dst, Register 
src); void xorq(Register dst, Address src); void xorq(Register dst, Register src); @@ -2012,6 +2149,8 @@ void shlxq(Register dst, Register src1, Register src2); //====================VECTOR ARITHMETIC===================================== + void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len); + void evpmovq2m(KRegister kdst, XMMRegister src, int vector_len); // Add Packed Floating-Point Values void addpd(XMMRegister dst, XMMRegister src); @@ -2121,13 +2260,41 @@ // Multiply packed integers (only shorts and ints) void pmullw(XMMRegister dst, XMMRegister src); void pmulld(XMMRegister dst, XMMRegister src); + void pmuludq(XMMRegister dst, XMMRegister src); void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + // Minimum of packed integers + void pminsb(XMMRegister dst, XMMRegister src); + void vpminsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void pminsw(XMMRegister dst, XMMRegister src); + void vpminsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void pminsd(XMMRegister dst, XMMRegister src); + void vpminsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void vpminsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void minps(XMMRegister dst, XMMRegister src); + void vminps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void minpd(XMMRegister dst, XMMRegister src); + void vminpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, 
int vector_len); + + // Maximum of packed integers + void pmaxsb(XMMRegister dst, XMMRegister src); + void vpmaxsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void pmaxsw(XMMRegister dst, XMMRegister src); + void vpmaxsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void pmaxsd(XMMRegister dst, XMMRegister src); + void vpmaxsd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void vpmaxsq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void maxps(XMMRegister dst, XMMRegister src); + void vmaxps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void maxpd(XMMRegister dst, XMMRegister src); + void vmaxpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + // Shift left packed integers void psllw(XMMRegister dst, int shift); void pslld(XMMRegister dst, int shift); @@ -2169,9 +2336,22 @@ void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void evpsravw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len); void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + // Variable shift left packed integers + void vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsllvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + + // Variable shift right packed integers + void vpsrlvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpsrlvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + + // Variable shift right arithmetic packed integers + void vpsravd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void 
evpsravq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); @@ -2179,6 +2359,7 @@ void pand(XMMRegister dst, XMMRegister src); void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // Andn packed integers @@ -2191,14 +2372,18 @@ void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len); + // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpxord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); - // vinserti forms void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8); @@ -2257,7 +2442,21 @@ void evpbroadcastd(XMMRegister dst, Register src, int vector_len); void evpbroadcastq(XMMRegister dst, Register src, int vector_len); - void 
evpgatherdd(XMMRegister dst, KRegister k1, Address src, int vector_len); + // Gather AVX2 and AVX3 + void vpgatherdd(XMMRegister dst, Address src, XMMRegister mask, int vector_len); + void vpgatherdq(XMMRegister dst, Address src, XMMRegister mask, int vector_len); + void vgatherdpd(XMMRegister dst, Address src, XMMRegister mask, int vector_len); + void vgatherdps(XMMRegister dst, Address src, XMMRegister mask, int vector_len); + void evpgatherdd(XMMRegister dst, KRegister mask, Address src, int vector_len); + void evpgatherdq(XMMRegister dst, KRegister mask, Address src, int vector_len); + void evgatherdpd(XMMRegister dst, KRegister mask, Address src, int vector_len); + void evgatherdps(XMMRegister dst, KRegister mask, Address src, int vector_len); + + //Scatter AVX3 only + void evpscatterdd(Address dst, KRegister mask, XMMRegister src, int vector_len); + void evpscatterdq(Address dst, KRegister mask, XMMRegister src, int vector_len); + void evscatterdps(Address dst, KRegister mask, XMMRegister src, int vector_len); + void evscatterdpd(Address dst, KRegister mask, XMMRegister src, int vector_len); // Carry-Less Multiplication Quadword void pclmulqdq(XMMRegister dst, XMMRegister src, int mask); @@ -2270,13 +2469,55 @@ // runtime code and native libraries. void vzeroupper(); - // AVX support for vectorized conditional move (float/double). The following two instructions used only coupled. 
- void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); - void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); - void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); - void blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); + // Vector double compares + void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); + void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + ComparisonPredicateFP comparison, int vector_len); + + // Vector float compares + void vcmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int comparison, int vector_len); + void evcmpps(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + ComparisonPredicateFP comparison, int vector_len); + + // Vector integer compares + void vpcmpgtd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len); + void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len); + + // Vector long compares + void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len); + void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len); + + // Vector byte compares + void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len); + void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int vector_len); + + // Vector short compares + void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len); + void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, Address src, + int comparison, int 
vector_len); + + // Vector blends + void blendvps(XMMRegister dst, XMMRegister src); + void blendvpd(XMMRegister dst, XMMRegister src); + void pblendvb(XMMRegister dst, XMMRegister src); + void vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len); + void vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); + void vpblendvb(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len); void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); - + void evblendmpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evblendmps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evpblendmb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evpblendmw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evpblendmd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); + void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len); protected: // Next instructions require address alignment 16 bytes SSE mode. // They should be called only from corresponding MacroAssembler instructions. @@ -2372,7 +2613,8 @@ // Internal encoding data used in compressed immediate offset programming void set_evex_encoding(int value) { _evex_encoding = value; } - // Set the Evex.Z field to be used to clear all non directed XMM/YMM/ZMM components + // When the Evex.Z field is set (true), it is used to clear all non directed XMM/YMM/ZMM components. + // This method unsets it so that merge semantics are used instead. 
void reset_is_clear_context(void) { _is_clear_context = false; } // Map back to current asembler so that we can manage object level assocation --- old/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp 2020-04-02 18:03:44.388854098 -0700 +++ new/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp 2020-04-02 18:03:44.220854098 -0700 @@ -33,6 +33,21 @@ #include "runtime/objectMonitor.hpp" #include "runtime/stubRoutines.hpp" +inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { + switch (vlen_in_bytes) { + case 4: // fall-through + case 8: // fall-through + case 16: return Assembler::AVX_128bit; + case 32: return Assembler::AVX_256bit; + case 64: return Assembler::AVX_512bit; + + default: { + ShouldNotReachHere(); + return Assembler::AVX_NoVec; + } + } +} + void C2_MacroAssembler::setvectmask(Register dst, Register src) { guarantee(PostLoopMultiversioning, "must be"); Assembler::movl(dst, 1); @@ -855,6 +870,167 @@ } } +void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { + assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); + + if (opcode == Op_MinV) { + if (elem_bt == T_BYTE) { + pminsb(dst, src); + } else if (elem_bt == T_SHORT) { + pminsw(dst, src); + } else if (elem_bt == T_INT) { + pminsd(dst, src); + } else { + assert(elem_bt == T_LONG, "required"); + assert(tmp == xmm0, "required"); + movdqu(xmm0, dst); + pcmpgtq(xmm0, src); + blendvpd(dst, src); // xmm0 as mask + } + } else { // opcode == Op_MaxV + if (elem_bt == T_BYTE) { + pmaxsb(dst, src); + } else if (elem_bt == T_SHORT) { + pmaxsw(dst, src); + } else if (elem_bt == T_INT) { + pmaxsd(dst, src); + } else { + assert(elem_bt == T_LONG, "required"); + assert(tmp == xmm0, "required"); + movdqu(xmm0, src); + pcmpgtq(xmm0, dst); + blendvpd(dst, src); // xmm0 as mask + } + } +} + +void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister src1, XMMRegister src2, + int vlen_enc) { + 
assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); + + if (opcode == Op_MinV) { + if (elem_bt == T_BYTE) { + vpminsb(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_SHORT) { + vpminsw(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_INT) { + vpminsd(dst, src1, src2, vlen_enc); + } else { + assert(elem_bt == T_LONG, "required"); + if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { + vpminsq(dst, src1, src2, vlen_enc); + } else { + vpcmpgtq(dst, src1, src2, vlen_enc); + vblendvpd(dst, src1, src2, dst, vlen_enc); + } + } + } else { // opcode == Op_MaxV + if (elem_bt == T_BYTE) { + vpmaxsb(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_SHORT) { + vpmaxsw(dst, src1, src2, vlen_enc); + } else if (elem_bt == T_INT) { + vpmaxsd(dst, src1, src2, vlen_enc); + } else { + assert(elem_bt == T_LONG, "required"); + if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { + vpmaxsq(dst, src1, src2, vlen_enc); + } else { + vpcmpgtq(dst, src1, src2, vlen_enc); + vblendvpd(dst, src2, src1, dst, vlen_enc); + } + } + } +} + +// Float/Double min max + +void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc) { + assert(UseAVX > 0, "required"); + assert(opcode == Op_MinV || opcode == Op_MinReductionV || + opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); + assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); + + bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); + bool is_double_word = is_double_word_type(elem_bt); + + if (!is_double_word && is_min) { + vblendvps(atmp, a, b, a, vlen_enc); + vblendvps(btmp, b, a, a, vlen_enc); + vminps(tmp, atmp, btmp, vlen_enc); + vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvps(dst, tmp, atmp, btmp, vlen_enc); + } else if (!is_double_word && !is_min) { + vblendvps(btmp, b, a, 
b, vlen_enc); + vblendvps(atmp, a, b, b, vlen_enc); + vmaxps(tmp, atmp, btmp, vlen_enc); + vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvps(dst, tmp, atmp, btmp, vlen_enc); + } else if (is_double_word && is_min) { + vblendvpd(atmp, a, b, a, vlen_enc); + vblendvpd(btmp, b, a, a, vlen_enc); + vminpd(tmp, atmp, btmp, vlen_enc); + vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvpd(dst, tmp, atmp, btmp, vlen_enc); + } else { + assert(is_double_word && !is_min, "sanity"); + vblendvpd(btmp, b, a, b, vlen_enc); + vblendvpd(atmp, a, b, b, vlen_enc); + vmaxpd(tmp, atmp, btmp, vlen_enc); + vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + vblendvpd(dst, tmp, atmp, btmp, vlen_enc); + } +} + +void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + KRegister ktmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc) { + assert(UseAVX > 2, "required"); + assert(opcode == Op_MinV || opcode == Op_MinReductionV || + opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); + assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); + + bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); + bool is_double_word = is_double_word_type(elem_bt); + bool merge = true; + + if (!is_double_word && is_min) { + evpmovd2m(ktmp, a, vlen_enc); + evblendmps(atmp, ktmp, a, b, merge, vlen_enc); + evblendmps(btmp, ktmp, b, a, merge, vlen_enc); + vminps(dst, atmp, btmp, vlen_enc); + evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdqul(dst, ktmp, atmp, merge, vlen_enc); + } else if (!is_double_word && !is_min) { + evpmovd2m(ktmp, b, vlen_enc); + evblendmps(atmp, ktmp, a, b, merge, vlen_enc); + evblendmps(btmp, ktmp, b, a, merge, vlen_enc); + vmaxps(dst, atmp, btmp, vlen_enc); + evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdqul(dst, ktmp, atmp, merge, vlen_enc); + } else if (is_double_word && is_min) { + evpmovq2m(ktmp, a, vlen_enc); + 
evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); + evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); + vminpd(dst, atmp, btmp, vlen_enc); + evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdquq(dst, ktmp, atmp, merge, vlen_enc); + } else { + assert(is_double_word && !is_min, "sanity"); + evpmovq2m(ktmp, b, vlen_enc); + evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); + evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); + vmaxpd(dst, atmp, btmp, vlen_enc); + evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); + evmovdquq(dst, ktmp, atmp, merge, vlen_enc); + } +} + void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { if (sign) { pmovsxbw(dst, src); @@ -871,111 +1047,418 @@ } } -void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) { - if (opcode == Op_RShiftVI) { - psrad(dst, src); - } else if (opcode == Op_LShiftVI) { - pslld(dst, src); +void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { + if (sign) { + vpmovsxbd(dst, src, vector_len); } else { - assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); - psrld(dst, src); + vpmovzxbd(dst, src, vector_len); } } -void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if (opcode == Op_RShiftVI) { - vpsrad(dst, nds, src, vector_len); - } else if (opcode == Op_LShiftVI) { - vpslld(dst, nds, src, vector_len); +void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { + if (sign) { + vpmovsxwd(dst, src, vector_len); } else { - assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); - vpsrld(dst, nds, src, vector_len); + vpmovzxwd(dst, src, vector_len); } } -void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) { - if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { - psraw(dst, src); - } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { - psllw(dst, 
src); - } else { - assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); - psrlw(dst, src); +void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVI: psrad(dst, shift); break; + case Op_LShiftVI: pslld(dst, shift); break; + case Op_URShiftVI: psrld(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); } } -void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { - vpsraw(dst, nds, src, vector_len); - } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { - vpsllw(dst, nds, src, vector_len); - } else { - assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); - vpsrlw(dst, nds, src, vector_len); +void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; + case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; + case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); } } -void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) { - if (opcode == Op_RShiftVL) { - psrlq(dst, src); // using srl to implement sra on pre-avs512 systems - } else if (opcode == Op_LShiftVL) { - psllq(dst, src); +void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVB: // fall-through + case Op_RShiftVS: psraw(dst, shift); break; + + case Op_LShiftVB: // fall-through + case Op_LShiftVS: psllw(dst, shift); break; + + case Op_URShiftVS: // fall-through + case Op_URShiftVB: psrlw(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void 
C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_RShiftVB: // fall-through + case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; + + case Op_LShiftVB: // fall-through + case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; + + case Op_URShiftVS: // fall-through + case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { + switch (opcode) { + case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems + case Op_LShiftVL: psllq(dst, shift); break; + case Op_URShiftVL: psrlq(dst, shift); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; + case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; + case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_VRShiftV: vpsravd(dst, src, shift, vlen_enc); break; + case Op_VLShiftV: vpsllvd(dst, src, shift, vlen_enc); break; + case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { + switch (opcode) { + case Op_VRShiftV: evpsravw(dst, src, shift, vlen_enc); break; + case Op_VLShiftV: evpsllvw(dst, src, shift, vlen_enc); break; + case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break; + + default: assert(false, "%s", 
NodeClassNames[opcode]); + } +} + +void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { + assert(UseAVX >= 2, "required"); + switch (opcode) { + case Op_VRShiftV: { + if (UseAVX > 2) { + assert(tmp == xnoreg, "not used"); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + evpsravq(dst, src, shift, vlen_enc); + } else { + vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); + vpsrlvq(dst, src, shift, vlen_enc); + vpsrlvq(tmp, tmp, shift, vlen_enc); + vpxor(dst, dst, tmp, vlen_enc); + vpsubq(dst, dst, tmp, vlen_enc); + } + break; + } + case Op_VLShiftV: { + assert(tmp == xnoreg, "not used"); + vpsllvq(dst, src, shift, vlen_enc); + break; + } + case Op_VURShiftV: { + assert(tmp == xnoreg, "not used"); + vpsrlvq(dst, src, shift, vlen_enc); + break; + } + default: assert(false, "%s", NodeClassNames[opcode]); + } +} + +// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst +void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { + bool sign = (opcode == Op_VURShiftV) ? false : true; + assert(vector_len == 0, "required"); + vextendbd(sign, dst, src, 1); + vpmovzxbd(vtmp, shift, 1); + varshiftd(opcode, dst, dst, vtmp, 1); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); + vextracti128_high(vtmp, dst); + vpackusdw(dst, dst, vtmp, 0); +} + +// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst +void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { + bool sign = (opcode == Op_VURShiftV) ? 
false : true; + int ext_vector_len = vector_len + 1; + vextendbw(sign, dst, src, ext_vector_len); + vpmovzxbw(vtmp, shift, ext_vector_len); + varshiftw(opcode, dst, dst, vtmp, ext_vector_len); + vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); + if (vector_len == 0) { + vextracti128_high(vtmp, dst); + vpackuswb(dst, dst, vtmp, vector_len); } else { - assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); - psrlq(dst, src); + vextracti64x4_high(vtmp, dst); + vpackuswb(dst, dst, vtmp, vector_len); + vpermq(dst, dst, 0xD8, vector_len); + } +} + +void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { + switch(typ) { + case T_BYTE: + pinsrb(dst, val, idx); + break; + case T_SHORT: + pinsrw(dst, val, idx); + break; + case T_INT: + pinsrd(dst, val, idx); + break; + case T_LONG: + pinsrq(dst, val, idx); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { + switch(typ) { + case T_BYTE: + vpinsrb(dst, src, val, idx); + break; + case T_SHORT: + vpinsrw(dst, src, val, idx); + break; + case T_INT: + vpinsrd(dst, src, val, idx); + break; + case T_LONG: + vpinsrq(dst, src, val, idx); + break; + default: + assert(false,"Should not reach here."); + break; } } -void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - if (opcode == Op_RShiftVL) { - evpsraq(dst, nds, src, vector_len); - } else if (opcode == Op_LShiftVL) { - vpsllq(dst, nds, src, vector_len); +void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { + switch(typ) { + case T_INT: + vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); + break; + case T_FLOAT: + vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); + 
break; + case T_LONG: + vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); + break; + case T_DOUBLE: + vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { + switch(typ) { + case T_INT: + evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); + break; + case T_FLOAT: + evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); + break; + case T_LONG: + evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); + break; + case T_DOUBLE: + evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { + switch(typ) { + case T_INT: + evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); + break; + case T_FLOAT: + evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); + break; + case T_LONG: + evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); + break; + case T_DOUBLE: + evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) { + if (vlen_in_bytes <= 16) { + pxor (dst, dst); + psubb(dst, src); + switch (elem_bt) { + case T_BYTE: /* nothing to do */ break; + case T_SHORT: pmovsxbw(dst, dst); break; + case T_INT: pmovsxbd(dst, dst); break; + case T_FLOAT: pmovsxbd(dst, dst); break; + case T_LONG: pmovsxbq(dst, dst); break; + case T_DOUBLE: pmovsxbq(dst, dst); break; + + default: 
assert(false, "%s", type2name(elem_bt)); + } } else { - assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); - vpsrlq(dst, nds, src, vector_len); + int vlen_enc = vector_length_encoding(vlen_in_bytes); + + vpxor (dst, dst, dst, vlen_enc); + vpsubb(dst, dst, src, vlen_enc); + switch (elem_bt) { + case T_BYTE: /* nothing to do */ break; + case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; + case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; + case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; + case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; + case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; + + default: assert(false, "%s", type2name(elem_bt)); + } } } -// Reductions for vectors of ints, longs, floats, and doubles. +void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { + ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); + if (vlen_in_bytes <= 16) { + movdqu(dst, addr, scratch); + } else if (vlen_in_bytes == 32) { + vmovdqu(dst, addr, scratch); + } else { + assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); + evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); + } +} +// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
-void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) { +void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { int vector_len = Assembler::AVX_128bit; switch (opcode) { case Op_AndReductionV: pand(dst, src); break; case Op_OrReductionV: por (dst, src); break; case Op_XorReductionV: pxor(dst, src); break; - + case Op_MinReductionV: + switch (typ) { + case T_BYTE: pminsb(dst, src); break; + case T_SHORT: pminsw(dst, src); break; + case T_INT: pminsd(dst, src); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpminsq(dst, dst, src, Assembler::AVX_128bit); break; + default: assert(false, "wrong type"); + } + break; + case Op_MaxReductionV: + switch (typ) { + case T_BYTE: pmaxsb(dst, src); break; + case T_SHORT: pmaxsw(dst, src); break; + case T_INT: pmaxsd(dst, src); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVF: addss(dst, src); break; case Op_AddReductionVD: addsd(dst, src); break; - case Op_AddReductionVI: paddd(dst, src); break; + case Op_AddReductionVI: + switch (typ) { + case T_BYTE: paddb(dst, src); break; + case T_SHORT: paddw(dst, src); break; + case T_INT: paddd(dst, src); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVL: paddq(dst, src); break; - case Op_MulReductionVF: mulss(dst, src); break; case Op_MulReductionVD: mulsd(dst, src); break; - case Op_MulReductionVI: pmulld(dst, src); break; - case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break; - - default: assert(false, "wrong opcode"); + case Op_MulReductionVI: + switch (typ) { + case T_SHORT: pmullw(dst, src); break; + case T_INT: pmulld(dst, src); break; + default: assert(false, "wrong type"); + } + break; + case Op_MulReductionVL: assert(UseAVX > 2, "required"); + vpmullq(dst, dst, src, vector_len); break; + default: 
assert(false, "wrong opcode"); } } -void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { +void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { int vector_len = Assembler::AVX_256bit; switch (opcode) { case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; - - case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break; + case Op_MinReductionV: + switch (typ) { + case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; + case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; + case T_INT: vpminsd(dst, src1, src2, vector_len); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpminsq(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; + case Op_MaxReductionV: + switch (typ) { + case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; + case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; + case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; + case T_LONG: assert(UseAVX > 2, "required"); + vpmaxsq(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; + case Op_AddReductionVI: + switch (typ) { + case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; + case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; + case T_INT: vpaddd(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; - - case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break; + case Op_MulReductionVI: + switch (typ) { + case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; + case T_INT: vpmulld(dst, src1, src2, vector_len); break; + default: assert(false, "wrong type"); + } + break; case Op_MulReductionVL: 
vpmullq(dst, src1, src2, vector_len); break; - - default: assert(false, "wrong opcode"); + default: assert(false, "wrong opcode"); } } @@ -997,9 +1480,48 @@ } } +void C2_MacroAssembler::reduceB(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + +void C2_MacroAssembler::mulreduceB(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + +void C2_MacroAssembler::reduceS(int opcode, int vlen, + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { + switch (vlen) { + case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; + case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; + + default: assert(false, "wrong vector length"); + } +} + void C2_MacroAssembler::reduceI(int opcode, int vlen, - Register dst, Register src1, XMMRegister src2, - XMMRegister vtmp1, XMMRegister vtmp2) { + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { switch (vlen) { case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; case 4: reduce4I 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; @@ -1012,8 +1534,8 @@ #ifdef _LP64 void C2_MacroAssembler::reduceL(int opcode, int vlen, - Register dst, Register src1, XMMRegister src2, - XMMRegister vtmp1, XMMRegister vtmp2) { + Register dst, Register src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2) { switch (vlen) { case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; @@ -1068,10 +1590,10 @@ phaddd(vtmp1, vtmp1); } else { pshufd(vtmp1, src2, 0x1); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_INT, opcode, vtmp1, src2); } movdl(vtmp2, src1); - reduce_operation_128(opcode, vtmp1, vtmp2); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); movdl(dst, vtmp1); } @@ -1084,7 +1606,7 @@ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } else { pshufd(vtmp2, src2, 0xE); - reduce_operation_128(opcode, vtmp2, src2); + reduce_operation_128(T_INT, opcode, vtmp2, src2); reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } } @@ -1097,51 +1619,176 @@ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } else { vextracti128_high(vtmp1, src2); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_INT, opcode, vtmp1, src2); reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } } void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti64x4_high(vtmp2, src2); - reduce_operation_256(opcode, vtmp2, vtmp2, src2); + reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } +void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pshufd(vtmp2, src2, 0x1); + reduce_operation_128(T_BYTE, opcode, vtmp2, src2); + movdqu(vtmp1, vtmp2); + psrldq(vtmp1, 2); + reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); + movdqu(vtmp2, vtmp1); + psrldq(vtmp2, 1); + 
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); + movdl(vtmp2, src1); + pmovsxbd(vtmp1, vtmp1); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); + pextrb(dst, vtmp1, 0x0); + movsbl(dst, dst); +} + +void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pshufd(vtmp1, src2, 0xE); + reduce_operation_128(T_BYTE, opcode, vtmp1, src2); + reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + vextracti128_high(vtmp2, src2); + reduce_operation_128(T_BYTE, opcode, vtmp2, src2); + reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + vextracti64x4_high(vtmp1, src2); + reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); + reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + +void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (UseAVX > 1) { + int vector_len = Assembler::AVX_256bit; + vpmovsxbw(vtmp1, src2, vector_len); + reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); + } else { + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); + pshufd(vtmp2, src2, 0x1); + pmovsxbw(vtmp2, src2); + reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); + } +} + +void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (UseAVX > 2 && VM_Version::supports_avx512bw()) { + int vector_len = Assembler::AVX_512bit; + 
vpmovsxbw(vtmp1, src2, vector_len); + reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); + } else { + assert(UseAVX >= 2,"Should not reach here."); + mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); + vextracti128_high(vtmp2, src2); + mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); + } +} + +void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); + vextracti64x4_high(vtmp2, src2); + mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + if (vtmp1 != src2) { + movdqu(vtmp1, src2); + } + phaddw(vtmp1, vtmp1); + phaddw(vtmp1, vtmp1); + } else { + pshufd(vtmp2, src2, 0x1); + reduce_operation_128(T_SHORT, opcode, vtmp2, src2); + movdqu(vtmp1, vtmp2); + psrldq(vtmp1, 2); + reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); + } + movdl(vtmp2, src1); + pmovsxwd(vtmp1, vtmp1); + reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); + pextrw(dst, vtmp1, 0x0); + movswl(dst, dst); +} + +void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + if (vtmp1 != src2) { + movdqu(vtmp1, src2); + } + phaddw(vtmp1, src2); + } else { + pshufd(vtmp1, src2, 0xE); + reduce_operation_128(T_SHORT, opcode, vtmp1, src2); + } + reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + if (opcode == Op_AddReductionVI) { + int vector_len = Assembler::AVX_256bit; + vphaddw(vtmp2, src2, src2, vector_len); + vpermq(vtmp2, vtmp2, 0xD8, vector_len); + } else { + vextracti128_high(vtmp2, src2); + reduce_operation_128(T_SHORT, opcode, vtmp2, src2); + 
} + reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); +} + +void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { + int vector_len = Assembler::AVX_256bit; + vextracti64x4_high(vtmp1, src2); + reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); + reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); +} + #ifdef _LP64 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { pshufd(vtmp2, src2, 0xE); - reduce_operation_128(opcode, vtmp2, src2); + reduce_operation_128(T_LONG, opcode, vtmp2, src2); movdq(vtmp1, src1); - reduce_operation_128(opcode, vtmp1, vtmp2); + reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); movdq(dst, vtmp1); } void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti128_high(vtmp1, src2); - reduce_operation_128(opcode, vtmp1, src2); + reduce_operation_128(T_LONG, opcode, vtmp1, src2); reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); } void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { vextracti64x4_high(vtmp2, src2); - reduce_operation_256(opcode, vtmp2, vtmp2, src2); + reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } #endif // _LP64 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { - reduce_operation_128(opcode, dst, src); + reduce_operation_128(T_FLOAT, opcode, dst, src); pshufd(vtmp, src, 0x1); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_FLOAT, opcode, dst, vtmp); } void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { reduce2F(opcode, dst, src, vtmp); pshufd(vtmp, src, 0x2); - reduce_operation_128(opcode, dst, vtmp); + 
reduce_operation_128(T_FLOAT, opcode, dst, vtmp); pshufd(vtmp, src, 0x3); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_FLOAT, opcode, dst, vtmp); } void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { @@ -1157,9 +1804,9 @@ } void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { - reduce_operation_128(opcode, dst, src); + reduce_operation_128(T_DOUBLE, opcode, dst, src); pshufd(vtmp, src, 0xE); - reduce_operation_128(opcode, dst, vtmp); + reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); } void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { @@ -1174,6 +1821,207 @@ reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); } +void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + XMMRegister xmm_0, XMMRegister xmm_1) { + int permconst[] = {1, 14}; + XMMRegister wsrc = src; + XMMRegister wdst = xmm_0; + XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; + + int vlen_enc = Assembler::AVX_128bit; + if (vlen == 16) { + vlen_enc = Assembler::AVX_256bit; + } + + for (int i = log2(vlen) - 1; i >=0; i--) { + if (i == 0 && !is_dst_valid) { + wdst = dst; + } + if (i == 3) { + vextracti64x4_high(wtmp, wsrc); + } else if (i == 2) { + vextracti128_high(wtmp, wsrc); + } else { // i = [0,1] + vpermilps(wtmp, wsrc, permconst[i], vlen_enc); + } + vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); + wsrc = wdst; + vlen_enc = Assembler::AVX_128bit; + } + if (is_dst_valid) { + vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); + } +} + +void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + XMMRegister xmm_0, XMMRegister xmm_1) { + XMMRegister wsrc = src; + XMMRegister wdst = xmm_0; + XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; + int vlen_enc = Assembler::AVX_128bit; + if (vlen == 8) { + vlen_enc = Assembler::AVX_256bit; + } + for (int i = log2(vlen) - 1; i >=0; i--) { + if (i == 0 && !is_dst_valid) { + wdst = dst; + } + if (i == 1) { + vextracti128_high(wtmp, wsrc); + } else if (i == 2) { + vextracti64x4_high(wtmp, wsrc); + } else { + assert(i == 0, "%d", i); + vpermilpd(wtmp, wsrc, 1, vlen_enc); + } + vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); + wsrc = wdst; + vlen_enc = Assembler::AVX_128bit; + } + if (is_dst_valid) { + vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); + } +} + +void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { + switch (bt) { + case T_BYTE: pextrb(dst, src, idx); break; + case T_SHORT: pextrw(dst, src, idx); break; + case T_INT: pextrd(dst, src, idx); break; + case T_LONG: pextrq(dst, src, idx); break; + + default: + assert(false,"Should not reach here."); + break; + } +} + +XMMRegister 
C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int lane = elemindex / elem_per_lane; + int eindex = elemindex % elem_per_lane; + + if (lane >= 2) { + assert(UseAVX > 2, "required"); + vextractf32x4(dst, src, lane & 3); + return dst; + } else if (lane > 0) { + assert(UseAVX > 0, "required"); + vextractf128(dst, src, lane); + return dst; + } else { + return src; + } +} + +void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int eindex = elemindex % elem_per_lane; + assert(is_integral_type(typ),"required"); + + if (eindex == 0) { + if (typ == T_LONG) { + movq(dst, src); + } else { + movdl(dst, src); + if (typ == T_BYTE) + movsbl(dst, dst); + else if (typ == T_SHORT) + movswl(dst, dst); + } + } else { + extract(typ, dst, src, eindex); + } +} + +void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { + int esize = type2aelembytes(typ); + int elem_per_lane = 16/esize; + int eindex = elemindex % elem_per_lane; + assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); + + if (eindex == 0) { + movq(dst, src); + } else { + if (typ == T_FLOAT) { + if (UseAVX == 0) { + movdqu(dst, src); + pshufps(dst, dst, eindex); + } else { + vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); + } + } else { + if (UseAVX == 0) { + movdqu(dst, src); + psrldq(dst, eindex*esize); + } else { + vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); + } + movq(dst, dst); + } + } + // Zero upper bits + if (typ == T_FLOAT) { + if (UseAVX == 0) { + assert((vtmp != xnoreg) && (tmp != noreg), "required."); + movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); + pand(dst, vtmp); + } else { + assert((tmp != noreg), "required."); + vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); + } + } +} + +void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { + switch(typ) { + case T_BYTE: + evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_SHORT: + evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_INT: + case T_FLOAT: + evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + case T_LONG: + case T_DOUBLE: + evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + +void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + switch(typ) { + case T_BYTE: + evpblendmb(dst, kmask, src1, src2, merge, vector_len); + break; + case T_SHORT: + evpblendmw(dst, kmask, src1, src2, merge, vector_len); + break; + case T_INT: + case T_FLOAT: + evpblendmd(dst, kmask, src1, src2, merge, vector_len); + break; + case T_LONG: + case T_DOUBLE: + evpblendmq(dst, kmask, src1, src2, merge, vector_len); + break; + default: + assert(false,"Should not reach here."); + break; + } +} + //------------------------------------------------------------------------------------------- // IndexOf for constant substrings with size >= 8 chars --- old/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp 2020-04-02 18:03:44.944854099 -0700 +++ new/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp 2020-04-02 18:03:44.784854099 -0700 @@ -28,6 +28,8 @@ // C2_MacroAssembler contains high-level macros for C2 public: + Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes); + // special instructions for EVEX void setvectmask(Register dst, Register src); void restorevectmask(); @@ -71,18 +73,61 @@ void vabsnegd(int opcode, 
XMMRegister dst, XMMRegister src, int vector_len, Register scr); void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr); void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr); + + void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, + XMMRegister tmp = xnoreg); + void vpminmax(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister src1, XMMRegister src2, + int vlen_enc); + + void vminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc); + void evminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + KRegister ktmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc); + void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len); void vextendbw(bool sign, XMMRegister dst, XMMRegister src); - void vshiftd(int opcode, XMMRegister dst, XMMRegister src); - void vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vshiftw(int opcode, XMMRegister dst, XMMRegister src); - void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vshiftq(int opcode, XMMRegister dst, XMMRegister src); - void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len); + void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len); - // Reductions for vectors of ints, longs, floats, and doubles. 
+ void vshiftd(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void vshiftw(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void vshiftq(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg); + void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch); + void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch); + + void insert(BasicType typ, XMMRegister dst, Register val, int idx); + void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx); + void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len); + void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len); + void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len); + + // extract + void extract(BasicType typ, Register dst, XMMRegister src, int idx); + XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex); + void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex); + void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp = noreg, XMMRegister vtmp = xnoreg); + + // blend + void evpcmp(BasicType typ, 
KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1); + void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); - // dst = src1 + reduce(op, src2) using vtmp as temps + void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt); + void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes); + + // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. + + // dst = src1 reduce(op, src2) using vtmp as temps void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); #ifdef _LP64 void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); @@ -92,32 +137,62 @@ void reduce_fp(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg); + void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg); + void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg); private: void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2); + // Int Reduction void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + // Byte Reduction + void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + + // Short Reduction + void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + + // Long Reduction #ifdef _LP64 void reduce2L(int 
opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); #endif // _LP64 + // Float Reduction void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); + // Double Reduction void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); - void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src); - void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); + // Base reduction instruction + void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src); + void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); public: --- old/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2020-04-02 18:03:45.492854099 -0700 +++ new/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2020-04-02 18:03:45.316854099 -0700 @@ -112,6 +112,7 @@ cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); } + void MacroAssembler::cmpklass(Register src1, Metadata* obj) { cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); } @@ -2500,6 +2501,7 @@ void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { assert(((dst->encoding() < 16 && src->encoding() < 16) || 
VM_Version::supports_avx512vl()),"XMM register should be 0-15"); + if (dst->encoding() == src->encoding()) return; Assembler::movdqu(dst, src); } @@ -2524,6 +2526,7 @@ void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); + if (dst->encoding() == src->encoding()) return; Assembler::vmovdqu(dst, src); } @@ -2537,6 +2540,64 @@ } } + +void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) { + if (reachable(src)) { + kmovwl(dst, as_Address(src)); + } else { + lea(scratch_reg, src); + kmovwl(dst, Address(scratch_reg, 0)); + } +} + +void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, + int vector_len, Register scratch_reg) { + if (reachable(src)) { + if (mask == k0) { + Assembler::evmovdqub(dst, as_Address(src), merge, vector_len); + } else { + Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len); + } + } else { + lea(scratch_reg, src); + if (mask == k0) { + Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len); + } else { + Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len); + } + } +} + +void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, + int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len); + } +} + +void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, + int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len); + } +} + +void 
MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, + int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len); + } +} + void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { if (reachable(src)) { Assembler::evmovdquq(dst, as_Address(src), vector_len); @@ -3023,6 +3084,98 @@ Assembler::vpcmpeqw(dst, nds, src, vector_len); } +void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, + AddressLiteral src, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len); + } +} + +void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len); + } +} + +void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len); + } +} + +void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpcmpb(kdst, mask, nds, as_Address(src), 
comparison, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len); + } +} + +void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len); + } +} + +void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) { + if (width == Assembler::Q) { + Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len); + } else { + Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len); + } +} + +void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) { + int eq_cond_enc = 0x29; + int gt_cond_enc = 0x37; + if (width != Assembler::Q) { + eq_cond_enc = 0x74 + width; + gt_cond_enc = 0x64 + width; + } + switch (cond) { + case eq: + vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); + break; + case neq: + vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); + vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg); + break; + case le: + vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); + vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg); + break; + case nlt: + vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); + vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg); + break; + case lt: + vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); + break; + case nle: + vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); + break; + default: + assert(false, 
"Should not reach here"); + } +} + void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); Assembler::vpmovzxbw(dst, src, vector_len); @@ -3147,6 +3300,16 @@ } } +void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, + bool merge, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len); + } else { + lea(scratch_reg, src); + Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len); + } +} + void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { if (reachable(src)) { vdivsd(dst, nds, as_Address(src)); @@ -3243,7 +3406,14 @@ } } -//------------------------------------------------------------------------------------------- +void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { + if (reachable(src)) { + Assembler::vpermd(dst, nds, as_Address(src), vector_len); + } else { + lea(scratch_reg, src); + Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len); + } +} void MacroAssembler::clear_jweak_tag(Register possibly_jweak) { const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask); @@ -5773,7 +5943,7 @@ bind(VECTOR64_LOOP); // AVX512 code to compare 64 byte vectors. 
- evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit); + evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit); evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); kortestql(k7, k7); jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch @@ -5792,7 +5962,7 @@ notq(tmp2); kmovql(k3, tmp2); - evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit); + evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit); evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); ktestql(k7, k3); @@ -7231,7 +7401,7 @@ notl(result); kmovdl(k3, result); - evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit); + evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); ktestd(k2, k3); jcc(Assembler::carryClear, return_zero); @@ -7256,7 +7426,7 @@ negptr(len); bind(copy_32_loop); - evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); + evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit); evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); kortestdl(k2, k2); jcc(Assembler::carryClear, return_zero); @@ -7281,7 +7451,7 @@ kmovdl(k3, result); - evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit); + evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); ktestd(k2, k3); jcc(Assembler::carryClear, return_zero); @@ -7426,7 +7596,7 @@ // inflate 32 chars per iter bind(copy_32_loop); vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); - evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); + evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit); addptr(len, 32); jcc(Assembler::notZero, copy_32_loop); @@ -7441,7 +7611,7 @@ 
notl(tmp3_aliased); kmovdl(k2, tmp3_aliased); evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit); - evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit); + evmovdquw(Address(dst, 0), k2, tmp1, /*merge*/ true, Assembler::AVX_512bit); jmp(done); bind(avx3_threshold); --- old/src/hotspot/cpu/x86/macroAssembler_x86.hpp 2020-04-02 18:03:46.020854100 -0700 +++ new/src/hotspot/cpu/x86/macroAssembler_x86.hpp 2020-04-02 18:03:45.856854100 -0700 @@ -1083,15 +1083,59 @@ void movdqu(XMMRegister dst, Address src); void movdqu(XMMRegister dst, XMMRegister src); void movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg = rscratch1); + + void kmovwl(KRegister dst, Register src) { Assembler::kmovwl(dst, src); } + void kmovwl(Register dst, KRegister src) { Assembler::kmovwl(dst, src); } + void kmovwl(KRegister dst, Address src) { Assembler::kmovwl(dst, src); } + void kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); + // AVX Unaligned forms void vmovdqu(Address dst, XMMRegister src); void vmovdqu(XMMRegister dst, Address src); void vmovdqu(XMMRegister dst, XMMRegister src); void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); + + // AVX512 Unaligned + void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } + void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } + void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } + void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); } + void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); + + void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { 
Assembler::evmovdquw(dst, src, merge, vector_len); } + void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); } + void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); } + void evmovdquw(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquw(dst, mask, src, merge, vector_len); } + void evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); + + void evmovdqul(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); } + void evmovdqul(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdqul(dst, src, vector_len); } + void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) { + if (dst->encoding() == src->encoding()) return; + Assembler::evmovdqul(dst, src, vector_len); + } + void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); } + void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqul(dst, mask, src, merge, vector_len); } + void evmovdqul(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { + if (dst->encoding() == src->encoding() && mask == k0) return; + Assembler::evmovdqul(dst, mask, src, merge, vector_len); + } + void evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); + void evmovdquq(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } - void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } void 
evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch); + void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { + if (dst->encoding() == src->encoding()) return; + Assembler::evmovdquq(dst, src, vector_len); + } + void evmovdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); } + void evmovdquq(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdquq(dst, mask, src, merge, vector_len); } + void evmovdquq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { + if (dst->encoding() == src->encoding() && mask == k0) return; + Assembler::evmovdquq(dst, mask, src, merge, vector_len); + } + void evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); // Move Aligned Double Quadword void movdqa(XMMRegister dst, Address src) { Assembler::movdqa(dst, src); } @@ -1213,6 +1257,30 @@ void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg); + + // Vector compares + void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { Assembler::evpcmpd(kdst, mask, nds, src, comparison, vector_len); } + void evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg); + void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { Assembler::evpcmpq(kdst, mask, nds, src, comparison, vector_len); } + void evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg); + 
void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { Assembler::evpcmpb(kdst, mask, nds, src, comparison, vector_len); } + void evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg); + void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, + int comparison, int vector_len) { Assembler::evpcmpw(kdst, mask, nds, src, comparison, vector_len); } + void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, + int comparison, int vector_len, Register scratch_reg); + + + // Emit comparison instruction for the specified comparison predicate. + void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg); + void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len); void vpmovzxbw(XMMRegister dst, Address src, int vector_len); void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); } @@ -1241,6 +1309,7 @@ void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len); void vptest(XMMRegister dst, XMMRegister src); + void vptest(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vptest(dst, src, vector_len); } void punpcklbw(XMMRegister dst, XMMRegister src); void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); } @@ -1259,6 +1328,8 @@ void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); + void evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); + void vdivsd(XMMRegister dst, XMMRegister nds, 
XMMRegister src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src); @@ -1314,6 +1385,9 @@ void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); } void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); } + void vpermd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpermd(dst, nds, src, vector_len); } + void vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg); + void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) { if (UseAVX > 2 && VM_Version::supports_avx512novl()) { Assembler::vinserti32x4(dst, dst, src, imm8); --- old/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp 2020-04-02 18:03:46.576854100 -0700 +++ new/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp 2020-04-02 18:03:46.404854100 -0700 @@ -619,6 +619,29 @@ return start; } + address generate_iota_indices(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + __ emit_data(0x03020100, relocInfo::none, 0); + __ emit_data(0x07060504, relocInfo::none, 0); + __ emit_data(0x0B0A0908, relocInfo::none, 0); + __ emit_data(0x0F0E0D0C, relocInfo::none, 0); + __ emit_data(0x13121110, relocInfo::none, 0); + __ emit_data(0x17161514, relocInfo::none, 0); + __ emit_data(0x1B1A1918, relocInfo::none, 0); + __ emit_data(0x1F1E1D1C, relocInfo::none, 0); + __ emit_data(0x23222120, relocInfo::none, 0); + __ emit_data(0x27262524, relocInfo::none, 0); + __ emit_data(0x2B2A2928, relocInfo::none, 0); + __ emit_data(0x2F2E2D2C, relocInfo::none, 0); + __ emit_data(0x33323130, relocInfo::none, 0); + __ emit_data(0x37363534, relocInfo::none, 0); + __ emit_data(0x3B3A3938, relocInfo::none, 0); + __ emit_data(0x3F3E3D3C, relocInfo::none, 0); + return 
start; + } + address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", stub_name); @@ -659,6 +682,40 @@ return start; } + address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len, + int32_t val0, int32_t val1, int32_t val2, int32_t val3, + int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0, + int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0, + int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + assert(len != Assembler::AVX_NoVec, "vector len must be specified"); + __ emit_data(val0, relocInfo::none, 0); + __ emit_data(val1, relocInfo::none, 0); + __ emit_data(val2, relocInfo::none, 0); + __ emit_data(val3, relocInfo::none, 0); + if (len >= Assembler::AVX_256bit) { + __ emit_data(val4, relocInfo::none, 0); + __ emit_data(val5, relocInfo::none, 0); + __ emit_data(val6, relocInfo::none, 0); + __ emit_data(val7, relocInfo::none, 0); + if (len >= Assembler::AVX_512bit) { + __ emit_data(val8, relocInfo::none, 0); + __ emit_data(val9, relocInfo::none, 0); + __ emit_data(val10, relocInfo::none, 0); + __ emit_data(val11, relocInfo::none, 0); + __ emit_data(val12, relocInfo::none, 0); + __ emit_data(val13, relocInfo::none, 0); + __ emit_data(val14, relocInfo::none, 0); + __ emit_data(val15, relocInfo::none, 0); + } + } + + return start; + } + //---------------------------------------------------------------------------------------------------- // Non-destructive plausibility checks for oops @@ -3889,8 +3946,19 @@ StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF); StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 
0x80000000, 0x00000000); StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff); + StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff); + StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff); + StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit, + 0xFFFFFFFF, 0, 0, 0); + StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit, + 0xFFFFFFFF, 0xFFFFFFFF, 0, 0); + StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100); + StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100); + StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0); StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000); + StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF); + StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices"); // support for verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); --- old/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2020-04-02 18:03:47.144854101 -0700 +++ new/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2020-04-02 18:03:46.980854101 -0700 @@ -973,6 +973,21 @@ return start; } + address generate_iota_indices(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + __ emit_data64(0x0706050403020100, relocInfo::none); + __ 
emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); + __ emit_data64(0x1716151413121110, relocInfo::none); + __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none); + __ emit_data64(0x2726252423222120, relocInfo::none); + __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none); + __ emit_data64(0x3736353433323130, relocInfo::none); + __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none); + return start; + } + address generate_fp_mask(const char *stub_name, int64_t mask) { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", stub_name); @@ -1018,6 +1033,57 @@ return start; } + address generate_vector_fp_mask(const char *stub_name, int64_t mask) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + + return start; + } + + address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len, + int32_t val0, int32_t val1, int32_t val2, int32_t val3, + int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0, + int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0, + int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + assert(len != Assembler::AVX_NoVec, "vector len must be specified"); + __ emit_data(val0, relocInfo::none, 0); + __ emit_data(val1, relocInfo::none, 0); + __ emit_data(val2, relocInfo::none, 0); + __ emit_data(val3, relocInfo::none, 0); + if (len >= Assembler::AVX_256bit) { + __ emit_data(val4, relocInfo::none, 0); + __ emit_data(val5, relocInfo::none, 0); + __ 
emit_data(val6, relocInfo::none, 0); + __ emit_data(val7, relocInfo::none, 0); + if (len >= Assembler::AVX_512bit) { + __ emit_data(val8, relocInfo::none, 0); + __ emit_data(val9, relocInfo::none, 0); + __ emit_data(val10, relocInfo::none, 0); + __ emit_data(val11, relocInfo::none, 0); + __ emit_data(val12, relocInfo::none, 0); + __ emit_data(val13, relocInfo::none, 0); + __ emit_data(val14, relocInfo::none, 0); + __ emit_data(val15, relocInfo::none, 0); + } + } + + return start; + } + // Non-destructive plausibility checks for oops // // Arguments: @@ -6446,9 +6512,20 @@ StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000); StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF); StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000); + StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF); StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff); StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); + StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff); + StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff); + StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit, + 0xFFFFFFFF, 0, 0, 0); + StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit, + 0xFFFFFFFF, 0xFFFFFFFF, 0, 0); + StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100); + StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 
0x0100010001000100); + StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000); StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000); + StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices"); // support for verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); --- old/src/hotspot/cpu/x86/stubRoutines_x86.cpp 2020-04-02 18:03:47.724854101 -0700 +++ new/src/hotspot/cpu/x86/stubRoutines_x86.cpp 2020-04-02 18:03:47.560854101 -0700 @@ -44,12 +44,21 @@ address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; address StubRoutines::x86::_k256_adr = NULL; address StubRoutines::x86::_vector_short_to_byte_mask = NULL; +address StubRoutines::x86::_vector_int_to_byte_mask = NULL; +address StubRoutines::x86::_vector_int_to_short_mask = NULL; +address StubRoutines::x86::_vector_all_bits_set = NULL; +address StubRoutines::x86::_vector_short_shuffle_mask = NULL; +address StubRoutines::x86::_vector_int_shuffle_mask = NULL; +address StubRoutines::x86::_vector_long_shuffle_mask = NULL; address StubRoutines::x86::_vector_float_sign_mask = NULL; address StubRoutines::x86::_vector_float_sign_flip = NULL; address StubRoutines::x86::_vector_double_sign_mask = NULL; address StubRoutines::x86::_vector_double_sign_flip = NULL; address StubRoutines::x86::_vector_byte_perm_mask = NULL; address StubRoutines::x86::_vector_long_sign_mask = NULL; +address StubRoutines::x86::_vector_iota_indices = NULL; +address StubRoutines::x86::_vector_32_bit_mask = NULL; +address StubRoutines::x86::_vector_64_bit_mask = NULL; #ifdef _LP64 address StubRoutines::x86::_k256_W_adr = NULL; address StubRoutines::x86::_k512_W_addr = NULL; --- old/src/hotspot/cpu/x86/stubRoutines_x86.hpp 2020-04-02 18:03:48.360854102 -0700 +++ new/src/hotspot/cpu/x86/stubRoutines_x86.hpp 2020-04-02 18:03:48.196854102 -0700 @@ -141,8 
+141,17 @@ static address _vector_float_sign_flip; static address _vector_double_sign_mask; static address _vector_double_sign_flip; - static address _vector_byte_perm_mask; static address _vector_long_sign_mask; + static address _vector_all_bits_set; + static address _vector_byte_perm_mask; + static address _vector_int_to_byte_mask; + static address _vector_int_to_short_mask; + static address _vector_32_bit_mask; + static address _vector_64_bit_mask; + static address _vector_int_shuffle_mask; + static address _vector_short_shuffle_mask; + static address _vector_long_shuffle_mask; + static address _vector_iota_indices; #ifdef _LP64 static juint _k256_W[]; static address _k256_W_adr; @@ -238,13 +247,50 @@ return _vector_double_sign_flip; } + static address vector_all_bits_set() { + return _vector_all_bits_set; + } + static address vector_byte_perm_mask() { return _vector_byte_perm_mask; } + static address vector_int_to_byte_mask() { + return _vector_int_to_byte_mask; + } + + static address vector_int_to_short_mask() { + return _vector_int_to_short_mask; + } + + static address vector_32_bit_mask() { + return _vector_32_bit_mask; + } + + static address vector_64_bit_mask() { + return _vector_64_bit_mask; + } + + static address vector_int_shuffle_mask() { + return _vector_int_shuffle_mask; + } + + static address vector_short_shuffle_mask() { + return _vector_short_shuffle_mask; + } + + static address vector_long_shuffle_mask() { + return _vector_long_shuffle_mask; + } + static address vector_long_sign_mask() { return _vector_long_sign_mask; } + + static address vector_iota_indices() { + return _vector_iota_indices; + } + #ifdef _LP64 static address k256_W_addr() { return _k256_W_adr; } static address k512_W_addr() { return _k512_W_addr; } --- old/src/hotspot/cpu/x86/x86.ad 2020-04-02 18:03:48.924854103 -0700 +++ new/src/hotspot/cpu/x86/x86.ad 2020-04-02 18:03:48.744854102 -0700 @@ -1097,6 +1097,7 @@ reg_class_dynamic vectorz_reg (vectorz_reg_evex, vectorz_reg_legacy, 
%{ VM_Version::supports_evex() %} ); reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} ); +reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d); %} @@ -1165,6 +1166,64 @@ #endif }; + +inline uint vector_length(const Node* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->length(); +} + +inline uint vector_length(const MachNode* use, MachOper* opnd) { + uint def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + return def->bottom_type()->is_vect()->length(); +} + +inline uint vector_length_in_bytes(const Node* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->length_in_bytes(); +} + +inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) { + uint def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + return def->bottom_type()->is_vect()->length_in_bytes(); +} + +inline BasicType vector_element_basic_type(const Node *n) { + return n->bottom_type()->is_vect()->element_basic_type(); +} + +inline BasicType vector_element_basic_type(const MachNode *use, MachOper* opnd) { + uint def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + return def->bottom_type()->is_vect()->element_basic_type(); +} + +inline Assembler::AvxVectorLen vector_length_encoding(int bytes) { + switch(bytes) { + case 4: // fall-through + case 8: // fall-through + case 16: return Assembler::AVX_128bit; + case 32: return Assembler::AVX_256bit; + case 64: return Assembler::AVX_512bit; + + default: { + ShouldNotReachHere(); + return Assembler::AVX_NoVec; + } + } +} + +static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) { + return vector_length_encoding(vector_length_in_bytes(n)); +} + +static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) { + uint def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + return vector_length_encoding(def); +} + 
%} // end source_hpp source %{ @@ -1225,6 +1284,18 @@ return offset; } +Assembler::Width widthForType(BasicType bt) { + if (bt == T_BYTE) { + return Assembler::B; + } else if (bt == T_SHORT) { + return Assembler::W; + } else if (bt == T_INT) { + return Assembler::D; + } else { + assert(bt == T_LONG, "not a long: %s", type2name(bt)); + return Assembler::Q; + } +} //============================================================================= @@ -1241,8 +1312,16 @@ static address double_signflip() { return (address)double_signflip_pool; } #endif static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); } + static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); } static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); } static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); } + static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); } + static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); } + static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); } + static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); } + static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); } + static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); } + static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); } //============================================================================= const bool Matcher::match_rule_supported(int opcode) { @@ -1251,6 +1330,7 @@ } switch (opcode) { case Op_AbsVL: + case Op_StoreVectorScatter: if (UseAVX < 3) { return false; } @@ -1272,11 +1352,20 @@ } break; case Op_MulVL: + if (UseSSE < 4) { // only with SSE4_1 or AVX + return false; + } + break; 
case Op_MulReductionVL: if (VM_Version::supports_avx512dq() == false) { return false; } break; + case Op_AddReductionVL: + if (UseSSE < 2) { // requires at least SSE2 + return false; + } + break; case Op_AbsVB: case Op_AbsVS: case Op_AbsVI: @@ -1288,6 +1377,8 @@ return false; } break; + case Op_VectorLoadShuffle: + case Op_VectorRearrange: case Op_MulReductionVI: if (UseSSE < 4) { // requires at least SSE4 return false; @@ -1295,6 +1386,13 @@ break; case Op_SqrtVD: case Op_SqrtVF: + case Op_VectorMaskCmp: + case Op_VectorCastB2X: + case Op_VectorCastS2X: + case Op_VectorCastI2X: + case Op_VectorCastL2X: + case Op_VectorCastF2X: + case Op_VectorCastD2X: if (UseAVX < 1) { // enabled for AVX only return false; } @@ -1309,7 +1407,7 @@ break; case Op_CMoveVF: case Op_CMoveVD: - if (UseAVX < 1 || UseAVX > 2) { + if (UseAVX < 1) { // enabled for AVX only return false; } break; @@ -1332,6 +1430,10 @@ case Op_LShiftVB: case Op_RShiftVB: case Op_URShiftVB: + case Op_VectorInsert: + case Op_VectorLoadMask: + case Op_VectorStoreMask: + case Op_VectorBlend: if (UseSSE < 4) { return false; } @@ -1353,6 +1455,9 @@ return false; } break; + case Op_ExtractB: + case Op_ExtractL: + case Op_ExtractI: case Op_RoundDoubleMode: if (UseSSE < 4) { return false; @@ -1363,6 +1468,20 @@ return false; // 128bit vroundpd is not available } break; + case Op_VLShiftV: + case Op_VRShiftV: + case Op_VURShiftV: + case Op_LoadVectorGather: + if (UseAVX < 2) { + return false; + } + break; + case Op_FmaVD: + case Op_FmaVF: + if (!UseFMA) { + return false; + } + break; #ifndef _LP64 case Op_AddReductionVF: case Op_AddReductionVD: @@ -1401,6 +1520,8 @@ // * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types. // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE). // And MaxVectorSize is taken into account as well. 
+ + int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte; if (!vector_size_supported(bt, vlen)) { return false; } @@ -1417,8 +1538,9 @@ break; case Op_AbsVD: case Op_NegVD: + case Op_MulVL: if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) { - return false; // 512bit vandpd and vxorpd are not available + return false; // 512bit vpmullq, vandpd and vxorpd are not available } break; case Op_CMoveVF: @@ -1431,6 +1553,142 @@ return false; // implementation limitation (only vcmov4D_reg is present) } break; + case Op_MaxV: + case Op_MinV: + if (UseSSE < 4 && is_integral_type(bt)) { + return false; + } + if ((bt == T_FLOAT || bt == T_DOUBLE)) { + // Float/Double intrinsics are enabled for AVX family currently. + if (UseAVX == 0) { + return false; + } + if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ + return false; + } + } + break; + case Op_AddReductionVI: + if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) { + return false; + } + // fallthrough + case Op_AndReductionV: + case Op_OrReductionV: + case Op_XorReductionV: + if (is_subword_type(bt) && (UseSSE < 4)) { + return false; + } +#ifndef _LP64 + if (bt == T_BYTE || bt == T_LONG) { + return false; + } +#endif + break; +#ifndef _LP64 + case Op_VectorInsert: + if (bt == T_LONG || bt == T_DOUBLE) { + return false; + } + break; +#endif + case Op_MinReductionV: + case Op_MaxReductionV: + if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) { + return false; + } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) { + return false; + } + // Float/Double intrinsics enabled for AVX family. 
+ if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) { + return false; + } + if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { + return false; + } +#ifndef _LP64 + if (bt == T_BYTE || bt == T_LONG) { + return false; + } +#endif + break; + case Op_VectorTest: + if (UseSSE < 4) { + return false; // Implementation limitation + } else if (size_in_bits < 128) { + return false; // Implementation limitation + } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) { + return false; // Implementation limitation + } + break; + case Op_VectorLoadShuffle: + case Op_VectorRearrange: + if(vlen == 2) { + return false; // Implementation limitation due to how shuffle is loaded + } else if (size_in_bits == 256 && UseAVX < 2) { + return false; // Implementation limitation + } else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) { + return false; // Implementation limitation + } else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) { + return false; // Implementation limitation + } + break; + case Op_VectorLoadMask: + if (size_in_bits == 256 && UseAVX < 2) { + return false; // Implementation limitation + } + // fallthrough + case Op_VectorStoreMask: + if (vlen == 2) { + return false; // Implementation limitation + } + break; + case Op_VectorCastB2X: + if (size_in_bits == 256 && UseAVX < 2) { + return false; // Implementation limitation + } + break; + case Op_VectorCastS2X: + if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { + return false; + } + break; + case Op_VectorCastI2X: + if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { + return false; + } + break; + case Op_VectorCastL2X: + if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { + return false; + } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) { + return false; + } + break; + case Op_VectorCastF2X: + case Op_VectorCastD2X: + if (is_integral_type(bt)) { 
+ // Casts from FP to integral types require special fixup logic not easily + // implementable with vectors. + return false; // Implementation limitation + } // fallthrough (no break: continues into the Op_MulReductionVI check) + case Op_MulReductionVI: + if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) { + return false; + } + break; + case Op_StoreVectorScatter: + if(bt == T_BYTE || bt == T_SHORT) { + return false; + } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) { + return false; + } + // fallthrough + case Op_LoadVectorGather: + if (size_in_bits == 64 ) { + return false; + } + break; } return true; // Per default match rules are supported. } @@ -1668,40 +1926,28 @@ void Compile::reshape_address(AddPNode* addp) { } -static inline uint vector_length(const MachNode* n) { - const TypeVect* vt = n->bottom_type()->is_vect(); - return vt->length(); -} - -static inline uint vector_length(const MachNode* use, MachOper* opnd) { - uint def_idx = use->operand_index(opnd); - Node* def = use->in(def_idx); - return def->bottom_type()->is_vect()->length(); -} - -static inline uint vector_length_in_bytes(const MachNode* n) { - const TypeVect* vt = n->bottom_type()->is_vect(); - return vt->length_in_bytes(); -} - -static inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) { - uint def_idx = use->operand_index(opnd); - Node* def = use->in(def_idx); - return def->bottom_type()->is_vect()->length_in_bytes(); +static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) { + switch (bt) { + case BoolTest::eq: return Assembler::eq; + case BoolTest::ne: return Assembler::neq; + case BoolTest::le: return Assembler::le; + case BoolTest::ge: return Assembler::nlt; + case BoolTest::lt: return Assembler::lt; + case BoolTest::gt: return Assembler::nle; + default : ShouldNotReachHere(); return Assembler::_false; + } } -static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* n) { - switch(vector_length_in_bytes(n)) { - case 4: // fall-through - case 
8: // fall-through - case 16: return Assembler::AVX_128bit; - case 32: return Assembler::AVX_256bit; - case 64: return Assembler::AVX_512bit; - - default: { - ShouldNotReachHere(); - return Assembler::AVX_NoVec; - } +static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) { + switch (bt) { + case BoolTest::eq: return Assembler::EQ_OQ; // ordered non-signaling + // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare. + case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling + case BoolTest::le: return Assembler::LE_OQ; // ordered non-signaling + case BoolTest::ge: return Assembler::GE_OQ; // ordered non-signaling + case BoolTest::lt: return Assembler::LT_OQ; // ordered non-signaling + case BoolTest::gt: return Assembler::GT_OQ; // ordered non-signaling + default: ShouldNotReachHere(); return Assembler::FALSE_OS; } } @@ -2028,6 +2274,13 @@ %} +// Operands for bound floating pointer register arguments +operand rxmm0() %{ + constraint(ALLOC_IN_RC(xmm0_reg)); + match(VecX); + format%{%} + interface(REG_INTER); +%} //----------OPERANDS----------------------------------------------------------- // Operand definitions must precede instruction definitions for correct parsing @@ -2792,9 +3045,9 @@ ins_cost(150); format %{ "vandps $dst, $src, [0x7fffffff]\t# abs float by sign masking" %} ins_encode %{ - int vector_len = 0; + int vlen_enc = Assembler::AVX_128bit; __ vandps($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(float_signmask()), vector_len); + ExternalAddress(float_signmask()), vlen_enc); %} ins_pipe(pipe_slow); %} @@ -2818,9 +3071,9 @@ format %{ "vandpd $dst, $src, [0x7fffffffffffffff]\t" "# abs double by sign masking" %} ins_encode %{ - int vector_len = 0; + int vlen_enc = Assembler::AVX_128bit; __ vandpd($dst$$XMMRegister, $src$$XMMRegister, - ExternalAddress(double_signmask()), vector_len); + ExternalAddress(double_signmask()), vlen_enc); %} ins_pipe(pipe_slow); %} @@ -2944,6 +3197,93 @@ 
ins_pipe(pipe_slow); %} +// ---------------------------------------- VectorReinterpret ------------------------------------ + +instruct reinterpret(vec dst) %{ + predicate(vector_length_in_bytes(n) == vector_length_in_bytes(n->in(1))); // dst == src + match(Set dst (VectorReinterpret dst)); + ins_cost(125); + format %{ "vector_reinterpret $dst\t!" %} + ins_encode %{ + // empty + %} + ins_pipe( pipe_slow ); +%} + +instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{ + predicate(UseAVX == 0 && + (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst + match(Set dst (VectorReinterpret src)); + ins_cost(125); + effect(TEMP dst, TEMP scratch); + format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %} + ins_encode %{ + assert(vector_length_in_bytes(this) <= 16, "required"); + assert(vector_length_in_bytes(this, $src) <= 8, "required"); + + int src_vlen_in_bytes = vector_length_in_bytes(this, $src); + if (src_vlen_in_bytes == 4) { + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register); + } else { + assert(src_vlen_in_bytes == 8, ""); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register); + } + __ pand($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{ + predicate(UseAVX > 0 && + (vector_length_in_bytes(n->in(1)) == 4) && // src + (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst + match(Set dst (VectorReinterpret src)); + ins_cost(125); + effect(TEMP scratch); + format %{ "vector_reinterpret_expand $dst,$src\t! 
using $scratch as TEMP" %} + ins_encode %{ + __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), Assembler::AVX_128bit, $scratch$$Register); // AND with vector_32_bit_mask keeps only the low 32 bits of $src + %} + ins_pipe( pipe_slow ); +%} + + +instruct vreinterpret_expand(legVec dst, vec src) %{ + predicate(UseAVX > 0 && + (vector_length_in_bytes(n->in(1)) > 4) && // src + (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst + match(Set dst (VectorReinterpret src)); + ins_cost(125); + format %{ "vector_reinterpret_expand $dst,$src\t!" %} + ins_encode %{ + switch (vector_length_in_bytes(this, $src)) { + case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break; + case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break; + case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break; + default: ShouldNotReachHere(); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct reinterpret_shrink(vec dst, legVec src) %{ + predicate(vector_length_in_bytes(n->in(1)) > vector_length_in_bytes(n)); // src > dst + match(Set dst (VectorReinterpret src)); + ins_cost(125); + format %{ "vector_reinterpret_shrink $dst,$src\t!" %} + ins_encode %{ + switch (vector_length_in_bytes(this)) { + case 4: __ movflt ($dst$$XMMRegister, $src$$XMMRegister); break; + case 8: __ movq ($dst$$XMMRegister, $src$$XMMRegister); break; + case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break; + case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break; + default: ShouldNotReachHere(); + } + %} + ins_pipe( pipe_slow ); +%} + +// ---------------------------------------------------------------------------------------------------- #ifdef _LP64 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{ @@ -2981,19 +3321,19 @@ %} instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{ - predicate(n->as_Vector()->length() < 8); + predicate(vector_length(n) < 8); match(Set dst (RoundDoubleModeV src rmode)); format %{ "vroundpd $dst,$src,$rmode\t! 
round packedD" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{ - predicate(n->as_Vector()->length() == 8); + predicate(vector_length(n) == 8); match(Set dst (RoundDoubleModeV src rmode)); format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %} ins_encode %{ @@ -3004,19 +3344,19 @@ %} instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{ - predicate(n->as_Vector()->length() < 8); + predicate(vector_length(n) < 8); match(Set dst (RoundDoubleModeV (LoadVector mem) rmode)); format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{ - predicate(n->as_Vector()->length() == 8); + predicate(vector_length(n) == 8); match(Set dst (RoundDoubleModeV (LoadVector mem) rmode)); format %{ "vrndscalepd $dst,$mem,$rmode\t! 
round packed8D" %} ins_encode %{ @@ -3088,7 +3428,7 @@ // ============================================================================ -// Load vectors +// Load vectors generic operand pattern instruct loadV(vec dst, memory mem) %{ match(Set dst (LoadVector mem)); ins_cost(125); @@ -3124,6 +3464,81 @@ ins_pipe( pipe_slow ); %} +// ---------------------------------------- Gather ------------------------------------ + +// Gather INT, LONG, FLOAT, DOUBLE + +instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{ + predicate(vector_length_in_bytes(n) <= 32); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP dst, TEMP tmp, TEMP mask); + format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %} + ins_encode %{ + assert(UseAVX >= 2, "sanity"); + + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = vector_element_basic_type(this); + + assert(vector_length_in_bytes(this) >= 16, "sanity"); + assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE + + if (vlen_enc == Assembler::AVX_128bit) { + __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set())); + } else { + __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set())); + } + __ lea($tmp$$Register, $mem$$Address); + __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{ + predicate(vector_length_in_bytes(n) == 64); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP dst, TEMP tmp); + format %{ "load_vector_gather $dst, $mem, $idx\t! 
using $tmp and k2 as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "sanity"); + + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = vector_element_basic_type(this); + + assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE + + KRegister ktmp = k2; + __ kmovwl(k2, ExternalAddress(vector_all_bits_set()), $tmp$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ evgather(elem_bt, $dst$$XMMRegister, ktmp, $tmp$$Register, $idx$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// ====================Scatter======================================= + +// Scatter INT, LONG, FLOAT, DOUBLE + +instruct scatter(memory mem, vec src, vec idx, rRegP tmp) %{ + match(Set mem (StoreVectorScatter mem (Binary src idx))); + effect(TEMP tmp); + format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "sanity"); + + int vlen_enc = vector_length_encoding(this, $src); + BasicType elem_bt = vector_element_basic_type(this, $src); + + assert(vector_length_in_bytes(this, $src) >= 16, "sanity"); + assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE + + KRegister ktmp = k2; + __ kmovwl(k2, ExternalAddress(vector_all_bits_set()), $tmp$$Register); + __ lea($tmp$$Register, $mem$$Address); + __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, ktmp, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + // ====================REPLICATE======================================= // Replicate byte scalar to be vector @@ -3157,8 +3572,8 @@ match(Set dst (ReplicateB (LoadB mem))); format %{ "replicateB $dst,$mem" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3188,7 +3603,7 @@ %} // Replicate byte scalar zero to be vector 
-instruct ReplB_zero(vec dst, immI0 zero) %{ +instruct ReplB_zero(vec dst, immI_0 zero) %{ match(Set dst (ReplicateB zero)); format %{ "replicateB $dst,$zero" %} ins_encode %{ @@ -3265,7 +3680,7 @@ ins_pipe( fpu_reg_reg ); %} -instruct ReplS_zero(vec dst, immI0 zero) %{ +instruct ReplS_zero(vec dst, immI_0 zero) %{ match(Set dst (ReplicateS zero)); format %{ "replicateS $dst,$zero" %} ins_encode %{ @@ -3312,8 +3727,8 @@ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); } else { assert(VM_Version::supports_avx2(), "sanity"); - int vector_len = vector_length_encoding(this); - __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -3332,16 +3747,16 @@ } } else { assert(VM_Version::supports_avx2(), "sanity"); - int vector_len = vector_length_encoding(this); + int vlen_enc = vector_length_encoding(this); __ movq($dst$$XMMRegister, const_addr); - __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } %} ins_pipe( pipe_slow ); %} // Replicate integer (4 byte) scalar zero to be vector -instruct ReplI_zero(vec dst, immI0 zero) %{ +instruct ReplI_zero(vec dst, immI_0 zero) %{ match(Set dst (ReplicateI zero)); format %{ "replicateI $dst,$zero" %} ins_encode %{ @@ -3383,7 +3798,7 @@ #else // _LP64 // Replicate long (8 byte) scalar to be vector instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{ - predicate(n->as_Vector()->length() <= 4); + predicate(vector_length(n) <= 4); match(Set dst (ReplicateL src)); effect(TEMP dst, USE src, TEMP tmp); format %{ "replicateL $dst,$src" %} @@ -3395,11 +3810,11 @@ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands - int vector_len = Assembler::AVX_256bit; + int vlen_enc = 
Assembler::AVX_256bit; __ movdl($dst$$XMMRegister, $src$$Register); __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); - __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } else { __ movdl($dst$$XMMRegister, $src$$Register); __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); @@ -3412,7 +3827,7 @@ %} instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{ - predicate(n->as_Vector()->length() == 8); + predicate(vector_length(n) == 8); match(Set dst (ReplicateL src)); effect(TEMP dst, USE src, TEMP tmp); format %{ "replicateL $dst,$src" %} @@ -3425,11 +3840,11 @@ __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister); __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1); } else { - int vector_len = Assembler::AVX_512bit; + int vlen_enc = Assembler::AVX_512bit; __ movdl($dst$$XMMRegister, $src$$Register); __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register)); __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister); - __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -3498,8 +3913,8 @@ if (vlen <= 4) { __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); } else if (VM_Version::supports_avx2()) { - int vector_len = vector_length_encoding(this); - __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2 + int vlen_enc = vector_length_encoding(this); + __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2 } else { assert(vlen == 8, "sanity"); __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00); @@ -3519,8 +3934,8 @@ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00); } else { assert(VM_Version::supports_avx(), "sanity"); - int vector_len = vector_length_encoding(this); - __ 
vbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -3552,8 +3967,8 @@ if (vlen == 2) { __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); } else if (VM_Version::supports_avx2()) { - int vector_len = vector_length_encoding(this); - __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); // reg-to-reg variant requires AVX2 + int vlen_enc = vector_length_encoding(this); + __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2 } else { assert(vlen == 4, "sanity"); __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44); @@ -3573,8 +3988,8 @@ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44); } else { assert(VM_Version::supports_avx(), "sanity"); - int vector_len = vector_length_encoding(this); - __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -3595,19 +4010,243 @@ ins_pipe( fpu_reg_reg ); %} -// ====================REDUCTION ARITHMETIC======================================= -// =======================Int Reduction========================================== +// ====================VECTOR INSERT======================================= -instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT && - n->in(2)->bottom_type()->is_vect()->length() < 16); - match(Set dst (AddReductionVI src1 src2)); - match(Set dst (MulReductionVI src1 src2)); - match(Set dst (AndReductionV src1 src2)); - match(Set dst ( OrReductionV src1 src2)); - match(Set dst (XorReductionV src1 src2)); - effect(TEMP vtmp1, TEMP vtmp2); - format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} +instruct insert(vec dst, rRegI 
val, immU8 idx) %{ + predicate(vector_length_in_bytes(n) >= 8 && + vector_length_in_bytes(n) <= 16); + match(Set dst (VectorInsert (Binary dst val) idx)); + format %{ "vector_insert $dst,$val,$idx" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + + BasicType elem_bt = vector_element_basic_type(this); + + assert(is_integral_type(elem_bt), ""); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{ + predicate(vector_length_in_bytes(n) == 32); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + int vlen_enc = Assembler::AVX_256bit; + BasicType elem_bt = vector_element_basic_type(this); + int elem_per_lane = 16/type2aelembytes(elem_bt); + int log2epr = log2(elem_per_lane); + + assert(is_integral_type(elem_bt), "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(log2epr); + uint y_idx = ($idx$$constant >> log2epr) & 1; + __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); + __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{ + predicate(vector_length_in_bytes(n) == 64); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "sanity"); + + BasicType elem_bt = vector_element_basic_type(this); + int elem_per_lane = 16/type2aelembytes(elem_bt); + int log2epr = log2(elem_per_lane); + + assert(is_integral_type(elem_bt), ""); + 
assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(log2epr); + uint y_idx = ($idx$$constant >> log2epr) & 3; + __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); + __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} + +#ifdef _LP64 +instruct insert2L(vec dst, rRegL val, immU8 idx) %{ + predicate(vector_length(n) == 2); + match(Set dst (VectorInsert (Binary dst val) idx)); + format %{ "vector_insert $dst,$val,$idx" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + assert(vector_element_basic_type(this) == T_LONG, ""); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{ + predicate(vector_length(n) == 4); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + assert(vector_element_basic_type(this) == T_LONG, ""); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(1); + uint y_idx = ($idx$$constant >> 1) & 1; + int vlen_enc = Assembler::AVX_256bit; + __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); + __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{ + predicate(vector_length(n) == 8); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + 
assert(vector_element_basic_type(this) == T_LONG, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(1); + uint y_idx = ($idx$$constant >> 1) & 3; + __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx); + __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} +#endif + +instruct insertF(vec dst, regF val, immU8 idx) %{ + predicate(vector_length(n) >= 2 && + vector_length(n) <= 4); + match(Set dst (VectorInsert (Binary dst val) idx)); + format %{ "vector_insert $dst,$val,$idx" %} + ins_encode %{ + assert(UseSSE >= 4, "sanity"); + + assert(vector_element_basic_type(this) == T_FLOAT, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{ + predicate(vector_length(n) >= 8); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + assert(vector_element_basic_type(this) == T_FLOAT, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + int vlen = vector_length(this); + uint x_idx = $idx$$constant & right_n_bits(2); + if (vlen == 8) { + uint y_idx = ($idx$$constant >> 2) & 1; + int vlen_enc = Assembler::AVX_256bit; + __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx); + __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + } else { + assert(vlen == 16, "sanity"); + uint y_idx = ($idx$$constant >> 2) & 3; + __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ 
vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx); + __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + } + %} + ins_pipe( pipe_slow ); +%} + +#ifdef _LP64 +instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{ + predicate(vector_length(n) == 2); + match(Set dst (VectorInsert (Binary dst val) idx)); + effect(TEMP tmp); + format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %} + ins_encode %{ + assert(UseSSE >= 4, "sanity"); + assert(vector_element_basic_type(this) == T_DOUBLE, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + __ movq($tmp$$Register, $val$$XMMRegister); + __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{ + predicate(vector_length(n) == 4); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP vtmp, TEMP tmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %} + ins_encode %{ + assert(vector_element_basic_type(this) == T_DOUBLE, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(1); + uint y_idx = ($idx$$constant >> 1) & 1; + int vlen_enc = Assembler::AVX_256bit; + __ movq($tmp$$Register, $val$$XMMRegister); + __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx); + __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} + +instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{ + predicate(vector_length(n) == 8); + match(Set dst (VectorInsert (Binary src val) idx)); + effect(TEMP tmp, TEMP vtmp); + format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %} + ins_encode %{ + assert(vector_element_basic_type(this) == 
T_DOUBLE, "sanity"); + assert($idx$$constant < (int)vector_length(this), "out of bounds"); + + uint x_idx = $idx$$constant & right_n_bits(1); + uint y_idx = ($idx$$constant >> 1) & 3; + __ movq($tmp$$Register, $val$$XMMRegister); + __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx); + __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx); + __ vinserti32x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, y_idx); + %} + ins_pipe( pipe_slow ); +%} +#endif + +// ====================REDUCTION ARITHMETIC======================================= + +// =======================Int Reduction========================================== + +instruct reductionI(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_INT && + vector_length(n->in(2)) < 16); // src2 + match(Set dst (AddReductionVI src1 src2)); + match(Set dst (MulReductionVI src1 src2)); + match(Set dst (AndReductionV src1 src2)); + match(Set dst ( OrReductionV src1 src2)); + match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP vtmp1, TEMP vtmp2); + format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} ins_encode %{ int opcode = this->ideal_Opcode(); int vlen = vector_length(this, $src2); @@ -3617,20 +4256,22 @@ %} instruct reduction16I(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT && - n->in(2)->bottom_type()->is_vect()->length() == 16); + predicate(vector_element_basic_type(n->in(2)) == T_INT && + vector_length(n->in(2)) == 16); // src2 match(Set dst (AddReductionVI src1 src2)); match(Set dst (MulReductionVI src1 src2)); match(Set dst (AndReductionV src1 src2)); match(Set dst ( OrReductionV src1 src2)); match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst 
(MaxReductionV src1 src2)); effect(TEMP vtmp1, TEMP vtmp2); format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} ins_encode %{ int opcode = this->ideal_Opcode(); int vlen = vector_length(this, $src2); __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); - %} +%} ins_pipe( pipe_slow ); %} @@ -3638,13 +4279,15 @@ #ifdef _LP64 instruct reductionL(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG && - n->in(2)->bottom_type()->is_vect()->length() < 8); + predicate(vector_element_basic_type(n->in(2)) == T_LONG && + vector_length(n->in(2)) < 8); // src2 match(Set dst (AddReductionVL src1 src2)); match(Set dst (MulReductionVL src1 src2)); match(Set dst (AndReductionV src1 src2)); match(Set dst ( OrReductionV src1 src2)); match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); effect(TEMP vtmp1, TEMP vtmp2); format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} ins_encode %{ @@ -3656,13 +4299,15 @@ %} instruct reduction8L(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG && - n->in(2)->bottom_type()->is_vect()->length() == 8); + predicate(vector_element_basic_type(n->in(2)) == T_LONG && + vector_length(n->in(2)) == 8); // src2 match(Set dst (AddReductionVL src1 src2)); match(Set dst (MulReductionVL src1 src2)); match(Set dst (AndReductionV src1 src2)); match(Set dst ( OrReductionV src1 src2)); match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); effect(TEMP vtmp1, TEMP vtmp2); format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} ins_encode %{ @@ -3677,11 +4322,11 @@ // 
=======================Float Reduction========================================== instruct reductionF128(regF dst, vec src, vec vtmp) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() <= 4); + predicate(vector_length(n->in(2)) <= 4); // src match(Set dst (AddReductionVF dst src)); match(Set dst (MulReductionVF dst src)); effect(TEMP dst, TEMP vtmp); - format %{ "vector_reduction_fp $dst,$src ; using $vtmp as TEMP" %} + format %{ "vector_reduction_float $dst,$src ; using $vtmp as TEMP" %} ins_encode %{ int opcode = this->ideal_Opcode(); int vlen = vector_length(this, $src); @@ -3691,7 +4336,7 @@ %} instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); + predicate(vector_length(n->in(2)) == 8); // src match(Set dst (AddReductionVF dst src)); match(Set dst (MulReductionVF dst src)); effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); @@ -3705,7 +4350,7 @@ %} instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() == 16); + predicate(vector_length(n->in(2)) == 16); // src match(Set dst (AddReductionVF dst src)); match(Set dst (MulReductionVF dst src)); effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); @@ -3721,7 +4366,7 @@ // =======================Double Reduction========================================== instruct reduction2D(regD dst, vec src, vec vtmp) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() == 2); + predicate(vector_length(n->in(2)) == 2); // src match(Set dst (AddReductionVD dst src)); match(Set dst (MulReductionVD dst src)); effect(TEMP dst, TEMP vtmp); @@ -3730,12 +4375,12 @@ int opcode = this->ideal_Opcode(); int vlen = vector_length(this, $src); __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister); - %} +%} ins_pipe( pipe_slow ); %} instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() == 4); + 
predicate(vector_length(n->in(2)) == 4); // src match(Set dst (AddReductionVD dst src)); match(Set dst (MulReductionVD dst src)); effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); @@ -3749,7 +4394,7 @@ %} instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{ - predicate(n->in(2)->bottom_type()->is_vect()->length() == 8); + predicate(vector_length(n->in(2)) == 8); // src match(Set dst (AddReductionVD dst src)); match(Set dst (MulReductionVD dst src)); effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); @@ -3762,6 +4407,290 @@ ins_pipe( pipe_slow ); %} +// =======================Byte Reduction========================================== + +#ifdef _LP64 +instruct reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_BYTE && + vector_length(n->in(2)) <= 32); // src2 + match(Set dst (AddReductionVI src1 src2)); + match(Set dst (AndReductionV src1 src2)); + match(Set dst ( OrReductionV src1 src2)); + match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP vtmp1, TEMP vtmp2); + format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_BYTE && + vector_length(n->in(2)) == 64); // src2 + match(Set dst (AddReductionVI src1 src2)); + match(Set dst (AndReductionV src1 src2)); + match(Set dst ( OrReductionV src1 src2)); + match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP vtmp1, TEMP vtmp2); + format %{ 
"vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} +#endif + +// =======================Short Reduction========================================== + +instruct reductionS(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_SHORT && + vector_length(n->in(2)) <= 16); // src2 + match(Set dst (AddReductionVI src1 src2)); + match(Set dst (MulReductionVI src1 src2)); + match(Set dst (AndReductionV src1 src2)); + match(Set dst ( OrReductionV src1 src2)); + match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP vtmp1, TEMP vtmp2); + format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct reduction32S(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_SHORT && + vector_length(n->in(2)) == 32); // src2 + match(Set dst (AddReductionVI src1 src2)); + match(Set dst (MulReductionVI src1 src2)); + match(Set dst (AndReductionV src1 src2)); + match(Set dst ( OrReductionV src1 src2)); + match(Set dst (XorReductionV src1 src2)); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP vtmp1, TEMP vtmp2); + format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = 
vector_length(this, $src2); + __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +// =======================Mul Reduction========================================== + +instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_BYTE && + vector_length(n->in(2)) <= 32); // src2 + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); + format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{ + predicate(vector_element_basic_type(n->in(2)) == T_BYTE && + vector_length(n->in(2)) == 64); // src2 + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2); + format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +//--------------------Min/Max Float Reduction -------------------- +// Float Min Reduction +instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp, + legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_FLOAT && + ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) || + (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) && + 
vector_length(n->in(2)) == 2); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr); + format %{ "vector_minmax2F_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, + $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp, + legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_FLOAT && + ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) || + (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) && + vector_length(n->in(2)) >= 4); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr); + format %{ "vector_minmaxF_reduction $dst,$src1,$src2 ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, + $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp, + legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_FLOAT && + vector_length(n->in(2)) == 2); + match(Set dst (MinReductionV dst src)); + match(Set dst (MaxReductionV dst src)); + effect(TEMP dst, 
TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr); + format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src); + __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister, + $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + + +instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp, + legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_FLOAT && + vector_length(n->in(2)) >= 4); + match(Set dst (MinReductionV dst src)); + match(Set dst (MaxReductionV dst src)); + effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr); + format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src); + __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister, + $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + + +//--------------------Min Double Reduction -------------------- +instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2, + legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs + rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE && + ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) || + (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) && + vector_length(n->in(2)) == 2); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); + format %{ 
"vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct minmax_reductionD(legRegD dst, immD src1, legVec src2, + legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs + rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE && + ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) || + (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) && + vector_length(n->in(2)) >= 4); + match(Set dst (MinReductionV src1 src2)); + match(Set dst (MaxReductionV src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr); + format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src2); + __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + + +instruct minmax_reduction2D_av(legRegD dst, legVec src, + legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs + rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE && + vector_length(n->in(2)) == 2); + match(Set dst (MinReductionV dst src)); + match(Set dst (MaxReductionV dst src)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); + format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %} + ins_encode %{ + 
assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src); + __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct minmax_reductionD_av(legRegD dst, legVec src, + legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs + rFlagsReg cr) %{ + predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE && + vector_length(n->in(2)) >= 4); + match(Set dst (MinReductionV dst src)); + match(Set dst (MaxReductionV dst src)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr); + format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "sanity"); + + int opcode = this->ideal_Opcode(); + int vlen = vector_length(this, $src); + __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // ====================VECTOR ARITHMETIC======================================= // --------------------------------- ADD -------------------------------------- @@ -3782,8 +4711,8 @@ match(Set dst (AddVB src1 src2)); format %{ "vpaddb $dst,$src1,$src2\t! add packedB" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3793,8 +4722,8 @@ match(Set dst (AddVB src (LoadVector mem))); format %{ "vpaddb $dst,$src,$mem\t! 
add packedB" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3815,8 +4744,8 @@ match(Set dst (AddVS src1 src2)); format %{ "vpaddw $dst,$src1,$src2\t! add packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3826,8 +4755,8 @@ match(Set dst (AddVS src (LoadVector mem))); format %{ "vpaddw $dst,$src,$mem\t! add packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3848,8 +4777,8 @@ match(Set dst (AddVI src1 src2)); format %{ "vpaddd $dst,$src1,$src2\t! add packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3860,8 +4789,8 @@ match(Set dst (AddVI src (LoadVector mem))); format %{ "vpaddd $dst,$src,$mem\t! 
add packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3882,8 +4811,8 @@ match(Set dst (AddVL src1 src2)); format %{ "vpaddq $dst,$src1,$src2\t! add packedL" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3893,8 +4822,8 @@ match(Set dst (AddVL src (LoadVector mem))); format %{ "vpaddq $dst,$src,$mem\t! add packedL" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3915,8 +4844,8 @@ match(Set dst (AddVF src1 src2)); format %{ "vaddps $dst,$src1,$src2\t! add packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3926,8 +4855,8 @@ match(Set dst (AddVF src (LoadVector mem))); format %{ "vaddps $dst,$src,$mem\t! 
add packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3948,8 +4877,8 @@ match(Set dst (AddVD src1 src2)); format %{ "vaddpd $dst,$src1,$src2\t! add packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3959,8 +4888,8 @@ match(Set dst (AddVD src (LoadVector mem))); format %{ "vaddpd $dst,$src,$mem\t! add packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3983,8 +4912,8 @@ match(Set dst (SubVB src1 src2)); format %{ "vpsubb $dst,$src1,$src2\t! sub packedB" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3994,8 +4923,8 @@ match(Set dst (SubVB src (LoadVector mem))); format %{ "vpsubb $dst,$src,$mem\t! 
sub packedB" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4017,8 +4946,8 @@ match(Set dst (SubVS src1 src2)); format %{ "vpsubw $dst,$src1,$src2\t! sub packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4028,8 +4957,8 @@ match(Set dst (SubVS src (LoadVector mem))); format %{ "vpsubw $dst,$src,$mem\t! sub packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4050,8 +4979,8 @@ match(Set dst (SubVI src1 src2)); format %{ "vpsubd $dst,$src1,$src2\t! sub packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4061,8 +4990,8 @@ match(Set dst (SubVI src (LoadVector mem))); format %{ "vpsubd $dst,$src,$mem\t! 
sub packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4083,8 +5012,8 @@ match(Set dst (SubVL src1 src2)); format %{ "vpsubq $dst,$src1,$src2\t! sub packedL" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4095,8 +5024,8 @@ match(Set dst (SubVL src (LoadVector mem))); format %{ "vpsubq $dst,$src,$mem\t! sub packedL" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4117,8 +5046,8 @@ match(Set dst (SubVF src1 src2)); format %{ "vsubps $dst,$src1,$src2\t! sub packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4128,8 +5057,8 @@ match(Set dst (SubVF src (LoadVector mem))); format %{ "vsubps $dst,$src,$mem\t! 
sub packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4150,8 +5079,8 @@ match(Set dst (SubVD src1 src2)); format %{ "vsubpd $dst,$src1,$src2\t! sub packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4161,8 +5090,8 @@ match(Set dst (SubVD src (LoadVector mem))); format %{ "vsubpd $dst,$src,$mem\t! sub packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4171,8 +5100,8 @@ // Byte vector mul instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 4 || - n->as_Vector()->length() == 8); + predicate(vector_length(n) == 4 || + vector_length(n) == 8); match(Set dst (MulVB src1 src2)); effect(TEMP dst, TEMP tmp, TEMP scratch); format %{"vector_mulB $dst,$src1,$src2" %} @@ -4189,7 +5118,7 @@ %} instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 16 && UseAVX <= 1); + predicate(vector_length(n) == 16 && UseAVX <= 1); match(Set dst (MulVB src1 src2)); effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); format %{"vector_mulB $dst,$src1,$src2" %} @@ -4212,17 +5141,17 @@ %} instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{ - 
predicate(n->as_Vector()->length() == 16 && UseAVX > 1); + predicate(vector_length(n) == 16 && UseAVX > 1); match(Set dst (MulVB src1 src2)); effect(TEMP dst, TEMP tmp, TEMP scratch); format %{"vector_mulB $dst,$src1,$src2" %} ins_encode %{ - int vector_len = Assembler::AVX_256bit; - __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len); - __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); - __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len); + int vlen_enc = Assembler::AVX_256bit; + __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc); __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); - __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister); __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0); %} @@ -4230,54 +5159,54 @@ %} instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 32); + predicate(vector_length(n) == 32); match(Set dst (MulVB src1 src2)); effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); format %{"vector_mulB $dst,$src1,$src2" %} ins_encode %{ assert(UseAVX > 1, "required"); - int vector_len = Assembler::AVX_256bit; + int vlen_enc = Assembler::AVX_256bit; __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister); __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister); - __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); - __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpmovsxbw($tmp2$$XMMRegister, 
$src1$$XMMRegister, vector_len); - __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); - __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); - __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len); - __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len); - __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 64); + predicate(vector_length(n) == 64); match(Set dst (MulVB src1 src2)); effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); format %{"vector_mulB $dst,$src1,$src2\n\t" %} ins_encode %{ assert(UseAVX > 2, "required"); - int vector_len = Assembler::AVX_512bit; + int vlen_enc = Assembler::AVX_512bit; __ vextracti64x4_high($tmp1$$XMMRegister, 
$src1$$XMMRegister); __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister); - __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); - __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len); - __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); - __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); - __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len); - __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register); - __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc); + __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register); + __ 
vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4298,8 +5227,8 @@ match(Set dst (MulVS src1 src2)); format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4309,8 +5238,8 @@ match(Set dst (MulVS src (LoadVector mem))); format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4332,8 +5261,8 @@ match(Set dst (MulVI src1 src2)); format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4343,31 +5272,84 @@ match(Set dst (MulVI src (LoadVector mem))); format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} // Longs vector mul instruct vmulL_reg(vec dst, vec src1, vec src2) %{ + predicate(VM_Version::supports_avx512dq()); match(Set dst (MulVL src1 src2)); format %{ "vpmullq $dst,$src1,$src2\t! 
mul packedL" %} ins_encode %{ assert(UseAVX > 2, "required"); - int vector_len = vector_length_encoding(this); - __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vmulL_mem(vec dst, vec src, memory mem) %{ + predicate(VM_Version::supports_avx512dq()); match(Set dst (MulVL src (LoadVector mem))); format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %} ins_encode %{ assert(UseAVX > 2, "required"); - int vector_len = vector_length_encoding(this); - __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul2L_reg(vec dst, vec src2, vec tmp) %{ + predicate(vector_length(n) == 2 && !VM_Version::supports_avx512dq()); + match(Set dst (MulVL dst src2)); + effect(TEMP dst, TEMP tmp); + format %{ "pshufd $tmp,$src2, 177\n\t" + "pmulld $tmp,$dst\n\t" + "phaddd $tmp,$tmp\n\t" + "pmovzxdq $tmp,$tmp\n\t" + "psllq $tmp, 32\n\t" + "pmuludq $dst,$src2\n\t" + "paddq $dst,$tmp\n\t! 
mul packed2L" %} + + ins_encode %{ + assert(VM_Version::supports_sse4_1(), "required"); + int vlen_enc = Assembler::AVX_128bit; + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177); + __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister); + __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister); + __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister); + __ psllq($tmp$$XMMRegister, 32); + __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister); + __ paddq($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, vec tmp, vec tmp1) %{ + predicate(vector_length(n) == 4 && !VM_Version::supports_avx512dq()); + match(Set dst (MulVL src1 src2)); + effect(TEMP tmp1, TEMP tmp); + format %{ "vpshufd $tmp,$src2\n\t" + "vpmulld $tmp,$src1,$tmp\n\t" + "vphaddd $tmp,$tmp,$tmp\n\t" + "vpmovzxdq $tmp,$tmp\n\t" + "vpsllq $tmp,$tmp\n\t" + "vpmuludq $tmp1,$src1,$src2\n\t" + "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %} + ins_encode %{ + int vlen_enc = Assembler::AVX_256bit; + __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc); + __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc); + __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister); + __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); + __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc); + __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc); + __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4388,8 +5370,8 @@ match(Set dst (MulVF src1 src2)); format %{ "vmulps $dst,$src1,$src2\t! 
mul packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4399,8 +5381,8 @@ match(Set dst (MulVF src (LoadVector mem))); format %{ "vmulps $dst,$src,$mem\t! mul packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4421,8 +5403,8 @@ match(Set dst (MulVD src1 src2)); format %{ "vmulpd $dst,$src1,$src2\t! mul packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4432,40 +5414,44 @@ match(Set dst (MulVD src (LoadVector mem))); format %{ "vmulpd $dst,$src,$mem\t! mul packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + predicate(vector_length(n) == 8); match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2))); effect(TEMP dst, USE src1, USE src2); format %{ "cmpps.$copnd $dst, $src1, $src2 ! vcmovevf, cond=$cop\n\t" "blendvps $dst,$src1,$src2,$dst ! 
vcmovevf\n\t" %} ins_encode %{ - int vector_len = 1; + assert(UseAVX > 0, "required"); + + int vlen_enc = Assembler::AVX_256bit; int cond = (Assembler::Condition)($copnd$$cmpcode); - __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len); - __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc); + __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + predicate(vector_length(n) == 4); match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2))); effect(TEMP dst, USE src1, USE src2); format %{ "cmppd.$copnd $dst, $src1, $src2 ! vcmovevd, cond=$cop\n\t" - "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t" + "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t" %} ins_encode %{ - int vector_len = 1; + assert(UseAVX > 0, "required"); + + int vlen_enc = Assembler::AVX_256bit; int cond = (Assembler::Condition)($copnd$$cmpcode); - __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len); - __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc); + __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4488,8 +5474,8 @@ match(Set dst (DivVF src1 src2)); format %{ "vdivps $dst,$src1,$src2\t! 
div packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4499,8 +5485,8 @@ match(Set dst (DivVF src (LoadVector mem))); format %{ "vdivps $dst,$src,$mem\t! div packedF" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4521,8 +5507,8 @@ match(Set dst (DivVD src1 src2)); format %{ "vdivpd $dst,$src1,$src2\t! div packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4532,8 +5518,145 @@ match(Set dst (DivVD src (LoadVector mem))); format %{ "vdivpd $dst,$src,$mem\t! div packedD" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// ------------------------------ MinMax --------------------------------------- + +// Byte, Short, Int vector Min/Max +instruct minmax_reg_sse(vec dst, vec src) %{ + predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT + UseAVX == 0); + match(Set dst (MinV dst src)); + match(Set dst (MaxV dst src)); + format %{ "vector_minmax $dst,$src\t! 
" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + + int opcode = this->ideal_Opcode(); + BasicType elem_bt = vector_element_basic_type(this); + __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vminmax_reg(vec dst, vec src1, vec src2) %{ + predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT + UseAVX > 0); + match(Set dst (MinV src1 src2)); + match(Set dst (MaxV src1 src2)); + format %{ "vector_minmax $dst,$src1,$src2\t! " %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = vector_element_basic_type(this); + + __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// Long vector Min/Max +instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{ + predicate(vector_length_in_bytes(n) == 16 && vector_element_basic_type(n) == T_LONG && + UseAVX == 0); + match(Set dst (MinV dst src)); + match(Set dst (MaxV src dst)); + effect(TEMP dst, TEMP tmp); + format %{ "vector_minmaxL $dst,$src\t!using $tmp as TEMP" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + + int opcode = this->ideal_Opcode(); + BasicType elem_bt = vector_element_basic_type(this); + assert(elem_bt == T_LONG, "sanity"); + + __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{ + predicate(vector_length_in_bytes(n) <= 32 && vector_element_basic_type(n) == T_LONG && + UseAVX > 0 && !VM_Version::supports_avx512vl()); + match(Set dst (MinV src1 src2)); + match(Set dst (MaxV src1 src2)); + effect(TEMP dst); + format %{ "vector_minmaxL $dst,$src1,$src2\t! 
" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int opcode = this->ideal_Opcode(); + BasicType elem_bt = vector_element_basic_type(this); + assert(elem_bt == T_LONG, "sanity"); + + __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{ + predicate((vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) && + vector_element_basic_type(n) == T_LONG); + match(Set dst (MinV src1 src2)); + match(Set dst (MaxV src1 src2)); + format %{ "vector_minmaxL $dst,$src1,src2\t! " %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int vlen_enc = vector_length_encoding(this); + int opcode = this->ideal_Opcode(); + BasicType elem_bt = vector_element_basic_type(this); + assert(elem_bt == T_LONG, "sanity"); + + __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// Float/Double vector Min/Max +instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{ + predicate(vector_length_in_bytes(n) <= 32 && + is_floating_point_type(vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE + UseAVX > 0); + match(Set dst (MinV a b)); + match(Set dst (MaxV a b)); + effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); + format %{ "vector_minmaxFP $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = vector_element_basic_type(this); + + __ vminmax_fp(opcode, elem_bt, + $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, + $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp) %{ + predicate(vector_length_in_bytes(n) == 64 && + 
is_floating_point_type(vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE + match(Set dst (MinV a b)); + match(Set dst (MaxV a b)); + effect(USE a, USE b, TEMP atmp, TEMP btmp); + format %{ "vector_minmaxFP $dst,$a,$b\t!using $atmp, $btmp as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + BasicType elem_bt = vector_element_basic_type(this); + + KRegister ktmp = k1; + __ evminmax_fp(opcode, elem_bt, + $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, + ktmp, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4545,8 +5668,8 @@ format %{ "vsqrtps $dst,$src\t! sqrt packedF" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4556,8 +5679,8 @@ format %{ "vsqrtps $dst,$mem\t! sqrt packedF" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4568,8 +5691,8 @@ format %{ "vsqrtpd $dst,$src\t! sqrt packedD" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4579,8 +5702,8 @@ format %{ "vsqrtpd $dst,$mem\t! 
sqrt packedD" %} ins_encode %{ assert(UseAVX > 0, "required"); - int vector_len = vector_length_encoding(this); - __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4601,16 +5724,17 @@ // Byte vector shift instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{ - predicate(n->as_Vector()->length() <= 8); - match(Set dst (LShiftVB src shift)); - match(Set dst (RShiftVB src shift)); + predicate(vector_length(n) <= 8); + match(Set dst ( LShiftVB src shift)); + match(Set dst ( RShiftVB src shift)); match(Set dst (URShiftVB src shift)); effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch); format %{"vector_byte_shift $dst,$src,$shift" %} ins_encode %{ assert(UseSSE > 3, "required"); int opcode = this->ideal_Opcode(); - __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister); + bool sign = (opcode == Op_URShiftVB) ? false : true; + __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister); __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister); __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); __ pand($dst$$XMMRegister, $tmp$$XMMRegister); @@ -4620,20 +5744,20 @@ %} instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 16 && UseAVX <= 1); - match(Set dst (LShiftVB src shift)); - match(Set dst (RShiftVB src shift)); + predicate(vector_length(n) == 16 && UseAVX <= 1); + match(Set dst ( LShiftVB src shift)); + match(Set dst ( RShiftVB src shift)); match(Set dst (URShiftVB src shift)); effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch); format %{"vector_byte_shift $dst,$src,$shift" %} ins_encode %{ assert(UseSSE > 3, "required"); int opcode = this->ideal_Opcode(); - - __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister); + bool sign = (opcode == 
Op_URShiftVB) ? false : true; + __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister); __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister); __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE); - __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister); __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister); __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); __ pand($tmp2$$XMMRegister, $dst$$XMMRegister); @@ -4644,18 +5768,19 @@ %} instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 16 && UseAVX > 1); - match(Set dst (LShiftVB src shift)); - match(Set dst (RShiftVB src shift)); + predicate(vector_length(n) == 16 && UseAVX > 1); + match(Set dst ( LShiftVB src shift)); + match(Set dst ( RShiftVB src shift)); match(Set dst (URShiftVB src shift)); effect(TEMP dst, TEMP tmp, TEMP scratch); format %{"vector_byte_shift $dst,$src,$shift" %} ins_encode %{ int opcode = this->ideal_Opcode(); - int vector_len = Assembler::AVX_256bit; - __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len); - __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); - __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); + bool sign = (opcode == Op_URShiftVB) ? 
false : true; + int vlen_enc = Assembler::AVX_256bit; + __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register); __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister); __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0); %} @@ -4663,52 +5788,54 @@ %} instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 32); - match(Set dst (LShiftVB src shift)); - match(Set dst (RShiftVB src shift)); + predicate(vector_length(n) == 32); + match(Set dst ( LShiftVB src shift)); + match(Set dst ( RShiftVB src shift)); match(Set dst (URShiftVB src shift)); effect(TEMP dst, TEMP tmp, TEMP scratch); format %{"vector_byte_shift $dst,$src,$shift" %} ins_encode %{ assert(UseAVX > 1, "required"); int opcode = this->ideal_Opcode(); - int vector_len = Assembler::AVX_256bit; + bool sign = (opcode == Op_URShiftVB) ? 
false : true; + int vlen_enc = Assembler::AVX_256bit; __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister); - __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len); - __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len); - __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); - __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len); - __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); - __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); - __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); - __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len); + __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc); + __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc); %} ins_pipe( pipe_slow ); %} instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 64); - match(Set dst (LShiftVB src shift)); - match(Set dst (RShiftVB src shift)); + predicate(vector_length(n) == 64); + match(Set dst ( LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); match(Set dst (URShiftVB src shift)); effect(TEMP dst, TEMP 
tmp1, TEMP tmp2, TEMP scratch); format %{"vector_byte_shift $dst,$src,$shift" %} ins_encode %{ assert(UseAVX > 2, "required"); int opcode = this->ideal_Opcode(); - int vector_len = Assembler::AVX_512bit; + bool sign = (opcode == Op_URShiftVB) ? false : true; + int vlen_enc = Assembler::AVX_512bit; __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1); - __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); - __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len); - __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len); - __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc); + __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc); __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); - __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); - __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len); - __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register); - __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc); + __ evmovdquq($tmp2$$XMMRegister, 
ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register); + __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4719,8 +5846,8 @@ // unsigned values. // Shorts/Chars vector left shift instruct vshiftS(vec dst, vec src, vec shift) %{ - match(Set dst (LShiftVS src shift)); - match(Set dst (RShiftVS src shift)); + match(Set dst ( LShiftVS src shift)); + match(Set dst ( RShiftVS src shift)); match(Set dst (URShiftVS src shift)); effect(TEMP dst, USE src, USE shift); format %{ "vshiftw $dst,$src,$shift\t! shift packedS" %} @@ -4749,16 +5876,16 @@ // Integers vector left shift instruct vshiftI(vec dst, vec src, vec shift) %{ - match(Set dst (LShiftVI src shift)); - match(Set dst (RShiftVI src shift)); + match(Set dst ( LShiftVI src shift)); + match(Set dst ( RShiftVI src shift)); match(Set dst (URShiftVI src shift)); effect(TEMP dst, USE src, USE shift); format %{ "vshiftd $dst,$src,$shift\t! shift packedI" %} ins_encode %{ int opcode = this->ideal_Opcode(); if (UseAVX > 0) { - int vector_len = vector_length_encoding(this); - __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); } else { int vlen = vector_length(this); if (vlen == 2) { @@ -4776,15 +5903,15 @@ // Longs vector shift instruct vshiftL(vec dst, vec src, vec shift) %{ - match(Set dst (LShiftVL src shift)); + match(Set dst ( LShiftVL src shift)); match(Set dst (URShiftVL src shift)); effect(TEMP dst, USE src, USE shift); format %{ "vshiftq $dst,$src,$shift\t! 
shift packedL" %} ins_encode %{ int opcode = this->ideal_Opcode(); if (UseAVX > 0) { - int vector_len = vector_length_encoding(this); - __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); } else { assert(vector_length(this) == 2, ""); __ movdqu($dst$$XMMRegister, $src$$XMMRegister); @@ -4814,12 +5941,12 @@ } else { assert(vlen == 4, "sanity"); assert(UseAVX > 1, "required"); - int vector_len = Assembler::AVX_256bit; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + int vlen_enc = Assembler::AVX_256bit; + __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); - __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); - __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); - __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); + __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc); } %} ins_pipe( pipe_slow ); @@ -4830,8 +5957,273 @@ match(Set dst (RShiftVL src shift)); format %{ "vshiftq $dst,$src,$shift" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// ------------------- Variable Shift ----------------------------- +// Byte variable shift +instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP 
scratch) %{ + predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_BYTE && + !VM_Version::supports_avx512bw()); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = Assembler::AVX_128bit; + __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{ + predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_BYTE && + !VM_Version::supports_avx512bw()); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch); + format %{ "vector_varshift_byte $dst, $src, $shift\n\t! 
using $vtmp1, $vtmp2 and $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = Assembler::AVX_128bit; + // Shift lower half and get word result in dst + __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register); + + // Shift upper half and get word result in vtmp1 + __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0); + __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0); + __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register); + + // Merge and down convert the two word results to byte in dst + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{ + predicate(vector_length(n) == 32 && vector_element_basic_type(n) == T_BYTE && + !VM_Version::supports_avx512bw()); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch); + format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = Assembler::AVX_128bit; + // Process lower 128 bits and get result in dst + __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register); + __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0); + __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0); + __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register); + 
__ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0); + + // Process higher 128 bits and get result in vtmp3 + __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister); + __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister); + __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register); + __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0); + __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0); + __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register); + __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0); + + // Merge the two results in dst + __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{ + predicate(vector_length(n) <= 32 && vector_element_basic_type(n) == T_BYTE && + VM_Version::supports_avx512bw()); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_varshift_byte $dst, $src, $shift\n\t! 
using $vtmp, $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{ + predicate(vector_length(n) == 64 && vector_element_basic_type(n) == T_BYTE && + VM_Version::supports_avx512bw()); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch); + format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = Assembler::AVX_256bit; + __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register); + __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister); + __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister); + __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register); + __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1); + %} + ins_pipe( pipe_slow ); +%} + +// Short variable shift +instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{ + predicate(vector_length(n) <= 8 && vector_element_basic_type(n) == T_SHORT && + !VM_Version::supports_avx512bw()); + match(Set dst (VLShiftV src shift)); + match(Set dst (VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %} + ins_encode %{ + assert(UseAVX >= 2, 
"required"); + + int opcode = this->ideal_Opcode(); + bool sign = (opcode == Op_VURShiftV) ? false : true; + int vlen_enc = Assembler::AVX_256bit; + __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1); + __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1); + __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register); + __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister); + __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{ + predicate(vector_length(n) == 16 && vector_element_basic_type(n) == T_SHORT && + !VM_Version::supports_avx512bw()); + match(Set dst (VLShiftV src shift)); + match(Set dst (VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch); + format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + bool sign = (opcode == Op_VURShiftV) ? 
false : true; + int vlen_enc = Assembler::AVX_256bit; + // Shift lower half, with result in vtmp2 using vtmp1 as TEMP + __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc); + __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc); + __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register); + + // Shift upper half, with result in dst using vtmp1 as TEMP + __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister); + __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister); + __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc); + __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register); + + // Merge lower and upper half result into dst + __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{ + predicate(vector_element_basic_type(n) == T_SHORT && + VM_Version::supports_avx512bw()); + match(Set dst (VLShiftV src shift)); + match(Set dst (VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + format %{ "vector_varshift_short $dst,$src,$shift\t!" 
%} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +//Integer variable shift +instruct vshiftI_var(vec dst, vec src, vec shift) %{ + predicate(vector_element_basic_type(n) == T_INT); + match(Set dst ( VLShiftV src shift)); + match(Set dst ( VRShiftV src shift)); + match(Set dst (VURShiftV src shift)); + format %{ "vector_varshift_int $dst,$src,$shift\t!" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +//Long variable shift +instruct vshiftL_var(vec dst, vec src, vec shift) %{ + predicate(vector_element_basic_type(n) == T_LONG); + match(Set dst ( VLShiftV src shift)); + match(Set dst (VURShiftV src shift)); + format %{ "vector_varshift_long $dst,$src,$shift\t!" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +//Long variable right shift arithmetic +instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{ + predicate(vector_length(n) <= 4 && vector_element_basic_type(n) == T_LONG && + UseAVX == 2); + match(Set dst (VRShiftV src shift)); + effect(TEMP dst, TEMP vtmp); + format %{ "vector_varshift_long $dst,$src,$shift\n\t! 
using $vtmp as TEMP" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, + $vtmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{ + predicate(vector_element_basic_type(n) == T_LONG && + UseAVX > 2); + match(Set dst (VRShiftV src shift)); // arithmetic right shift of long elements + format %{ "vector_varshift_long $dst,$src,$shift\t!" %} + ins_encode %{ + int opcode = this->ideal_Opcode(); + int vlen_enc = vector_length_encoding(this); + __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4853,8 +6245,8 @@ match(Set dst (AndV src1 src2)); format %{ "vpand $dst,$src1,$src2\t! and vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4864,8 +6256,8 @@ match(Set dst (AndV src (LoadVector mem))); format %{ "vpand $dst,$src,$mem\t! and vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4887,8 +6279,8 @@ match(Set dst (OrV src1 src2)); format %{ "vpor $dst,$src1,$src2\t! 
or vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4898,8 +6290,8 @@ match(Set dst (OrV src (LoadVector mem))); format %{ "vpor $dst,$src,$mem\t! or vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4921,8 +6313,8 @@ match(Set dst (XorV src1 src2)); format %{ "vpxor $dst,$src1,$src2\t! xor vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4932,66 +6324,658 @@ match(Set dst (XorV src (LoadVector mem))); format %{ "vpxor $dst,$src,$mem\t! xor vectors" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc); %} ins_pipe( pipe_slow ); %} -// --------------------------------- ABS -------------------------------------- -// a = |a| -instruct vabsB_reg(vec dst, vec src) %{ - match(Set dst (AbsVB src)); - format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %} +// --------------------------------- VectorCast -------------------------------------- + +instruct vcastBtoX(vec dst, vec src) %{ + match(Set dst (VectorCastB2X src)); + format %{ "vector_cast_b2x $dst,$src\t!" 
%} ins_encode %{ - uint vlen = vector_length(this); - if (vlen <= 16) { - __ pabsb($dst$$XMMRegister, $src$$XMMRegister); - } else { - int vlen_enc = vector_length_encoding(this); - __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + assert(UseAVX > 0, "required"); + + BasicType to_elem_bt = vector_element_basic_type(this); + int vlen_enc = vector_length_encoding(this); + switch (to_elem_bt) { + case T_SHORT: + __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_INT: + __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_FLOAT: + __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + break; + case T_LONG: + __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_DOUBLE: + __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + break; + + default: assert(false, "%s", type2name(to_elem_bt)); } %} ins_pipe( pipe_slow ); %} -instruct vabsS_reg(vec dst, vec src) %{ - match(Set dst (AbsVS src)); - format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %} +instruct castStoX(vec dst, vec src, rRegP scratch) %{ + predicate(UseAVX <= 2 && + vector_length(n->in(1)) <= 8 && // src + vector_element_basic_type(n) == T_BYTE); + effect(TEMP scratch); + match(Set dst (VectorCastS2X src)); + format %{ "vector_cast_s2x $dst,$src\t! 
using $scratch as TEMP" %} ins_encode %{ - uint vlen = vector_length(this); - if (vlen <= 8) { - __ pabsw($dst$$XMMRegister, $src$$XMMRegister); - } else { - int vlen_enc = vector_length_encoding(this); - __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); - } + assert(UseAVX > 0, "required"); + + __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0); %} ins_pipe( pipe_slow ); %} -instruct vabsI_reg(vec dst, vec src) %{ - match(Set dst (AbsVI src)); - format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %} +instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{ + predicate(UseAVX <= 2 && + vector_length(n->in(1)) == 16 && // src + vector_element_basic_type(n) == T_BYTE); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + match(Set dst (VectorCastS2X src)); + format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %} ins_encode %{ - uint vlen = vector_length(this); - if (vlen <= 4) { - __ pabsd($dst$$XMMRegister, $src$$XMMRegister); - } else { - int vlen_enc = vector_length_encoding(this); - __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); - } + assert(UseAVX > 0, "required"); + + int vlen_enc = vector_length_encoding(vector_length_in_bytes(this, $src)); + __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register); + __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0); %} ins_pipe( pipe_slow ); %} -instruct vabsL_reg(vec dst, vec src) %{ - match(Set dst (AbsVL src)); +instruct vcastStoX_evex(vec dst, vec src) %{ + predicate(UseAVX > 2 || + (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src + match(Set dst (VectorCastS2X src)); + format %{ "vector_cast_s2x $dst,$src\t!" 
%} + ins_encode %{ /* Emit the short-vector cast chosen by the element type of this (destination) node. */ + BasicType to_elem_bt = vector_element_basic_type(this); + int src_vlen_enc = vector_length_encoding(this, $src); + int vlen_enc = vector_length_encoding(this); + switch (to_elem_bt) { + case T_BYTE: + if (!VM_Version::supports_avx512vl()) { + src_vlen_enc = Assembler::AVX_512bit; /* evpmovwb below is encoded with src_vlen_enc, so that is the length that must fall back to 512 bits when AVX512VL is absent (same handling as vcastItoX_evex) */ + } + __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + break; + case T_INT: + __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_FLOAT: + __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + break; + case T_LONG: + __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_DOUBLE: + __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + break; + default: + ShouldNotReachHere(); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct castItoX(vec dst, vec src, rRegP scratch) %{ + predicate(UseAVX <= 2 && + (vector_length_in_bytes(n->in(1)) <= 16) && + (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src + match(Set dst (VectorCastI2X src)); + format %{ "vector_cast_i2x $dst,$src\t! 
using $scratch as TEMP" %} + effect(TEMP scratch); + ins_encode %{ + assert(UseAVX > 0, "required"); + + BasicType to_elem_bt = vector_element_basic_type(this); + int vlen_enc = vector_length_encoding(this, $src); + + if (to_elem_bt == T_BYTE) { + __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register); + __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } else { + assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt)); + __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register); + __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{ + predicate(UseAVX <= 2 && + (vector_length_in_bytes(n->in(1)) == 32) && + (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src + match(Set dst (VectorCastI2X src)); + format %{ "vector_cast_i2x $dst,$src\t! 
using $vtmp and $scratch as TEMP" %} + effect(TEMP dst, TEMP vtmp, TEMP scratch); + ins_encode %{ + assert(UseAVX > 0, "required"); + + BasicType to_elem_bt = vector_element_basic_type(this); + int vlen_enc = vector_length_encoding(this, $src); + + if (to_elem_bt == T_BYTE) { + __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register); + __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1); + __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + } else { + assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt)); + __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register); + __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1); + __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastItoX_evex(vec dst, vec src) %{ + predicate(UseAVX > 2 || + (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src + match(Set dst (VectorCastI2X src)); + format %{ "vector_cast_i2x $dst,$src\t!" 
%} + ins_encode %{ /* Emit the int-vector cast chosen by the element type of this (destination) node. */ + assert(UseAVX > 0, "required"); + + BasicType dst_elem_bt = vector_element_basic_type(this); + int src_vlen_enc = vector_length_encoding(this, $src); + int dst_vlen_enc = vector_length_encoding(this); + switch (dst_elem_bt) { + case T_BYTE: + if (!VM_Version::supports_avx512vl()) { + src_vlen_enc = Assembler::AVX_512bit; + } + __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + break; + case T_SHORT: + if (!VM_Version::supports_avx512vl()) { + src_vlen_enc = Assembler::AVX_512bit; + } + __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + break; + case T_FLOAT: + __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc); /* read the src operand: nothing has written dst yet in this case */ + break; + case T_LONG: + __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc); + break; + case T_DOUBLE: + __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc); /* read the src operand: nothing has written dst yet in this case */ + break; + default: + ShouldNotReachHere(); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{ + predicate((vector_element_basic_type(n) == T_BYTE || vector_element_basic_type(n) == T_SHORT) && + UseAVX <= 2); + match(Set dst (VectorCastL2X src)); + effect(TEMP scratch); + format %{ "vector_cast_l2x $dst,$src\t! using $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX > 0, "required"); + + int vlen = vector_length_in_bytes(this, $src); + BasicType to_elem_bt = vector_element_basic_type(this); + AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? 
ExternalAddress(vector_int_to_byte_mask()) + : ExternalAddress(vector_int_to_short_mask()); + if (vlen <= 16) { + __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register); + __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + } else { + assert(vlen <= 32, "required"); + __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit); + __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register); + __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + } + if (to_elem_bt == T_BYTE) { + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastLtoX_evex(vec dst, vec src) %{ + predicate(UseAVX > 2 || + (vector_element_basic_type(n) == T_INT || + vector_element_basic_type(n) == T_FLOAT || + vector_element_basic_type(n) == T_DOUBLE)); + match(Set dst (VectorCastL2X src)); + format %{ "vector_cast_l2x $dst,$src\t!" 
%} + ins_encode %{ + BasicType to_elem_bt = vector_element_basic_type(this); + int vlen = vector_length_in_bytes(this, $src); + int vlen_enc = vector_length_encoding(this, $src); + switch (to_elem_bt) { + case T_BYTE: + if (UseAVX > 2 && !VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_SHORT: + if (UseAVX > 2 && !VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_INT: + if (vlen == 8) { + if ($dst$$XMMRegister != $src$$XMMRegister) { + __ movflt($dst$$XMMRegister, $src$$XMMRegister); + } + } else if (vlen == 16) { + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8); + } else if (vlen == 32) { + if (UseAVX > 2) { + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + } else { + __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc); + __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc); + } + } else { // vlen == 64 + __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + } + break; + case T_FLOAT: + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required"); + __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + case T_DOUBLE: + assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required"); + __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + break; + + default: assert(false, "%s", type2name(to_elem_bt)); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastFtoD_reg(vec dst, vec src) %{ + predicate(vector_element_basic_type(n) == T_DOUBLE); + match(Set dst (VectorCastF2X src)); + format %{ "vector_cast_f2x $dst,$src\t!" 
%} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vcastDtoF_reg(vec dst, vec src) %{ + predicate(vector_element_basic_type(n) == T_FLOAT); + match(Set dst (VectorCastD2X src)); + format %{ "vector_cast_d2x $dst,$src\t!" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this, $src); + __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- VectorMaskCmp -------------------------------------- + +instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{ + predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1 + vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1 + is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE + match(Set dst (VectorMaskCmp (Binary src1 src2) cond)); + format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this, $src1); + Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant); + if (vector_element_basic_type(this, $src1) == T_FLOAT) + __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + else + __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{ + predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1 + is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE + match(Set dst (VectorMaskCmp (Binary src1 src2) cond)); + effect(TEMP scratch); + format %{ "vector_compare $dst,$src1,$src2,$cond\t! 
using $scratch as TEMP" %} + ins_encode %{ + int vlen_enc = Assembler::AVX_512bit; + Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant); + KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. + KRegister mask = k0; // The comparison itself is not being masked. + if (vector_element_basic_type(this, $src1) == T_FLOAT) { + __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); + } else { + __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{ + predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1 + vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1 + is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 + match(Set dst (VectorMaskCmp (Binary src1 src2) cond)); + effect(TEMP scratch); + format %{ "vector_compare $dst,$src1,$src2,$cond\t! 
using $scratch as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this, $src1); + Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant); + Assembler::Width ww = widthForType(vector_element_basic_type(this, $src1)); + __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{ + predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1 + is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 + match(Set dst (VectorMaskCmp (Binary src1 src2) cond)); + effect(TEMP scratch); + format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int vlen_enc = Assembler::AVX_512bit; + Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant); + KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. + KRegister mask = k0; // The comparison itself is not being masked. 
+ bool merge = false; + BasicType src1_elem_bt = vector_element_basic_type(this, $src1); + + switch (src1_elem_bt) { + case T_BYTE: { + __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + break; + } + case T_SHORT: { + __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + break; + } + case T_INT: { + __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + break; + } + case T_LONG: { + __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + break; + } + + default: assert(false, "%s", type2name(src1_elem_bt)); + } + %} + ins_pipe( pipe_slow ); +%} + +// Extract + +instruct extractI(rRegI dst, legVec src, immU8 idx) %{ + predicate(vector_length_in_bytes(n->in(1)) <= 16); // src + match(Set dst (ExtractI src idx)); + match(Set dst (ExtractS src idx)); +#ifdef _LP64 + match(Set dst (ExtractB src idx)); +#endif + ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + BasicType elem_bt = vector_element_basic_type(this, $src); + __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{ + predicate(vector_length_in_bytes(n->in(1)) == 32 || // src + vector_length_in_bytes(n->in(1)) == 64); // src + match(Set dst (ExtractI src idx)); + match(Set dst (ExtractS src idx)); +#ifdef _LP64 + match(Set dst (ExtractB src idx)); +#endif + effect(TEMP vtmp); + 
ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + BasicType elem_bt = vector_element_basic_type(this, $src); + XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant); + __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +#ifdef _LP64 +instruct extractL(rRegL dst, legVec src, immU8 idx) %{ + predicate(vector_length(n->in(1)) <= 2); // src + match(Set dst (ExtractL src idx)); + ins_encode %{ + assert(UseSSE >= 4, "required"); + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{ + predicate(vector_length(n->in(1)) == 4 || // src + vector_length(n->in(1)) == 8); // src + match(Set dst (ExtractL src idx)); + effect(TEMP vtmp); + ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant); + __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} +#endif + +instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{ + predicate(vector_length(n->in(1)) <= 4); + match(Set dst (ExtractF src idx)); + effect(TEMP dst, TEMP tmp, TEMP vtmp); + ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{ + predicate(vector_length(n->in(1)/*src*/) == 8 || + vector_length(n->in(1)/*src*/) == 16); + match(Set dst (ExtractF src idx)); + effect(TEMP tmp, TEMP vtmp); + ins_encode %{ + 
assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant); + __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct extractD(legRegD dst, legVec src, immU8 idx) %{ + predicate(vector_length(n->in(1)) == 2); // src + match(Set dst (ExtractD src idx)); + ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{ + predicate(vector_length(n->in(1)) == 4 || // src + vector_length(n->in(1)) == 8); // src + match(Set dst (ExtractD src idx)); + effect(TEMP vtmp); + ins_encode %{ + assert($idx$$constant < (int)vector_length(this, $src), "out of bounds"); + + XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant); + __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- Vector Blend -------------------------------------- + +instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{ + predicate(UseAVX == 0); + match(Set dst (VectorBlend (Binary dst src) mask)); + format %{ "vector_blend $dst,$src,$mask\t! 
using $tmp as TEMP" %} + effect(TEMP tmp); + ins_encode %{ + assert(UseSSE >= 4, "required"); + + if ($mask$$XMMRegister != $tmp$$XMMRegister) { + __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister); + } + __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask + %} + ins_pipe( pipe_slow ); +%} + +instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{ + predicate(UseAVX > 0 && + vector_length_in_bytes(n) <= 32 && + is_integral_type(vector_element_basic_type(n))); + match(Set dst (VectorBlend (Binary src1 src2) mask)); + format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{ + predicate(UseAVX > 0 && + vector_length_in_bytes(n) <= 32 && + !is_integral_type(vector_element_basic_type(n))); + match(Set dst (VectorBlend (Binary src1 src2) mask)); + format %{ "vector_blend $dst,$src1,$src2,$mask\t!" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch) %{ + predicate(vector_length_in_bytes(n) == 64); + match(Set dst (VectorBlend (Binary src1 src2) mask)); + format %{ "vector_blend $dst,$src1,$src2,$mask\t! 
using $scratch and k2 as TEMP" %} + effect(TEMP scratch); + ins_encode %{ + int vlen_enc = Assembler::AVX_512bit; + BasicType elem_bt = vector_element_basic_type(this); + KRegister ktmp = k2; + __ evpcmp(elem_bt, ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register); + __ evpblend(elem_bt, $dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- ABS -------------------------------------- +// a = |a| +instruct vabsB_reg(vec dst, vec src) %{ + match(Set dst (AbsVB src)); + format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %} + ins_encode %{ + uint vlen = vector_length(this); + if (vlen <= 16) { + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + } else { + int vlen_enc = vector_length_encoding(this); + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsS_reg(vec dst, vec src) %{ + match(Set dst (AbsVS src)); + format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %} + ins_encode %{ + uint vlen = vector_length(this); + if (vlen <= 8) { + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + } else { + int vlen_enc = vector_length_encoding(this); + __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsI_reg(vec dst, vec src) %{ + match(Set dst (AbsVI src)); + format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %} + ins_encode %{ + uint vlen = vector_length(this); + if (vlen <= 4) { + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + } else { + int vlen_enc = vector_length_encoding(this); + __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsL_reg(vec dst, vec src) %{ + match(Set dst (AbsVL src)); format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %} ins_encode %{ assert(UseAVX > 2, "required"); - int vector_len = 
vector_length_encoding(this); - __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -4999,7 +6983,7 @@ // --------------------------------- ABSNEG -------------------------------------- instruct vabsnegF(vec dst, vec src, rRegI scratch) %{ - predicate(n->as_Vector()->length() != 4); // handled by 1-operand instruction vabsneg4F + predicate(vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F match(Set dst (AbsVF src)); match(Set dst (NegVF src)); effect(TEMP scratch); @@ -5020,7 +7004,7 @@ %} instruct vabsneg4F(vec dst, rRegI scratch) %{ - predicate(n->as_Vector()->length() == 4); + predicate(vector_length(n) == 4); match(Set dst (AbsVF dst)); match(Set dst (NegVF dst)); effect(TEMP scratch); @@ -5052,6 +7036,546 @@ ins_pipe( pipe_slow ); %} +//------------------------------------- NOT -------------------------------------------- + +instruct vnotB(vec dst, vec src) %{ + predicate(UseAVX == 0); + match(Set dst (NotV src)); + effect(TEMP dst); + format %{ "vector_not $dst,$src\t!" 
%} + ins_encode %{ + int vlen = vector_length_in_bytes(this); + switch(vlen) { + default: + assert(0, "Incorrect vector length"); + break; + case 4: { + __ movdl($dst$$XMMRegister, ExternalAddress(vector_all_bits_set())); + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + } break; + case 8: { + __ movq($dst$$XMMRegister, ExternalAddress(vector_all_bits_set())); + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + } break; + case 16: { + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_all_bits_set())); + __ pxor($dst$$XMMRegister, $src$$XMMRegister); + } break; + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vnotB_reg(vec dst, vec src, rRegP scratch) %{ + predicate(UseAVX > 0); + match(Set dst (NotV src)); + effect(TEMP scratch); + format %{ "vector_not $dst,$src\t! using $scratch as rRegP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpxor($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), vlen_enc, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +//------------------------------------- VectorTest -------------------------------------------- + +#ifdef _LP64 +instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{ + predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow); /* n is the VectorTest node matched below; this rule applies only to its 'overflow' predicate */ + match(Set dst (VectorTest src1 src2 )); + effect(KILL cr); + format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %} + ins_encode %{ + int vlen = vector_length_in_bytes(this, $src1); + int vlen_enc = vector_length_encoding(vlen); + if (vlen <= 32) { + if (UseAVX == 0) { + assert(vlen <= 16, "required"); + __ ptest($src1$$XMMRegister, $src2$$XMMRegister); + } else { + __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + } + } else { + KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. 
+ __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ kortestql(ktmp, ktmp); + } + __ setb(Assembler::carrySet, $dst$$Register); + __ movzbl($dst$$Register, $dst$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{ + predicate(static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne); /* n is the VectorTest node matched below; this rule applies only to its 'ne' predicate */ + match(Set dst (VectorTest src1 src2 )); + effect(KILL cr); + format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %} + ins_encode %{ + int vlen = vector_length_in_bytes(this, $src1); + int vlen_enc = vector_length_encoding(vlen); + if (vlen <= 32) { + if (UseAVX == 0) { + assert(vlen <= 16, "required"); + __ ptest($src1$$XMMRegister, $src2$$XMMRegister); + } else { + __ vptest($src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + } + } else { + KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. + __ evpcmpeqb(ktmp, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); + __ ktestql(ktmp, ktmp); + } + __ setb(Assembler::notZero, $dst$$Register); + __ movzbl($dst$$Register, $dst$$Register); + %} + ins_pipe( pipe_slow ); +%} +#endif + +//------------------------------------- LoadMask -------------------------------------------- + +instruct loadMask(vec dst, vec src) %{ + match(Set dst (VectorLoadMask src)); + effect(TEMP dst); + format %{ "vector_loadmask_byte $dst,$src\n\t" %} + ins_encode %{ + int vlen_in_bytes = vector_length_in_bytes(this); + BasicType elem_bt = vector_element_basic_type(this); + + __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt); + %} + ins_pipe( pipe_slow ); +%} + +//------------------------------------- StoreMask -------------------------------------------- + +instruct storeMask1B(vec dst, vec src, immI_1 size) %{ + predicate(vector_length(n) < 64 || VM_Version::supports_avx512vlbw()); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" 
%} + ins_encode %{ + assert(UseSSE >= 3, "required"); + if (vector_length_in_bytes(this) <= 16) { + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + } else { + assert(UseAVX >= 2, "required"); + int src_vlen_enc = vector_length_encoding(this, $src); + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct storeMask2B(vec dst, vec src, immI_2 size) %{ + predicate(vector_length(n) <= 8); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\n\t" %} + ins_encode %{ + assert(UseSSE >= 3, "required"); + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + __ packsswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{ + predicate(vector_length(n) == 16 && !VM_Version::supports_avx512bw()); + match(Set dst (VectorStoreMask src size)); + effect(TEMP dst); + format %{ "vector_store_mask $dst,$src\t!" %} + ins_encode %{ + int vlen_enc = Assembler::AVX_128bit; + __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1); + __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister,vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{ + predicate(VM_Version::supports_avx512bw()); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" %} + ins_encode %{ + int src_vlen_enc = vector_length_encoding(this, $src); + int dst_vlen_enc = vector_length_encoding(this); + __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct storeMask4B(vec dst, vec src, immI_4 size) %{ + predicate (vector_length(n) <= 4 && UseAVX <= 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" 
%} + ins_encode %{ + assert(UseSSE >= 3, "required"); + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + __ packssdw($dst$$XMMRegister, $dst$$XMMRegister); + __ packsswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{ + predicate(vector_length(n) == 8 && UseAVX <= 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" %} + effect(TEMP dst); + ins_encode %{ + int vlen_enc = Assembler::AVX_128bit; + __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1); + __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{ + predicate(UseAVX > 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" %} + ins_encode %{ + int src_vlen_enc = vector_length_encoding(this, $src); + int dst_vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + src_vlen_enc = Assembler::AVX_512bit; + } + __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct storeMask8B(vec dst, vec src, immI_8 size) %{ + predicate(vector_length(n) == 2 && UseAVX <= 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" 
%} + ins_encode %{ + assert(UseSSE >= 3, "required"); + __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8); + __ packssdw($dst$$XMMRegister, $dst$$XMMRegister); + __ packsswb($dst$$XMMRegister, $dst$$XMMRegister); + __ pabsb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{ + predicate(vector_length(n) == 4 && UseAVX <= 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %} + effect(TEMP dst, TEMP vtmp); + ins_encode %{ + int vlen_enc = Assembler::AVX_128bit; + __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit); + __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1); + __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc); + __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{ + predicate(UseAVX > 2); + match(Set dst (VectorStoreMask src size)); + format %{ "vector_store_mask $dst,$src\t!" 
%} + ins_encode %{ + int src_vlen_enc = vector_length_encoding(this, $src); + int dst_vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + src_vlen_enc = Assembler::AVX_512bit; + } + __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc); + __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +//-------------------------------- Load Iota Indices ---------------------------------- + +instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{ + predicate(vector_element_basic_type(n) == T_BYTE); + match(Set dst (VectorLoadConst src)); + effect(TEMP scratch); + format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %} + ins_encode %{ + int vlen_in_bytes = vector_length_in_bytes(this); + __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes); + %} + ins_pipe( pipe_slow ); +%} + +//-------------------------------- Rearrange ---------------------------------- + +// LoadShuffle/Rearrange for Byte + +instruct loadShuffleB(vec dst) %{ + predicate(vector_element_basic_type(n) == T_BYTE); + match(Set dst (VectorLoadShuffle dst)); + format %{ "vector_load_shuffle $dst, $dst" %} + ins_encode %{ + // empty + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeB(vec dst, vec shuffle) %{ + predicate(vector_element_basic_type(n) == T_BYTE && + vector_length(n) < 32); + match(Set dst (VectorRearrange dst shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $dst" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{ + predicate(vector_element_basic_type(n) == T_BYTE && + vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi()); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $src" %} + ins_encode %{ + __ vpshufb($dst$$XMMRegister, $shuffle$$XMMRegister, 
$src$$XMMRegister, Assembler::AVX_256bit); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{ + predicate(vector_element_basic_type(n) == T_BYTE && + vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi()); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $src" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// LoadShuffle/Rearrange for Short + +instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{ + predicate(vector_element_basic_type(n) == T_SHORT && + vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS + match(Set dst (VectorLoadShuffle src)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %} + ins_encode %{ + // Create a byte shuffle mask from short shuffle mask + // only byte shuffle instruction available on these platforms + + // Multiply each shuffle by two to get byte index + __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister); + __ psllw($vtmp$$XMMRegister, 1); + + // Duplicate to create 2 copies of byte index + __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister); + __ psllw($dst$$XMMRegister, 8); + __ por($dst$$XMMRegister, $vtmp$$XMMRegister); + + // Add one to get alternate byte index + __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register); + __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeS(vec dst, vec shuffle) %{ + predicate(vector_element_basic_type(n) == T_SHORT && + vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); + match(Set dst (VectorRearrange dst shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $dst" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + __ 
pshufb($dst$$XMMRegister, $shuffle$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct loadShuffleS_evex(vec dst, vec src) %{ + predicate(vector_element_basic_type(n) == T_SHORT && + VM_Version::supports_avx512bw()); + match(Set dst (VectorLoadShuffle src)); + format %{ "vector_load_shuffle $dst, $src" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{ + predicate(vector_element_basic_type(n) == T_SHORT && + VM_Version::supports_avx512bw()); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $src" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + if (!VM_Version::supports_avx512vl()) { + vlen_enc = Assembler::AVX_512bit; + } + __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// LoadShuffle/Rearrange for Integer and Float + +instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{ + predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) && + vector_length(n) == 4 && UseAVX < 2); + match(Set dst (VectorLoadShuffle src)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_load_shuffle $dst, $src\t! 
using $vtmp and $scratch as TEMP" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + + // Create a byte shuffle mask from int shuffle mask + // only byte shuffle instruction available on these platforms + + // Duplicate and multiply each shuffle by 4 + __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister); + __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0); + __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0); + __ psllw($vtmp$$XMMRegister, 2); + + // Duplicate again to create 4 copies of byte index + __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister); + __ psllw($dst$$XMMRegister, 8); + __ por($vtmp$$XMMRegister, $dst$$XMMRegister); + + // Add 3,2,1,0 to get alternate byte index + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register); + __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeI(vec dst, vec shuffle) %{ + predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) && + vector_length(n) == 4 && UseAVX < 2); + match(Set dst (VectorRearrange dst shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $dst" %} + ins_encode %{ + assert(UseSSE >= 4, "required"); + __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct loadShuffleI_avx(vec dst, vec src) %{ + predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) && + UseAVX >= 2); + match(Set dst (VectorLoadShuffle src)); + format %{ "vector_load_shuffle $dst, $src" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{ + predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) && + UseAVX >= 2); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, 
$src" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + if (vlen_enc == Assembler::AVX_128bit) { + vlen_enc = Assembler::AVX_256bit; + } + __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +// LoadShuffle/Rearrange for Long and Double + +instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{ + predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE + vector_length(n) < 8 && !VM_Version::supports_avx512vl()); + match(Set dst (VectorLoadShuffle src)); + effect(TEMP dst, TEMP vtmp, TEMP scratch); + format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int vlen_enc = vector_length_encoding(this); + // Create a double word shuffle mask from long shuffle mask + // only double word shuffle instruction available on these platforms + + // Multiply each shuffle by two to get double word index + __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc); + __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc); + + // Duplicate each double word shuffle + __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc); + __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc); + + // Add one to get alternate double word index + __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeL(vec dst, vec src, vec shuffle) %{ + predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE + vector_length(n) < 8 && !VM_Version::supports_avx512vl()); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $src" %} + ins_encode %{ + assert(UseAVX >= 2, "required"); + + int vlen_enc = vector_length_encoding(this); + __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, 
$src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct loadShuffleL_evex(vec dst, vec src) %{ + predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE + (vector_length(n) == 8 || VM_Version::supports_avx512vl())); + match(Set dst (VectorLoadShuffle src)); + format %{ "vector_load_shuffle $dst, $src" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int vlen_enc = vector_length_encoding(this); + __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + +instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{ + predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE + (vector_length(n) == 8 || VM_Version::supports_avx512vl())); + match(Set dst (VectorRearrange src shuffle)); + format %{ "vector_rearrange $dst, $shuffle, $src" %} + ins_encode %{ + assert(UseAVX > 2, "required"); + + int vlen_enc = vector_length_encoding(this); + if (vlen_enc == Assembler::AVX_128bit) { + vlen_enc = Assembler::AVX_256bit; + } + __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc); + %} + ins_pipe( pipe_slow ); +%} + // --------------------------------- FMA -------------------------------------- // a * b + c @@ -5061,8 +7585,8 @@ ins_cost(150); ins_encode %{ assert(UseFMA, "not enabled"); - int vector_len = vector_length_encoding(this); - __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -5073,8 +7597,8 @@ ins_cost(150); ins_encode %{ assert(UseFMA, "not enabled"); - int vector_len = vector_length_encoding(this); - __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc); 
%} ins_pipe( pipe_slow ); %} @@ -5085,8 +7609,8 @@ ins_cost(150); ins_encode %{ assert(UseFMA, "not enabled"); - int vector_len = vector_length_encoding(this); - __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -5097,8 +7621,8 @@ ins_cost(150); ins_encode %{ assert(UseFMA, "not enabled"); - int vector_len = vector_length_encoding(this); - __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -5108,7 +7632,7 @@ instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{ predicate(UseAVX == 0); match(Set dst (MulAddVS2VI dst src1)); - format %{ "pmaddwd $dst,$dst,$src1\t! muladd packedStoI" %} + format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %} ins_encode %{ __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister); %} @@ -5120,8 +7644,8 @@ match(Set dst (MulAddVS2VI src1 src2)); format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %} ins_encode %{ - int vector_len = vector_length_encoding(this); - __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -5134,8 +7658,8 @@ format %{ "evpdpwssd $dst,$src1,$src2\t! 
muladdadd packedStoI" %} ins_encode %{ assert(UseAVX > 2, "required"); - int vector_len = vector_length_encoding(this); - __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); ins_cost(10); @@ -5149,8 +7673,9 @@ ins_encode %{ assert(UsePopCountInstruction, "not enabled"); - int vector_len = vector_length_encoding(this); - __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + int vlen_enc = vector_length_encoding(this); + __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} + --- old/src/hotspot/cpu/x86/x86_32.ad 2020-04-02 18:03:49.552854103 -0700 +++ new/src/hotspot/cpu/x86/x86_32.ad 2020-04-02 18:03:49.388854103 -0700 @@ -3322,7 +3322,7 @@ %} // Constant for test vs zero -operand immI0() %{ +operand immI_0() %{ predicate(n->get_int() == 0); match(ConI); @@ -3332,7 +3332,7 @@ %} // Constant for increment -operand immI1() %{ +operand immI_1() %{ predicate(n->get_int() == 1); match(ConI); @@ -3369,6 +3369,16 @@ interface(CONST_INTER); %} +operand immU8() +%{ + predicate((0 <= n->get_int()) && (n->get_int() <= 255)); + match(ConI); + + op_cost(5); + format %{ %} + interface(CONST_INTER); +%} + operand immI16() %{ predicate((-32768 <= n->get_int()) && (n->get_int() <= 32767)); match(ConI); @@ -3417,8 +3427,8 @@ interface(CONST_INTER); %} -operand immI_1() %{ - predicate( n->get_int() == 1 ); +operand immI_2() %{ + predicate( n->get_int() == 2 ); match(ConI); op_cost(0); @@ -3426,8 +3436,8 @@ interface(CONST_INTER); %} -operand immI_2() %{ - predicate( n->get_int() == 2 ); +operand immI_3() %{ + predicate( n->get_int() == 3 ); match(ConI); op_cost(0); @@ -3435,8 +3445,19 @@ interface(CONST_INTER); %} -operand immI_3() %{ - predicate( n->get_int() == 3 ); +operand immI_4() +%{ + predicate(n->get_int() == 4); + match(ConI); + + 
op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand immI_8() +%{ + predicate(n->get_int() == 8); match(ConI); op_cost(0); @@ -3813,6 +3834,18 @@ interface(REG_INTER); %} +operand rRegP() %{ + constraint(ALLOC_IN_RC(int_reg)); + match(RegP); + match(eAXRegP); + match(eBXRegP); + match(eCXRegP); + match(eDIRegP); + + format %{ %} + interface(REG_INTER); +%} + // On windows95, EBP is not safe to use for implicit null tests. operand eRegP_no_EBP() %{ constraint(ALLOC_IN_RC(int_reg_no_ebp)); @@ -3946,6 +3979,15 @@ %} // Flags register, used as output of compare instructions +operand rFlagsReg() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "EFLAGS" %} + interface(REG_INTER); +%} + +// Flags register, used as output of compare instructions operand eFlagsReg() %{ constraint(ALLOC_IN_RC(int_flags)); match(RegFlags); @@ -4075,6 +4117,14 @@ interface(REG_INTER); %} +operand legRegF() %{ + predicate( UseSSE>=1 ); + constraint(ALLOC_IN_RC(float_reg_legacy)); + match(RegF); + format %{ %} + interface(REG_INTER); +%} + // Float register operands operand vlRegF() %{ constraint(ALLOC_IN_RC(float_reg_vl)); @@ -4094,6 +4144,14 @@ %} // Double register operands +operand legRegD() %{ + predicate( UseSSE>=2 ); + constraint(ALLOC_IN_RC(double_reg_legacy)); + match(RegD); + format %{ %} + interface(REG_INTER); +%} + operand vlRegD() %{ constraint(ALLOC_IN_RC(double_reg_vl)); match(RegD); @@ -5844,6 +5902,46 @@ ins_pipe( ialu_reg_mem ); %} +// Load Float +instruct MoveF2LEG(legRegF dst, regF src) %{ + match(Set dst src); + format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %} + ins_encode %{ + __ movflt($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +// Load Float +instruct MoveLEG2F(regF dst, legRegF src) %{ + match(Set dst src); + format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %} + ins_encode %{ + __ movflt($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} 
+ +// Load Double +instruct MoveD2LEG(legRegD dst, regD src) %{ + match(Set dst src); + format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %} + ins_encode %{ + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + +// Load Double +instruct MoveLEG2D(regD dst, legRegD src) %{ + match(Set dst src); + format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %} + ins_encode %{ + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( fpu_reg_reg ); +%} + // Load Double instruct loadDPR(regDPR dst, memory mem) %{ predicate(UseSSE<=1); @@ -5969,7 +6067,7 @@ %} // Load Constant zero -instruct loadConI0(rRegI dst, immI0 src, eFlagsReg cr) %{ +instruct loadConI0(rRegI dst, immI_0 src, eFlagsReg cr) %{ match(Set dst src); effect(KILL cr); @@ -7081,7 +7179,7 @@ ins_pipe( ialu_reg ); %} -instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{ +instruct incI_eReg(rRegI dst, immI_1 src, eFlagsReg cr) %{ predicate(UseIncDec); match(Set dst (AddI dst src)); effect(KILL cr); @@ -7181,7 +7279,7 @@ ins_pipe( ialu_mem_imm ); %} -instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{ +instruct incI_mem(memory dst, immI_1 src, eFlagsReg cr) %{ match(Set dst (StoreI dst (AddI (LoadI dst) src))); effect(KILL cr); @@ -7559,7 +7657,7 @@ %} // Subtract from a pointer -instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{ +instruct subP_eReg(eRegP dst, rRegI src, immI_0 zero, eFlagsReg cr) %{ match(Set dst (AddP dst (SubI zero src))); effect(KILL cr); @@ -7570,7 +7668,7 @@ ins_pipe( ialu_reg_reg ); %} -instruct negI_eReg(rRegI dst, immI0 zero, eFlagsReg cr) %{ +instruct negI_eReg(rRegI dst, immI_0 zero, eFlagsReg cr) %{ match(Set dst (SubI zero dst)); effect(KILL cr); @@ -8024,7 +8122,7 @@ // Integer Shift Instructions // Shift Left by one -instruct shlI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{ +instruct shlI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{ match(Set dst (LShiftI dst shift)); 
effect(KILL cr); @@ -8060,7 +8158,7 @@ %} // Arithmetic shift right by one -instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{ +instruct sarI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{ match(Set dst (RShiftI dst shift)); effect(KILL cr); @@ -8072,7 +8170,7 @@ %} // Arithmetic shift right by one -instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{ +instruct sarI_mem_1(memory dst, immI_1 shift, eFlagsReg cr) %{ match(Set dst (StoreI dst (RShiftI (LoadI dst) shift))); effect(KILL cr); format %{ "SAR $dst,$shift" %} @@ -8117,7 +8215,7 @@ %} // Logical shift right by one -instruct shrI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{ +instruct shrI_eReg_1(rRegI dst, immI_1 shift, eFlagsReg cr) %{ match(Set dst (URShiftI dst shift)); effect(KILL cr); @@ -8273,7 +8371,7 @@ ins_pipe(ialu_reg_mem); %} -instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{ +instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI_0 imm_zero, eFlagsReg cr) %{ match(Set dst (AndI (SubI imm_zero src) src)); predicate(UseBMI1Instructions); effect(KILL cr); @@ -8286,7 +8384,7 @@ ins_pipe(ialu_reg); %} -instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{ +instruct blsiI_rReg_mem(rRegI dst, memory src, immI_0 imm_zero, eFlagsReg cr) %{ match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) )); predicate(UseBMI1Instructions); effect(KILL cr); @@ -8438,7 +8536,7 @@ // ROL/ROR // ROL expand -instruct rolI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{ +instruct rolI_eReg_imm1(rRegI dst, immI_1 shift, eFlagsReg cr) %{ effect(USE_DEF dst, USE shift, KILL cr); format %{ "ROL $dst, $shift" %} @@ -8467,7 +8565,7 @@ // end of ROL expand // ROL 32bit by one once -instruct rolI_eReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{ +instruct rolI_eReg_i1(rRegI dst, immI_1 lshift, immI_M1 rshift, eFlagsReg cr) %{ match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift))); expand %{ @@ -8486,7 +8584,7 @@ %} // 
ROL 32bit var by var once -instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{ +instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI_0 zero, eFlagsReg cr) %{ match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift)))); expand %{ @@ -8504,7 +8602,7 @@ %} // ROR expand -instruct rorI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{ +instruct rorI_eReg_imm1(rRegI dst, immI_1 shift, eFlagsReg cr) %{ effect(USE_DEF dst, USE shift, KILL cr); format %{ "ROR $dst, $shift" %} @@ -8533,7 +8631,7 @@ // end of ROR expand // ROR right once -instruct rorI_eReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{ +instruct rorI_eReg_i1(rRegI dst, immI_1 rshift, immI_M1 lshift, eFlagsReg cr) %{ match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift))); expand %{ @@ -8552,7 +8650,7 @@ %} // ROR 32bit var by var once -instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{ +instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI_0 zero, eFlagsReg cr) %{ match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift)))); expand %{ @@ -8720,7 +8818,7 @@ ins_pipe(pipe_slow); %} -instruct cmpLTMask0(rRegI dst, immI0 zero, eFlagsReg cr) %{ +instruct cmpLTMask0(rRegI dst, immI_0 zero, eFlagsReg cr) %{ match(Set dst (CmpLTMask dst zero)); effect(DEF dst, KILL cr); ins_cost(100); @@ -8834,7 +8932,7 @@ ins_pipe(ialu_reg_reg); %} -instruct overflowNegI_rReg(eFlagsReg cr, immI0 zero, eAXRegI op2) +instruct overflowNegI_rReg(eFlagsReg cr, immI_0 zero, eAXRegI op2) %{ match(Set cr (OverflowSubI zero op2)); effect(DEF cr, USE_KILL op2); @@ -11972,7 +12070,7 @@ ins_pipe( ialu_cr_reg_mem ); %} -instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{ +instruct testI_reg( eFlagsReg cr, rRegI src, immI_0 zero ) %{ match(Set cr (CmpI src zero)); effect( DEF cr, USE src ); @@ -11982,7 +12080,7 @@ ins_pipe( ialu_cr_reg_imm ); %} -instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 
zero ) %{ +instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI_0 zero ) %{ match(Set cr (CmpI (AndI src con) zero)); format %{ "TEST $src,$con" %} @@ -11991,7 +12089,7 @@ ins_pipe( ialu_cr_reg_imm ); %} -instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI0 zero ) %{ +instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI_0 zero ) %{ match(Set cr (CmpI (AndI src mem) zero)); format %{ "TEST $src,$mem" %} @@ -12041,7 +12139,7 @@ // ins_encode( OpcP, RegMem( op1, op2) ); //%} -instruct testU_reg( eFlagsRegU cr, rRegI src, immI0 zero ) %{ +instruct testU_reg( eFlagsRegU cr, rRegI src, immI_0 zero ) %{ match(Set cr (CmpU src zero)); format %{ "TESTu $src,$src" %} @@ -12118,7 +12216,7 @@ // Cisc-spilled version of testP_reg // This will generate a signed flags result. This should be ok // since any compare to a zero should be eq/neq. -instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{ +instruct testP_Reg_mem( eFlagsReg cr, memory op, immI_0 zero ) %{ match(Set cr (CmpP (LoadP op) zero)); format %{ "TEST $op,0xFFFFFFFF" %} @@ -13489,7 +13587,7 @@ // match(Set dst (CopyI src)); // %} // -// instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{ +// instruct incI_eReg(rRegI dst, immI_1 src, eFlagsReg cr) %{ // match(Set dst (AddI dst src)); // effect(KILL cr); // %} --- old/src/hotspot/cpu/x86/x86_64.ad 2020-04-02 18:03:50.148854104 -0700 +++ new/src/hotspot/cpu/x86/x86_64.ad 2020-04-02 18:03:49.964854104 -0700 @@ -2878,7 +2878,7 @@ %} // Constant for test vs zero -operand immI0() +operand immI_0() %{ predicate(n->get_int() == 0); match(ConI); @@ -2889,7 +2889,7 @@ %} // Constant for increment -operand immI1() +operand immI_1() %{ predicate(n->get_int() == 1); match(ConI); @@ -2910,6 +2910,36 @@ interface(CONST_INTER); %} +operand immI_2() +%{ + predicate(n->get_int() == 2); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand immI_4() +%{ + predicate(n->get_int() == 4); + match(ConI); + 
+ op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand immI_8() +%{ + predicate(n->get_int() == 8); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // Valid scale values for addressing modes operand immI2() %{ @@ -5267,19 +5297,19 @@ match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ - "blendvps $btmp,$b,$a,$b \n\t" - "blendvps $atmp,$a,$b,$b \n\t" + "vblendvps $btmp,$b,$a,$b \n\t" + "vblendvps $atmp,$a,$b,$b \n\t" "vmaxss $tmp,$atmp,$btmp \n\t" - "cmpps.unordered $btmp,$atmp,$atmp \n\t" - "blendvps $dst,$tmp,$atmp,$btmp \n\t" + "vcmpps.unordered $btmp,$atmp,$atmp \n\t" + "vblendvps $dst,$tmp,$atmp,$btmp \n\t" %} ins_encode %{ int vector_len = Assembler::AVX_128bit; - __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len); - __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len); + __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len); + __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len); __ vmaxss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister); - __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); - __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); + __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); + __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5303,19 +5333,19 @@ match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp); format %{ - "blendvpd $btmp,$b,$a,$b \n\t" - "blendvpd $atmp,$a,$b,$b \n\t" + "vblendvpd $btmp,$b,$a,$b \n\t" + "vblendvpd $atmp,$a,$b,$b \n\t" "vmaxsd $tmp,$atmp,$btmp \n\t" - "cmppd.unordered $btmp,$atmp,$atmp \n\t" - 
"blendvpd $dst,$tmp,$atmp,$btmp \n\t" + "vcmppd.unordered $btmp,$atmp,$atmp \n\t" + "vblendvpd $dst,$tmp,$atmp,$btmp \n\t" %} ins_encode %{ int vector_len = Assembler::AVX_128bit; - __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len); - __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len); + __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len); + __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len); __ vmaxsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister); - __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); - __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); + __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); + __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5339,19 +5369,19 @@ match(Set dst (MinF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ - "blendvps $atmp,$a,$b,$a \n\t" - "blendvps $btmp,$b,$a,$a \n\t" + "vblendvps $atmp,$a,$b,$a \n\t" + "vblendvps $btmp,$b,$a,$a \n\t" "vminss $tmp,$atmp,$btmp \n\t" - "cmpps.unordered $btmp,$atmp,$atmp \n\t" - "blendvps $dst,$tmp,$atmp,$btmp \n\t" + "vcmpps.unordered $btmp,$atmp,$atmp \n\t" + "vblendvps $dst,$tmp,$atmp,$btmp \n\t" %} ins_encode %{ int vector_len = Assembler::AVX_128bit; - __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len); - __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len); + __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len); + __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len); __ 
vminss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister); - __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); - __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); + __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); + __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5375,19 +5405,19 @@ match(Set dst (MinD a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ - "blendvpd $atmp,$a,$b,$a \n\t" - "blendvpd $btmp,$b,$a,$a \n\t" + "vblendvpd $atmp,$a,$b,$a \n\t" + "vblendvpd $btmp,$b,$a,$a \n\t" "vminsd $tmp,$atmp,$btmp \n\t" - "cmppd.unordered $btmp,$atmp,$atmp \n\t" - "blendvpd $dst,$tmp,$atmp,$btmp \n\t" + "vcmppd.unordered $btmp,$atmp,$atmp \n\t" + "vblendvpd $dst,$tmp,$atmp,$btmp \n\t" %} ins_encode %{ int vector_len = Assembler::AVX_128bit; - __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len); - __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len); + __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len); + __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len); __ vminsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister); - __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); - __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); + __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len); + __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len); %} ins_pipe( pipe_slow ); %} @@ -5611,7 +5641,7 @@ ins_pipe(ialu_reg_fat); // XXX %} -instruct 
loadConI0(rRegI dst, immI0 src, rFlagsReg cr) +instruct loadConI0(rRegI dst, immI_0 src, rFlagsReg cr) %{ match(Set dst src); effect(KILL cr); @@ -6047,7 +6077,7 @@ %} // Store Integer Immediate -instruct storeImmI0(memory mem, immI0 zero) +instruct storeImmI0(memory mem, immI_0 zero) %{ predicate(UseCompressedOops && (CompressedOops::base() == NULL) && (CompressedKlassPointers::base() == NULL)); match(Set mem (StoreI mem zero)); @@ -6097,7 +6127,7 @@ %} // Store Short/Char Immediate -instruct storeImmC0(memory mem, immI0 zero) +instruct storeImmC0(memory mem, immI_0 zero) %{ predicate(UseCompressedOops && (CompressedOops::base() == NULL) && (CompressedKlassPointers::base() == NULL)); match(Set mem (StoreC mem zero)); @@ -6123,7 +6153,7 @@ %} // Store Byte Immediate -instruct storeImmB0(memory mem, immI0 zero) +instruct storeImmB0(memory mem, immI_0 zero) %{ predicate(UseCompressedOops && (CompressedOops::base() == NULL) && (CompressedKlassPointers::base() == NULL)); match(Set mem (StoreB mem zero)); @@ -6148,7 +6178,7 @@ %} // Store CMS card-mark Immediate -instruct storeImmCM0_reg(memory mem, immI0 zero) +instruct storeImmCM0_reg(memory mem, immI_0 zero) %{ predicate(UseCompressedOops && (CompressedOops::base() == NULL) && (CompressedKlassPointers::base() == NULL)); match(Set mem (StoreCM mem zero)); @@ -6161,7 +6191,7 @@ ins_pipe(ialu_mem_reg); %} -instruct storeImmCM0(memory mem, immI0 src) +instruct storeImmCM0(memory mem, immI_0 src) %{ match(Set mem (StoreCM mem src)); @@ -7253,7 +7283,7 @@ ins_pipe(ialu_mem_imm); %} -instruct incI_rReg(rRegI dst, immI1 src, rFlagsReg cr) +instruct incI_rReg(rRegI dst, immI_1 src, rFlagsReg cr) %{ predicate(UseIncDec); match(Set dst (AddI dst src)); @@ -7265,7 +7295,7 @@ ins_pipe(ialu_reg); %} -instruct incI_mem(memory dst, immI1 src, rFlagsReg cr) +instruct incI_mem(memory dst, immI_1 src, rFlagsReg cr) %{ predicate(UseIncDec); match(Set dst (StoreI dst (AddI (LoadI dst) src))); @@ -8159,7 +8189,7 @@ // Subtract from a 
pointer // XXX hmpf??? -instruct subP_rReg(rRegP dst, rRegI src, immI0 zero, rFlagsReg cr) +instruct subP_rReg(rRegP dst, rRegI src, immI_0 zero, rFlagsReg cr) %{ match(Set dst (AddP dst (SubI zero src))); effect(KILL cr); @@ -8170,7 +8200,7 @@ ins_pipe(ialu_reg_reg); %} -instruct negI_rReg(rRegI dst, immI0 zero, rFlagsReg cr) +instruct negI_rReg(rRegI dst, immI_0 zero, rFlagsReg cr) %{ match(Set dst (SubI zero dst)); effect(KILL cr); @@ -8181,7 +8211,19 @@ ins_pipe(ialu_reg); %} -instruct negI_mem(memory dst, immI0 zero, rFlagsReg cr) +instruct negI_rReg_2(rRegI dst, rFlagsReg cr) +%{ + match(Set dst (NegI dst)); + effect(KILL cr); + + format %{ "negl $dst\t# int" %} + ins_encode %{ + __ negl($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + +instruct negI_mem(memory dst, immI_0 zero, rFlagsReg cr) %{ match(Set dst (StoreI dst (SubI zero (LoadI dst)))); effect(KILL cr); @@ -8203,6 +8245,18 @@ ins_pipe(ialu_reg); %} +instruct negL_rReg_2(rRegL dst, rFlagsReg cr) +%{ + match(Set dst (NegL dst)); + effect(KILL cr); + + format %{ "negq $dst\t# long" %} + ins_encode %{ + __ negq($dst$$Register); + %} + ins_pipe(ialu_reg); +%} + instruct negL_mem(memory dst, immL0 zero, rFlagsReg cr) %{ match(Set dst (StoreL dst (SubL zero (LoadL dst)))); @@ -8528,7 +8582,7 @@ // Integer Shift Instructions // Shift Left by one -instruct salI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr) +instruct salI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (LShiftI dst shift)); effect(KILL cr); @@ -8540,7 +8594,7 @@ %} // Shift Left by one -instruct salI_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct salI_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreI dst (LShiftI (LoadI dst) shift))); effect(KILL cr); @@ -8600,7 +8654,7 @@ %} // Arithmetic shift right by one -instruct sarI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr) +instruct sarI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (RShiftI dst shift)); effect(KILL cr); @@ -8612,7
+8666,7 @@ %} // Arithmetic shift right by one -instruct sarI_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct sarI_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreI dst (RShiftI (LoadI dst) shift))); effect(KILL cr); @@ -8672,7 +8726,7 @@ %} // Logical shift right by one -instruct shrI_rReg_1(rRegI dst, immI1 shift, rFlagsReg cr) +instruct shrI_rReg_1(rRegI dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (URShiftI dst shift)); effect(KILL cr); @@ -8684,7 +8738,7 @@ %} // Logical shift right by one -instruct shrI_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct shrI_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreI dst (URShiftI (LoadI dst) shift))); effect(KILL cr); @@ -8745,7 +8799,7 @@ // Long Shift Instructions // Shift Left by one -instruct salL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr) +instruct salL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (LShiftL dst shift)); effect(KILL cr); @@ -8757,7 +8811,7 @@ %} // Shift Left by one -instruct salL_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct salL_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreL dst (LShiftL (LoadL dst) shift))); effect(KILL cr); @@ -8818,7 +8872,7 @@ %} // Arithmetic shift right by one -instruct sarL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr) +instruct sarL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (RShiftL dst shift)); effect(KILL cr); @@ -8830,7 +8884,7 @@ %} // Arithmetic shift right by one -instruct sarL_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct sarL_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreL dst (RShiftL (LoadL dst) shift))); effect(KILL cr); @@ -8891,7 +8945,7 @@ %} // Logical shift right by one -instruct shrL_rReg_1(rRegL dst, immI1 shift, rFlagsReg cr) +instruct shrL_rReg_1(rRegL dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (URShiftL dst shift)); effect(KILL cr); @@ -8903,7 +8957,7 @@ %} // Logical shift right by 
one -instruct shrL_mem_1(memory dst, immI1 shift, rFlagsReg cr) +instruct shrL_mem_1(memory dst, immI_1 shift, rFlagsReg cr) %{ match(Set dst (StoreL dst (URShiftL (LoadL dst) shift))); effect(KILL cr); @@ -9021,7 +9075,7 @@ // end of ROL expand // Rotate Left by one -instruct rolI_rReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, rFlagsReg cr) +instruct rolI_rReg_i1(rRegI dst, immI_1 lshift, immI_M1 rshift, rFlagsReg cr) %{ match(Set dst (OrI (LShiftI dst lshift) (URShiftI dst rshift))); @@ -9042,7 +9096,7 @@ %} // Rotate Left by variable -instruct rolI_rReg_Var_C0(no_rcx_RegI dst, rcx_RegI shift, immI0 zero, rFlagsReg cr) +instruct rolI_rReg_Var_C0(no_rcx_RegI dst, rcx_RegI shift, immI_0 zero, rFlagsReg cr) %{ match(Set dst (OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift)))); @@ -9094,7 +9148,7 @@ // end of ROR expand // Rotate Right by one -instruct rorI_rReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, rFlagsReg cr) +instruct rorI_rReg_i1(rRegI dst, immI_1 rshift, immI_M1 lshift, rFlagsReg cr) %{ match(Set dst (OrI (URShiftI dst rshift) (LShiftI dst lshift))); @@ -9115,7 +9169,7 @@ %} // Rotate Right by variable -instruct rorI_rReg_Var_C0(no_rcx_RegI dst, rcx_RegI shift, immI0 zero, rFlagsReg cr) +instruct rorI_rReg_Var_C0(no_rcx_RegI dst, rcx_RegI shift, immI_0 zero, rFlagsReg cr) %{ match(Set dst (OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift)))); @@ -9166,7 +9220,7 @@ // end of ROL expand // Rotate Left by one -instruct rolL_rReg_i1(rRegL dst, immI1 lshift, immI_M1 rshift, rFlagsReg cr) +instruct rolL_rReg_i1(rRegL dst, immI_1 lshift, immI_M1 rshift, rFlagsReg cr) %{ match(Set dst (OrL (LShiftL dst lshift) (URShiftL dst rshift))); @@ -9187,7 +9241,7 @@ %} // Rotate Left by variable -instruct rolL_rReg_Var_C0(no_rcx_RegL dst, rcx_RegI shift, immI0 zero, rFlagsReg cr) +instruct rolL_rReg_Var_C0(no_rcx_RegL dst, rcx_RegI shift, immI_0 zero, rFlagsReg cr) %{ match(Set dst (OrL (LShiftL dst shift) (URShiftL dst (SubI zero shift)))); @@ -9239,7 
+9293,7 @@ // end of ROR expand // Rotate Right by one -instruct rorL_rReg_i1(rRegL dst, immI1 rshift, immI_M1 lshift, rFlagsReg cr) +instruct rorL_rReg_i1(rRegL dst, immI_1 rshift, immI_M1 lshift, rFlagsReg cr) %{ match(Set dst (OrL (URShiftL dst rshift) (LShiftL dst lshift))); @@ -9260,7 +9314,7 @@ %} // Rotate Right by variable -instruct rorL_rReg_Var_C0(no_rcx_RegL dst, rcx_RegI shift, immI0 zero, rFlagsReg cr) +instruct rorL_rReg_Var_C0(no_rcx_RegL dst, rcx_RegI shift, immI_0 zero, rFlagsReg cr) %{ match(Set dst (OrL (URShiftL dst shift) (LShiftL dst (SubI zero shift)))); @@ -9432,7 +9486,7 @@ ins_pipe(ialu_reg); %} -instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, rFlagsReg cr) %{ +instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI_0 imm_zero, rFlagsReg cr) %{ match(Set dst (AndI (SubI imm_zero src) src)); predicate(UseBMI1Instructions); effect(KILL cr); @@ -9445,7 +9499,7 @@ ins_pipe(ialu_reg); %} -instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, rFlagsReg cr) %{ +instruct blsiI_rReg_mem(rRegI dst, memory src, immI_0 imm_zero, rFlagsReg cr) %{ match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) )); predicate(UseBMI1Instructions); effect(KILL cr); @@ -10128,7 +10182,7 @@ ins_pipe(pipe_slow); %} -instruct cmpLTMask0(rRegI dst, immI0 zero, rFlagsReg cr) +instruct cmpLTMask0(rRegI dst, immI_0 zero, rFlagsReg cr) %{ match(Set dst (CmpLTMask dst zero)); effect(KILL cr); @@ -11461,7 +11515,7 @@ ins_pipe(ialu_reg_reg); %} -instruct overflowNegI_rReg(rFlagsReg cr, immI0 zero, rax_RegI op2) +instruct overflowNegI_rReg(rFlagsReg cr, immI_0 zero, rax_RegI op2) %{ match(Set cr (OverflowSubI zero op2)); effect(DEF cr, USE_KILL op2); @@ -11570,7 +11624,7 @@ ins_pipe(ialu_cr_reg_mem); %} -instruct testI_reg(rFlagsReg cr, rRegI src, immI0 zero) +instruct testI_reg(rFlagsReg cr, rRegI src, immI_0 zero) %{ match(Set cr (CmpI src zero)); @@ -11580,7 +11634,7 @@ ins_pipe(ialu_cr_reg_imm); %} -instruct testI_reg_imm(rFlagsReg cr, rRegI src, 
immI con, immI0 zero) +instruct testI_reg_imm(rFlagsReg cr, rRegI src, immI con, immI_0 zero) %{ match(Set cr (CmpI (AndI src con) zero)); @@ -11590,7 +11644,7 @@ ins_pipe(ialu_cr_reg_imm); %} -instruct testI_reg_mem(rFlagsReg cr, rRegI src, memory mem, immI0 zero) +instruct testI_reg_mem(rFlagsReg cr, rRegI src, memory mem, immI_0 zero) %{ match(Set cr (CmpI (AndI src (LoadI mem)) zero)); @@ -11644,7 +11698,7 @@ // // ins_encode( OpcP, reg_mem( op1, op2) ); // //%} -instruct testU_reg(rFlagsRegU cr, rRegI src, immI0 zero) +instruct testU_reg(rFlagsRegU cr, rRegI src, immI_0 zero) %{ match(Set cr (CmpU src zero)); @@ -11983,7 +12037,7 @@ ins_pipe(ialu_cr_reg_mem); %} -instruct testUB_mem_imm(rFlagsReg cr, memory mem, immU8 imm, immI0 zero) +instruct testUB_mem_imm(rFlagsReg cr, memory mem, immU8 imm, immI_0 zero) %{ match(Set cr (CmpI (AndI (LoadUB mem) imm) zero)); @@ -11993,7 +12047,7 @@ ins_pipe(ialu_cr_reg_mem); %} -instruct testB_mem_imm(rFlagsReg cr, memory mem, immI8 imm, immI0 zero) +instruct testB_mem_imm(rFlagsReg cr, memory mem, immI8 imm, immI_0 zero) %{ match(Set cr (CmpI (AndI (LoadB mem) imm) zero)); @@ -12716,7 +12770,7 @@ // match(Set dst (CopyI src)); // %} // -// instruct incI_rReg(rRegI dst, immI1 src, rFlagsReg cr) +// instruct incI_rReg(rRegI dst, immI_1 src, rFlagsReg cr) // %{ // match(Set dst (AddI dst src)); // effect(KILL cr);