--- old/src/hotspot/cpu/x86/assembler_x86.cpp 2018-03-09 13:15:37.482686297 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2018-03-09 13:15:37.186685189 -0800
@@ -8709,6 +8709,28 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// Vector popcount of packed dwords (register form), emitted EVEX-encoded as
+// 66 0F38.W0 55 /r; callers must have verified AVX512_VPOPCNTDQ support.
+void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq(), "must support vpopcntdq feature");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x55);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// Vector popcount of packed qwords (register form); same encoding as
+// vpopcntd but with the W bit set (66 0F38.W1 55 /r).
+void Assembler::vpopcntq(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq(), "must support vpopcntdq feature");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x55);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::popq(Address dst) {
   InstructionMark im(this);
   prefixq(dst);
--- old/src/hotspot/cpu/x86/assembler_x86.hpp 2018-03-09 13:15:38.614690529 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2018-03-09 13:15:38.318689422 -0800
@@ -1638,6 +1638,10 @@
   void popcntq(Register dst, Register src);
 #endif
 
+  // Vector popcount of packed dwords/qwords (requires AVX512_VPOPCNTDQ)
+  void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
+  void vpopcntq(XMMRegister dst, XMMRegister src, int vector_len);
+
   // Prefetches (SSE, SSE2, 3DNOW only)
 
   void prefetchnta(Address src);
--- old/src/hotspot/cpu/x86/vm_version_x86.cpp 2018-03-09 13:15:39.706694614 -0800
+++ new/src/hotspot/cpu/x86/vm_version_x86.cpp 2018-03-09 13:15:39.414693522 -0800
@@ -257,6 +257,8 @@
     __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
     __ movl(Address(rsi, 0), rax);
     __ movl(Address(rsi, 4), rbx);
+    __ movl(Address(rsi, 8), rcx);
+    __ movl(Address(rsi, 12), rdx);
 
     //
     // Extended cpuid(0x80000000)
--- old/src/hotspot/cpu/x86/vm_version_x86.hpp 2018-03-09 13:15:40.802698715 -0800
+++ new/src/hotspot/cpu/x86/vm_version_x86.hpp 2018-03-09 13:15:40.502697592 -0800
@@ -228,6 +228,40 @@
     } bits;
   };
 
+  // cpuid function 7 (structured extended features), ECX feature bits
+  union SefCpuid7Ecx {
+    uint32_t value;
+    struct {
+      uint32_t prefetchwt1 : 1,
+               avx512_vbmi : 1,
+               umip : 1,
+               pku : 1,
+               ospke : 1,
+                    : 1,
+               avx512_vbmi2 : 1,
+                    : 1,
+               gfni : 1,
+               vaes : 1,
+               vpclmulqdq : 1,
+               avx512_vnni : 1,
+               avx512_bitalg : 1,
+                    : 1,
+               avx512_vpopcntdq : 1,
+                    : 17;
+    } bits;
+  };
+
+  // cpuid function 7 (structured extended features), EDX feature bits
+  union SefCpuid7Edx {
+    uint32_t value;
+    struct {
+      uint32_t : 2,
+               avx512_4vnniw : 1,
+               avx512_4fmaps : 1,
+                    : 28;
+    } bits;
+  };
+
   union ExtCpuid1EEbx {
     uint32_t value;
     struct {
@@ -300,7 +334,8 @@
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
 #define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
-#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))      // Vzeroupper instruction
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))       // Vzeroupper instruction
+#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
 
 enum Extended_Family {
   // AMD
@@ -353,8 +388,8 @@
     // cpuid function 7 (structured extended features)
     SefCpuid7Eax sef_cpuid7_eax;
     SefCpuid7Ebx sef_cpuid7_ebx;
-    uint32_t sef_cpuid7_ecx; // unused currently
-    uint32_t sef_cpuid7_edx; // unused currently
+    SefCpuid7Ecx sef_cpuid7_ecx;
+    SefCpuid7Edx sef_cpuid7_edx;
 
     // cpuid function 0xB (processor topology)
     // ecx = 0
@@ -507,6 +542,8 @@
             result |= CPU_AVX512BW;
           if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
             result |= CPU_AVX512VL;
+          if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
+            result |= CPU_AVX512_VPOPCNTDQ;
         }
       }
     if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -783,6 +820,7 @@
   static bool supports_sha() { return (_features & CPU_SHA) != 0; }
   static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); }
   static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
+  static bool supports_avx512_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- old/src/hotspot/cpu/x86/x86.ad 2018-03-09 13:15:41.902702829 -0800
+++ new/src/hotspot/cpu/x86/x86.ad 2018-03-09 13:15:41.598701693 -0800
@@ -1223,6 +1223,10 @@
       if (!UsePopCountInstruction)
         ret_value = false;
       break;
+    case Op_PopCountVI:
+      if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq() || UseAVX <= 2)
+        ret_value = false;
+      break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
         ret_value = false;
@@ -10788,3 +10792,52 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+// --------------------------------- PopCount --------------------------------------
+// NOTE(review): the 128-bit and 256-bit EVEX forms used by the 2I/4I/8I rules
+// presumably also need AVX512VL (see the Intel SDM entry for VPOPCNTD/VPOPCNTQ);
+// confirm before relying on them on CPUs that report only AVX512_VPOPCNTDQ.
+
+instruct vpopcount2I(vecD dst, vecD src) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq() && UsePopCountInstruction);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd $dst,$src\t! vector popcount packed2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount4I(vecX dst, vecX src) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq() && UsePopCountInstruction);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd $dst,$src\t! vector popcount packed4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount8I(vecY dst, vecY src) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq() && UsePopCountInstruction);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd $dst,$src\t! vector popcount packed8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount16I(vecZ dst, vecZ src) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512_vpopcntdq() && UsePopCountInstruction);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd $dst,$src\t! vector popcount packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
--- old/src/hotspot/share/adlc/formssel.cpp 2018-03-09 13:15:43.074707214 -0800
+++ new/src/hotspot/share/adlc/formssel.cpp 2018-03-09 13:15:42.766706061 -0800
@@ -4180,7 +4180,7 @@
     "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
     "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
     "LoadVector","StoreVector",
-    "FmaVD", "FmaVF",
+    "FmaVD", "FmaVF","PopCountVI",
     // Next are not supported currently.
     "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
     "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- old/src/hotspot/share/opto/classes.hpp 2018-03-09 13:15:44.182711359 -0800
+++ new/src/hotspot/share/opto/classes.hpp 2018-03-09 13:15:43.886710252 -0800
@@ -241,6 +241,7 @@
 macro(Phi)
 macro(PopCountI)
 macro(PopCountL)
+macro(PopCountVI)
 macro(PrefetchAllocation)
 macro(Proj)
 macro(RShiftI)
--- old/src/hotspot/share/opto/superword.cpp 2018-03-09 13:15:45.270715429 -0800
+++ new/src/hotspot/share/opto/superword.cpp 2018-03-09 13:15:44.974714321 -0800
@@ -2325,8 +2325,12 @@
           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
         }
-      } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
-        // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
+      } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
+                 opc == Op_AbsF || opc == Op_AbsD ||
+                 opc == Op_NegF || opc == Op_NegD ||
+                 opc == Op_PopCountI) {
+        // Promote operand to vector (Sqrt/Abs/Neg/PopCount are 2 address instructions)
+        assert(n->req() == 2, "only one input expected");
         Node* in = vector_opd(p, 1);
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
--- old/src/hotspot/share/opto/vectornode.cpp 2018-03-09 13:15:46.378719573 -0800
+++ new/src/hotspot/share/opto/vectornode.cpp 2018-03-09 13:15:46.082718466 -0800
@@ -122,6 +122,12 @@
   case Op_SqrtD:
     assert(bt == T_DOUBLE, "must be");
     return Op_SqrtVD;
+  case Op_PopCountI:
+    if (bt == T_INT) {
+      return Op_PopCountVI;
+    }
+    // Unimplemented for subword types: the bit count of a narrowed lane would differ from that of the int value.
+    return 0;
   case Op_LShiftI:
     switch (bt) {
     case T_BOOLEAN:
@@ -325,6 +331,8 @@
   case Op_SqrtVF: return new SqrtVFNode(n1, vt);
   case Op_SqrtVD: return new SqrtVDNode(n1, vt);
 
+  case Op_PopCountVI: return new PopCountVINode(n1, vt);
+
   case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
   case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
   case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
--- old/src/hotspot/share/opto/vectornode.hpp 2018-03-09 13:15:47.470723658 -0800
+++ new/src/hotspot/share/opto/vectornode.hpp 2018-03-09 13:15:47.178722564 -0800
@@ -381,6 +381,14 @@
   virtual int Opcode() const;
 };
 
+//------------------------------PopCountVINode---------------------------------
+// Vector popcount integer bits
+class PopCountVINode : public VectorNode {
+ public:
+  PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------SqrtVFNode--------------------------------------
 // Vector Sqrt float
 class SqrtVFNode : public VectorNode {
--- old/src/hotspot/share/runtime/vmStructs.cpp 2018-03-09 13:15:48.554727712 -0800
+++ new/src/hotspot/share/runtime/vmStructs.cpp 2018-03-09 13:15:48.258726605 -0800
@@ -1996,6 +1996,7 @@
   declare_c2_type(MulReductionVDNode, ReductionNode) \
   declare_c2_type(DivVFNode, VectorNode) \
   declare_c2_type(DivVDNode, VectorNode) \
+  declare_c2_type(PopCountVINode, VectorNode) \
   declare_c2_type(LShiftVBNode, VectorNode) \
   declare_c2_type(LShiftVSNode, VectorNode) \
   declare_c2_type(LShiftVINode, VectorNode) \