--- old/src/hotspot/cpu/x86/assembler_x86.cpp 2018-03-12 17:44:03.423040196 -0700 +++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2018-03-12 17:44:03.123039074 -0700 @@ -8709,6 +8709,15 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x55); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::popq(Address dst) { InstructionMark im(this); prefixq(dst); --- old/src/hotspot/cpu/x86/assembler_x86.hpp 2018-03-12 17:44:04.615044655 -0700 +++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2018-03-12 17:44:04.299043473 -0700 @@ -1638,6 +1638,8 @@ void popcntq(Register dst, Register src); #endif + void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len); + // Prefetches (SSE, SSE2, 3DNOW only) void prefetchnta(Address src); --- old/src/hotspot/cpu/x86/vm_version_x86.cpp 2018-03-12 17:44:05.771048980 -0700 +++ new/src/hotspot/cpu/x86/vm_version_x86.cpp 2018-03-12 17:44:05.467047842 -0700 @@ -257,6 +257,8 @@ __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); __ movl(Address(rsi, 0), rax); __ movl(Address(rsi, 4), rbx); + __ movl(Address(rsi, 8), rcx); + __ movl(Address(rsi, 12), rdx); // // Extended cpuid(0x80000000) @@ -662,6 +664,7 @@ _features &= ~CPU_AVX512CD; _features &= ~CPU_AVX512BW; _features &= ~CPU_AVX512VL; + _features &= ~CPU_AVX512_VPOPCNTDQ; } if (UseAVX < 2) --- old/src/hotspot/cpu/x86/vm_version_x86.hpp 2018-03-12 17:44:06.935053335 -0700 +++ new/src/hotspot/cpu/x86/vm_version_x86.hpp 2018-03-12 17:44:06.639052227 -0700 @@ -228,6 +228,38 @@ } bits; }; + union SefCpuid7Ecx { + uint32_t value; + struct { + uint32_t prefetchwt1 : 1, + avx512_vbmi : 1, + umip : 1, + pku : 1, + ospke : 1, + : 1, + avx512_vbmi2 : 1, + : 1, + gfni : 1, + vaes : 1, + vpclmulqdq : 1, + avx512_vnni : 1, + avx512_bitalg : 1, + : 1, + avx512_vpopcntdq : 1, + : 17; + } bits; + }; + + union SefCpuid7Edx { + uint32_t value; + struct { + uint32_t : 2, + avx512_4vnniw : 1, + avx512_4fmaps : 1, + : 28; + } bits; + }; + union ExtCpuid1EEbx { uint32_t value; struct { @@ -300,7 +332,8 @@ #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length #define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions #define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions -#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction +#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction +#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount enum Extended_Family { // AMD @@ -353,8 +386,8 @@ // cpuid function 7 (structured extended features) SefCpuid7Eax sef_cpuid7_eax; SefCpuid7Ebx sef_cpuid7_ebx; - uint32_t sef_cpuid7_ecx; // unused currently - uint32_t sef_cpuid7_edx; // unused currently + SefCpuid7Ecx sef_cpuid7_ecx; + SefCpuid7Edx sef_cpuid7_edx; // cpuid function 0xB (processor topology) // ecx = 0 @@ -507,6 +540,8 @@ result |= CPU_AVX512BW; if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0) result |= CPU_AVX512VL; + if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0) + result |= CPU_AVX512_VPOPCNTDQ; } } if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) @@ -783,6 +818,7 @@ static bool supports_sha() { return (_features & CPU_SHA) != 0; } static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); } static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; } + static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && --- old/src/hotspot/cpu/x86/x86.ad 2018-03-12 17:44:08.039057463 -0700 +++ new/src/hotspot/cpu/x86/x86.ad 2018-03-12 17:44:07.743056356 -0700 @@ -1223,6 +1223,10 @@ if (!UsePopCountInstruction) ret_value = false; break; + case Op_PopCountVI: + if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq()) + ret_value = false; + break; case Op_MulVI: if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX ret_value = false; @@ -10788,3 +10792,49 @@ %} ins_pipe( pipe_slow ); %} + +// --------------------------------- PopCount -------------------------------------- + +instruct vpopcount2I(vecD dst, vecD src) %{ + predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2); + match(Set dst (PopCountVI src)); + format %{ "vpopcntd $dst,$src\t! vector popcount packed2I" %} + ins_encode %{ + int vector_len = 0; + __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vpopcount4I(vecX dst, vecX src) %{ + predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4); + match(Set dst (PopCountVI src)); + format %{ "vpopcntd $dst,$src\t! vector popcount packed4I" %} + ins_encode %{ + int vector_len = 0; + __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vpopcount8I(vecY dst, vecY src) %{ + predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8); + match(Set dst (PopCountVI src)); + format %{ "vpopcntd $dst,$src\t! vector popcount packed8I" %} + ins_encode %{ + int vector_len = 1; + __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vpopcount16I(vecZ dst, vecZ src) %{ + predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16); + match(Set dst (PopCountVI src)); + format %{ "vpopcntd $dst,$src\t! vector popcount packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} --- old/src/hotspot/share/adlc/formssel.cpp 2018-03-12 17:44:09.203061817 -0700 +++ new/src/hotspot/share/adlc/formssel.cpp 2018-03-12 17:44:08.903060695 -0700 @@ -4180,7 +4180,7 @@ "URShiftVB","URShiftVS","URShiftVI","URShiftVL", "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD", "LoadVector","StoreVector", - "FmaVD", "FmaVF", + "FmaVD", "FmaVF","PopCountVI", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD" --- old/src/hotspot/share/opto/classes.hpp 2018-03-12 17:44:10.327066022 -0700 +++ new/src/hotspot/share/opto/classes.hpp 2018-03-12 17:44:10.031064915 -0700 @@ -241,6 +241,7 @@ macro(Phi) macro(PopCountI) macro(PopCountL) +macro(PopCountVI) macro(PrefetchAllocation) macro(Proj) macro(RShiftI) --- old/src/hotspot/share/opto/superword.cpp 2018-03-12 17:44:11.495070392 -0700 +++ new/src/hotspot/share/opto/superword.cpp 2018-03-12 17:44:11.191069254 -0700 @@ -2325,8 +2325,11 @@ vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } - } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) { - // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions) + } else if (opc == Op_SqrtF || opc == Op_SqrtD || + opc == Op_AbsF || opc == Op_AbsD || + opc == Op_NegF || opc == Op_NegD || + opc == Op_PopCountI) { + assert(n->req() == 2, "only one input expected"); Node* in = vector_opd(p, 1); vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); --- old/src/hotspot/share/opto/vectornode.cpp 2018-03-12 17:44:12.707074925 -0700 +++ new/src/hotspot/share/opto/vectornode.cpp 2018-03-12 17:44:12.391073742 -0700 @@ -122,6 +122,13 @@ case Op_SqrtD: assert(bt == T_DOUBLE, "must be"); return Op_SqrtVD; + case Op_PopCountI: + if (bt == T_INT) { + return Op_PopCountVI; + } + // Unimplemented for subword types since bit count changes + // depending on size of lane (and sign bit). + return 0; case Op_LShiftI: switch (bt) { case T_BOOLEAN: @@ -325,6 +332,8 @@ case Op_SqrtVF: return new SqrtVFNode(n1, vt); case Op_SqrtVD: return new SqrtVDNode(n1, vt); + case Op_PopCountVI: return new PopCountVINode(n1, vt); + case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt); case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt); case Op_LShiftVI: return new LShiftVINode(n1, n2, vt); --- old/src/hotspot/share/opto/vectornode.hpp 2018-03-12 17:44:13.843079173 -0700 +++ new/src/hotspot/share/opto/vectornode.hpp 2018-03-12 17:44:13.547078066 -0700 @@ -381,6 +381,14 @@ virtual int Opcode() const; }; +//------------------------------PopCountVINode--------------------------------- +// Vector popcount integer bits +class PopCountVINode : public VectorNode { + public: + PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {} + virtual int Opcode() const; +}; + //------------------------------SqrtVFNode-------------------------------------- // Vector Sqrt float class SqrtVFNode : public VectorNode { --- old/src/hotspot/share/runtime/vmStructs.cpp 2018-03-12 17:44:14.963083364 -0700 +++ new/src/hotspot/share/runtime/vmStructs.cpp 2018-03-12 17:44:14.659082226 -0700 @@ -1996,6 +1996,7 @@ declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \ declare_c2_type(DivVDNode, VectorNode) \ + declare_c2_type(PopCountVINode, VectorNode) \ declare_c2_type(LShiftVBNode, VectorNode) \ declare_c2_type(LShiftVSNode, VectorNode) \ declare_c2_type(LShiftVINode, VectorNode) \ --- /dev/null 2018-02-28 09:42:29.695381538 -0800 +++ new/test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java 2018-03-12 17:44:15.795086477 -0700 @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8199421 + * @summary Test vectorization of popcount + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction + * compiler.vectorization.TestPopCountVector + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction + * -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector + */ + +package compiler.vectorization; + +public class TestPopCountVector { + private int[] input; + private int[] output; + private static final int LEN = 1024; + + public static void main(String args[]) { + TestPopCountVector test = new TestPopCountVector(); + + for (int i = 0; i < 10_000; ++i) { + test.vectorizeBitCount(); + } + System.out.println("Checking popcount result"); + test.checkResult(); + + for (int i = 0; i < 10_000; ++i) { + test.vectorizeBitCount(); + } + System.out.println("Checking popcount result"); + test.checkResult(); + } + + public TestPopCountVector() { + input = new int[LEN]; + output = new int[LEN]; + for (int i = 0; i < LEN; ++i) { + input[i] = i % 2 == 0 ? i : -1 * i; + } + } + + public void vectorizeBitCount() { + for (int i = 0; i < LEN; ++i) { + output[i] = Integer.bitCount(input[i]); + } + } + + public void checkResult() { + for (int i = 0; i < LEN; ++i) { + int expected = Integer.bitCount(input[i]); + if (output[i] != expected) { + throw new RuntimeException("Invalid result: output[" + i + "] = " + output[i] + " != " + expected); + } + } + } +} +