--- old/src/cpu/x86/vm/assembler_x86.cpp 2017-06-05 10:06:00.630334700 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2017-06-05 10:05:59.865796700 -0700 @@ -5070,6 +5070,42 @@ emit_operand(dst, src); } +void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + assert(VM_Version::supports_fma(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xB8); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + assert(VM_Version::supports_fma(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xB8); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { + assert(VM_Version::supports_fma(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xB8); + emit_operand(dst, src2); +} + +void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { + assert(VM_Version::supports_fma(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, 
/* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); + vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0xB8); + emit_operand(dst, src2); +} + void Assembler::divpd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); --- old/src/cpu/x86/vm/assembler_x86.hpp 2017-06-05 10:06:04.772721800 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2017-06-05 10:06:04.065603200 -0700 @@ -1903,6 +1903,11 @@ void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + // Divide Packed Floating-Point Values void divpd(XMMRegister dst, XMMRegister src); void divps(XMMRegister dst, XMMRegister src); --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2017-06-05 10:06:09.048122200 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2017-06-05 10:06:08.130448200 -0700 @@ -3163,8 +3163,37 @@ } } +// dst = c = a * b + c +void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { + Assembler::vfmadd231pd(c, a, b, vector_len); + if (dst != c) { + vmovdqu(dst, c); + } +} +// dst = c = a * b + c +void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { + Assembler::vfmadd231ps(c, a, b, vector_len); + if (dst != c) { + vmovdqu(dst, 
c); + } +} +// dst = c = a * b + c +void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { + Assembler::vfmadd231pd(c, a, b, vector_len); + if (dst != c) { + vmovdqu(dst, c); + } +} + +// dst = c = a * b + c +void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { + Assembler::vfmadd231ps(c, a, b, vector_len); + if (dst != c) { + vmovdqu(dst, c); + } +} void MacroAssembler::incrementl(AddressLiteral dst) { if (reachable(dst)) { --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2017-06-05 10:06:13.923276700 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2017-06-05 10:06:13.021374300 -0700 @@ -456,6 +456,11 @@ void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c); void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c); + void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len); + void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len); + void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len); + void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len); + // same as fcmp2int, but using SSE2 void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less); --- old/src/cpu/x86/vm/vm_version_x86.cpp 2017-06-05 10:06:18.527378500 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2017-06-05 10:06:17.608790100 -0700 @@ -786,7 +786,7 @@ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); } - if (supports_fma() && UseSSE >= 2) { + if (supports_fma() && UseAVX > 0) { if (FLAG_IS_DEFAULT(UseFMA)) { UseFMA = true; } --- old/src/cpu/x86/vm/x86.ad 2017-06-05 10:06:23.172583800 -0700 +++ new/src/cpu/x86/vm/x86.ad 2017-06-05 10:06:22.233706900 -0700 @@ -10520,3 +10520,161 @@ ins_pipe( pipe_slow ); %} +// --------------------------------- FMA -------------------------------------- + +// a * b + c +instruct 
vfma2D_reg(vecX a, vecX b, vecX c) %{ + predicate(UseFMA && n->as_Vector()->length() == 2); + match(Set c (FmaVD c (Binary a b))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 0; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma2D_mem(vecX a, memory b, vecX c) %{ + predicate(UseFMA && n->as_Vector()->length() == 2); + match(Set c (FmaVD c (Binary a (LoadVector b)))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 0; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + + +// a * b + c +instruct vfma4D_reg(vecY a, vecY b, vecY c) %{ + predicate(UseFMA && n->as_Vector()->length() == 4); + match(Set c (FmaVD c (Binary a b))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 1; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma4D_mem(vecY a, memory b, vecY c) %{ + predicate(UseFMA && n->as_Vector()->length() == 4); + match(Set c (FmaVD c (Binary a (LoadVector b)))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 1; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{ + predicate(UseFMA && n->as_Vector()->length() == 8); + match(Set c (FmaVD c (Binary a b))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 2; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + 
ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{ + predicate(UseFMA && n->as_Vector()->length() == 8); + match(Set c (FmaVD c (Binary a (LoadVector b)))); + format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %} + ins_cost(150); + ins_encode %{ + int vector_len = 2; + __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma4F_reg(vecX a, vecX b, vecX c) %{ + predicate(UseFMA && n->as_Vector()->length() == 4); + match(Set c (FmaVF c (Binary a b))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 0; + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma4F_mem(vecX a, memory b, vecX c) %{ + predicate(UseFMA && n->as_Vector()->length() == 4); + match(Set c (FmaVF c (Binary a (LoadVector b)))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 0; + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma8F_reg(vecY a, vecY b, vecY c) %{ + predicate(UseFMA && n->as_Vector()->length() == 8); + match(Set c (FmaVF c (Binary a b))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 1; + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma8F_mem(vecY a, memory b, vecY c) %{ + predicate(UseFMA && n->as_Vector()->length() == 8); + match(Set c (FmaVF c (Binary a (LoadVector b)))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 1; + __ vfmaf($c$$XMMRegister, 
$a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{ + predicate(UseFMA && n->as_Vector()->length() == 16); + match(Set c (FmaVF c (Binary a b))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 2; + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// a * b + c +instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{ + predicate(UseFMA && n->as_Vector()->length() == 16); + match(Set c (FmaVF c (Binary a (LoadVector b)))); + format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %} + ins_cost(150); + ins_encode %{ + int vector_len = 2; + __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} --- old/src/share/vm/adlc/formssel.cpp 2017-06-05 10:06:28.069050500 -0700 +++ new/src/share/vm/adlc/formssel.cpp 2017-06-05 10:06:27.157055000 -0700 @@ -4174,6 +4174,7 @@ "URShiftVB","URShiftVS","URShiftVI","URShiftVL", "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD", "LoadVector","StoreVector", + "FmaVD", "FmaVF", // Next are not supported currently. 
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD" --- old/src/share/vm/opto/classes.hpp 2017-06-05 10:06:32.791334900 -0700 +++ new/src/share/vm/opto/classes.hpp 2017-06-05 10:06:31.855631400 -0700 @@ -309,6 +309,8 @@ macro(MulReductionVF) macro(MulVD) macro(MulReductionVD) +macro(FmaVD) +macro(FmaVF) macro(DivVF) macro(DivVD) macro(AbsVF) --- old/src/share/vm/opto/matcher.cpp 2017-06-05 10:06:37.273931500 -0700 +++ new/src/share/vm/opto/matcher.cpp 2017-06-05 10:06:36.347636100 -0700 @@ -977,7 +977,6 @@ // Use one stack to keep both: child's node/state and parent's node/index MStack mstack(max_stack * 2 * 2); // usually: C->live_nodes() * 2 * 2 mstack.push(n, Visit, NULL, -1); // set NULL as parent to indicate root - while (mstack.is_nonempty()) { C->check_node_count(NodeLimitFudgeFactor, "too many nodes matching instructions"); if (C->failing()) return NULL; @@ -2122,6 +2121,8 @@ case Op_EncodeISOArray: case Op_FmaD: case Op_FmaF: + case Op_FmaVD: + case Op_FmaVF: set_shared(n); // Force result into register (it will be anyways) break; case Op_ConP: { // Convert pointers above the centerline to NUL @@ -2311,7 +2312,9 @@ break; } case Op_FmaD: - case Op_FmaF: { + case Op_FmaF: + case Op_FmaVD: + case Op_FmaVF: { // Restructure into a binary tree for Matching. 
Node* pair = new BinaryNode(n->in(1), n->in(2)); n->set_req(2, pair); --- old/src/share/vm/opto/superword.cpp 2017-06-05 10:06:42.034131200 -0700 +++ new/src/share/vm/opto/superword.cpp 2017-06-05 10:06:41.043302600 -0700 @@ -2324,6 +2324,13 @@ const TypeVect* vt = TypeVect::make(bt, vlen); vn = new CMoveVDNode(cc, src1, src2, vt); NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();}) + } else if (opc == Op_FmaD || opc == Op_FmaF) { + // Promote operands to vector + Node* in1 = vector_opd(p, 1); + Node* in2 = vector_opd(p, 2); + Node* in3 = vector_opd(p, 3); + vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n)); + vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else { if (do_reserve_copy()) { NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");}) --- old/src/share/vm/opto/vectornode.cpp 2017-06-05 10:06:46.696512900 -0700 +++ new/src/share/vm/opto/vectornode.cpp 2017-06-05 10:06:45.792504700 -0700 @@ -86,6 +86,12 @@ case Op_MulD: assert(bt == T_DOUBLE, "must be"); return Op_MulVD; + case Op_FmaD: + assert(bt == T_DOUBLE, "must be"); + return Op_FmaVD; + case Op_FmaF: + assert(bt == T_FLOAT, "must be"); + return Op_FmaVF; case Op_CMoveD: assert(bt == T_DOUBLE, "must be"); return Op_CMoveVD; @@ -259,6 +265,11 @@ *start = 2; *end = n->req(); break; + case Op_FmaD: + case Op_FmaF: + *start = 1; + *end = 4; // 3 vector operands + break; default: *start = 1; *end = n->req(); // default is all operands @@ -328,6 +339,19 @@ } +VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt) { + const TypeVect* vt = TypeVect::make(bt, vlen); + int vopc = VectorNode::opcode(opc, bt); + // This method should not be called for unimplemented vectors. 
+ guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]); + switch (vopc) { + case Op_FmaVD: return new FmaVDNode(n1, n2, n3, vt); + case Op_FmaVF: return new FmaVFNode(n1, n2, n3, vt); + } + fatal("Missed vector creation for '%s'", NodeClassNames[vopc]); + return NULL; +} + // Scalar promotion VectorNode* VectorNode::scalar2vector(Node* s, uint vlen, const Type* opd_t) { BasicType bt = opd_t->array_element_basic_type(); --- old/src/share/vm/opto/vectornode.hpp 2017-06-05 10:06:51.291041000 -0700 +++ new/src/share/vm/opto/vectornode.hpp 2017-06-05 10:06:50.337026300 -0700 @@ -62,6 +62,7 @@ static VectorNode* scalar2vector(Node* s, uint vlen, const Type* opd_t); static VectorNode* shift_count(Node* shift, Node* cnt, uint vlen, BasicType bt); static VectorNode* make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt); + static VectorNode* make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt); static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); @@ -260,6 +261,22 @@ virtual int Opcode() const; }; +//------------------------------FmaVDNode-------------------------------------- +// Vector fused multiply-add double (a * b + c) +class FmaVDNode : public VectorNode { +public: + FmaVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} + virtual int Opcode() const; +}; + +//------------------------------FmaVFNode-------------------------------------- +// Vector fused multiply-add float (a * b + c) +class FmaVFNode : public VectorNode { +public: + FmaVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} + virtual int Opcode() const; +}; + //------------------------------CMoveVDNode-------------------------------------- // Vector multiply double class CMoveVDNode : public VectorNode { --- old/src/share/vm/runtime/vmStructs.cpp 2017-06-05 10:06:55.970048300 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2017-06-05 10:06:54.997742500 -0700 @@ -2052,6
+2052,8 @@ declare_c2_type(MulVFNode, VectorNode) \ declare_c2_type(MulReductionVFNode, ReductionNode) \ declare_c2_type(MulVDNode, VectorNode) \ + declare_c2_type(FmaVDNode, VectorNode) \ + declare_c2_type(FmaVFNode, VectorNode) \ declare_c2_type(CMoveVDNode, VectorNode) \ declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \