--- old/src/cpu/x86/vm/assembler_x86.cpp 2015-03-30 19:42:54.000000000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-03-30 19:42:53.000000000 -0700 @@ -3359,6 +3359,20 @@ // Integer vector arithmetic +void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); + emit_int8(0x01); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { + assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); + int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); + emit_int8(0x02); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::paddb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0xFC, dst, src, VEX_SIMD_66); @@ -3379,6 +3393,20 @@ emit_simd_arith(0xD4, dst, src, VEX_SIMD_66); } +void Assembler::phaddw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), "phaddw requires SSSE3"); // SSSE3 is not part of the x86_64 baseline, so check on all platforms + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_int8(0x01); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::phaddd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), "phaddd requires SSSE3"); // SSSE3 is not part of the x86_64 baseline, so check on all platforms + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); + emit_int8(0x02); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2"); emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, 
vector256); @@ -3804,6 +3832,17 @@ emit_int8(0x01); } +void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_avx(), ""); + bool vector256 = true; + int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); + emit_int8(0x19); + emit_int8((unsigned char)(0xC0 | encode)); + // 0x00 - extract lower 128 bits + // 0x01 - extract upper 128 bits + emit_int8(0x01); +} + void Assembler::vextractf128h(Address dst, XMMRegister src) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-03-30 19:42:54.000000000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-03-30 19:42:54.000000000 -0700 @@ -1777,6 +1777,12 @@ void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256); void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256); + // Add horizontal packed integers + void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256); + void phaddw(XMMRegister dst, XMMRegister src); + void phaddd(XMMRegister dst, XMMRegister src); + // Add packed integers void paddb(XMMRegister dst, XMMRegister src); void paddw(XMMRegister dst, XMMRegister src); @@ -1869,6 +1875,7 @@ // Copy low 128bit into high 128bit of YMM registers. void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); + void vextractf128h(XMMRegister dst, XMMRegister src); // Load/store high 128bit of YMM registers which does not destroy other half. 
void vinsertf128h(XMMRegister dst, Address src); --- old/src/cpu/x86/vm/x86.ad 2015-03-30 19:42:55.000000000 -0700 +++ new/src/cpu/x86/vm/x86.ad 2015-03-30 19:42:55.000000000 -0700 @@ -623,6 +623,25 @@ if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX return false; break; + case Op_AddReductionVL: + if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here + return false; + break; + case Op_AddReductionVI: + if (UseSSE < 3 || !VM_Version::supports_ssse3()) // the SSE rules emit phaddd, which requires SSSE3 + return false; + break; + case Op_MulReductionVI: + if (UseSSE < 4) // requires at least SSE4 + return false; + break; + case Op_AddReductionVF: + case Op_AddReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVD: + if (UseSSE < 1) // requires at least SSE + return false; + break; case Op_CompareAndSwapL: #ifdef _LP64 case Op_CompareAndSwapP: @@ -2532,6 +2551,574 @@ ins_pipe( fpu_reg_reg ); %} +// ====================REDUCTION ARITHMETIC======================================= + +instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseSSE > 2 && UseAVX == 0); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp2, TEMP tmp); + format %{ "movdqu $tmp2,$src2\n\t" + "phaddd $tmp2,$tmp2\n\t" + "movd $tmp,$src1\n\t" + "paddd $tmp,$tmp2\n\t" + "movd $dst,$tmp\t! add reduction2I" %} + ins_encode %{ + __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister); + __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ movdl($tmp$$XMMRegister, $src1$$Register); + __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdl($dst$$Register, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vphaddd $tmp,$src2,$src2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! 
add reduction2I" %} + ins_encode %{ + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseSSE > 2 && UseAVX == 0); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp2, TEMP tmp); + format %{ "movdqu $tmp2,$src2\n\t" + "phaddd $tmp2,$tmp2\n\t" + "phaddd $tmp2,$tmp2\n\t" + "movd $tmp,$src1\n\t" + "paddd $tmp,$tmp2\n\t" + "movd $dst,$tmp\t! add reduction4I" %} + ins_encode %{ + __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister); + __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ movdl($tmp$$XMMRegister, $src1$$Register); + __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdl($dst$$Register, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vphaddd $tmp,$src2,$src2\n\t" + "vphaddd $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! 
add reduction4I" %} + ins_encode %{ + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); + __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vphaddd $tmp,$src2,$src2\n\t" + "vphaddd $tmp,$tmp,$tmp2\n\t" + "vextractf128 $tmp2,$tmp\n\t" + "vpaddd $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpaddd $tmp2,$tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! add reduction8I" %} + ins_encode %{ + __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true); + __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true); + __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (AddReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "movdqu $tmp,$src1\n\t" + "addss $tmp,$src2\n\t" + "pshufd $tmp2,$src2,0x01\n\t" + "addss $tmp,$tmp2\n\t" + "movdqu $dst,$tmp\t! 
add reduction2F" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ addss($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); + __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVF src1 src2)); + effect(TEMP tmp2, TEMP tmp); + format %{ "vaddss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vaddss $dst,$tmp2,$tmp\t! add reduction2F" %} + ins_encode %{ + __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (AddReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "movdqu $tmp,$src1\n\t" + "addss $tmp,$src2\n\t" + "pshufd $tmp2,$src2,0x01\n\t" + "addss $tmp,$tmp2\n\t" + "pshufd $tmp2,$src2,0x02\n\t" + "addss $tmp,$tmp2\n\t" + "pshufd $tmp2,$src2,0x03\n\t" + "addss $tmp,$tmp2\n\t" + "movdqu $dst,$tmp\t! 
add reduction4F" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ addss($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); + __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02); + __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03); + __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vaddss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vaddss $dst,$tmp2,$tmp\t! add reduction4F" %} + ins_encode %{ + __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vaddss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "vextractf128 $tmp3,$src2\n\t" + "vaddss 
$tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vaddss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vaddss $dst,$tmp2,$tmp\t! add reduction8F" %} + ins_encode %{ + __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (AddReductionVD src1 src2)); + effect(TEMP tmp, TEMP dst); + format %{ "movdqu $tmp,$src1\n\t" + "addsd $tmp,$src2\n\t" + "pshufd $dst,$src2,0xE\n\t" + "addsd $dst,$tmp\t! 
add reduction2D" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ addsd($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE); + __ addsd($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVD src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vaddsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vaddsd $dst,$tmp2,$tmp\t! add reduction2D" %} + ins_encode %{ + __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{ + predicate(UseAVX > 0); + match(Set dst (AddReductionVD src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vaddsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vaddsd $tmp2,$tmp2,$tmp\n\t" + "vextractf128 $tmp3,$src2\n\t" + "vaddsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vaddsd $dst,$tmp2,$tmp\t! 
add reduction4D" %} + ins_encode %{ + __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseSSE > 3 && UseAVX == 0); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0x1\n\t" + "pmulld $tmp2,$src2\n\t" + "movd $tmp,$src1\n\t" + "pmulld $tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! mul reduction2I" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); + __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister); + __ movdl($tmp$$XMMRegister, $src1$$Register); + __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0x1\n\t" + "vpmulld $tmp,$src2,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction2I" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); + __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseSSE > 3 && UseAVX == 0); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "pmulld $tmp2,$src2\n\t" + "pshufd $tmp,$tmp2,0x1\n\t" + "pmulld $tmp2,$tmp\n\t" + "movd $tmp,$src1\n\t" + "pmulld $tmp2,$tmp\n\t" + "movd $dst,$tmp2\t! mul reduction4I" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); + __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1); + __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); + __ movdl($tmp$$XMMRegister, $src1$$Register); + __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "pshufd $tmp2,$src2,0xE\n\t" + "vpmulld $tmp,$src2,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! 
mul reduction4I" %} + ins_encode %{ + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVI src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "vextractf128 $tmp,$src2\n\t" + "vpmulld $tmp,$tmp,$src2\n\t" + "pshufd $tmp2,$tmp,0xE\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "pshufd $tmp2,$tmp,0x1\n\t" + "vpmulld $tmp,$tmp,$tmp2\n\t" + "movd $tmp2,$src1\n\t" + "vpmulld $tmp2,$tmp,$tmp2\n\t" + "movd $dst,$tmp2\t! mul reduction8I" %} + ins_encode %{ + __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); + __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($tmp2$$XMMRegister, $src1$$Register); + __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); + __ movdl($dst$$Register, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (MulReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2); + format %{ "movdqu $tmp,$src1\n\t" + "mulss $tmp,$src2\n\t" + "pshufd $tmp2,$src2,0x01\n\t" + "mulss $tmp,$tmp2\n\t" + "movdqu $dst,$tmp\t! 
mul reduction2F" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); + __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVF src1 src2)); // dst = src1 * src2[0] * src2[1] + effect(TEMP tmp, TEMP tmp2); + format %{ "vmulss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vmulss $dst,$tmp2,$tmp\t! mul reduction2F" %} + ins_encode %{ + __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (MulReductionVF src1 src2)); // dst = src1 * src2[0] * src2[1] * src2[2] * src2[3] + effect(TEMP tmp, TEMP tmp2); + format %{ "movdqu $tmp,$src1\n\t" + "mulss $tmp,$src2\n\t" + "pshufd $tmp2,$src2,0x01\n\t" + "mulss $tmp,$tmp2\n\t" + "pshufd $tmp2,$src2,0x02\n\t" + "mulss $tmp,$tmp2\n\t" + "pshufd $tmp2,$src2,0x03\n\t" + "mulss $tmp,$tmp2\n\t" + "movdqu $dst,$tmp\t! 
mul reduction4F" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); + __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02); + __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03); + __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister); + __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVF src1 src2)); // dst = src1 * src2[0] * src2[1] * src2[2] * src2[3] + effect(TEMP tmp, TEMP tmp2); + format %{ "vmulss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vmulss $dst,$tmp2,$tmp\t! mul reduction4F" %} + ins_encode %{ + __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVF src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vmulss $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$src2,0x03\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "vextractf128 $tmp3,$src2\n\t" + "vmulss 
$tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0x01\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x02\n\t" + "vmulss $tmp2,$tmp2,$tmp\n\t" + "pshufd $tmp,$tmp3,0x03\n\t" + "vmulss $dst,$tmp2,$tmp\t! mul reduction8F" %} + ins_encode %{ + __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); + __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); + __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{ + predicate(UseSSE >= 1 && UseAVX == 0); + match(Set dst (MulReductionVD src1 src2)); + effect(TEMP tmp, TEMP dst); + format %{ "movdqu $tmp,$src1\n\t" + "mulsd $tmp,$src2\n\t" + "pshufd $dst,$src2,0xE\n\t" + "mulsd $dst,$tmp\t! 
mul reduction2D" %} + ins_encode %{ + __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); + __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister); + __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE); + __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVD src1 src2)); // dst = src1 * src2[0] * src2[1] + effect(TEMP tmp, TEMP tmp2); + format %{ "vmulsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vmulsd $dst,$tmp2,$tmp\t! mul reduction2D" %} + ins_encode %{ + __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{ + predicate(UseAVX > 0); + match(Set dst (MulReductionVD src1 src2)); + effect(TEMP tmp, TEMP tmp2, TEMP tmp3); + format %{ "vmulsd $tmp2,$src1,$src2\n\t" + "pshufd $tmp,$src2,0xE\n\t" + "vmulsd $tmp2,$tmp2,$tmp\n\t" + "vextractf128 $tmp3,$src2\n\t" + "vmulsd $tmp2,$tmp2,$tmp3\n\t" + "pshufd $tmp,$tmp3,0xE\n\t" + "vmulsd $dst,$tmp2,$tmp\t! 
mul reduction4D" %} + ins_encode %{ + __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); + __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); + __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); + __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + // ====================VECTOR ARITHMETIC======================================= // --------------------------------- ADD -------------------------------------- --- old/src/share/vm/adlc/formssel.cpp 2015-03-30 19:42:56.000000000 -0700 +++ new/src/share/vm/adlc/formssel.cpp 2015-03-30 19:42:55.000000000 -0700 @@ -4043,6 +4043,13 @@ strcmp(opType,"ReplicateL")==0 || strcmp(opType,"ReplicateF")==0 || strcmp(opType,"ReplicateD")==0 || + strcmp(opType,"AddReductionVI")==0 || + strcmp(opType,"AddReductionVL")==0 || + strcmp(opType,"AddReductionVF")==0 || + strcmp(opType,"AddReductionVD")==0 || + strcmp(opType,"MulReductionVI")==0 || + strcmp(opType,"MulReductionVF")==0 || + strcmp(opType,"MulReductionVD")==0 || 0 /* 0 to line up columns nicely */ ) return 1; } @@ -4135,6 +4142,10 @@ "MulVS","MulVI","MulVF","MulVD", "DivVF","DivVD", "AndV" ,"XorV" ,"OrV", + "AddReductionVI", "AddReductionVL", + "AddReductionVF", "AddReductionVD", + "MulReductionVI", + "MulReductionVF", "MulReductionVD", "LShiftCntV","RShiftCntV", "LShiftVB","LShiftVS","LShiftVI","LShiftVL", "RShiftVB","RShiftVS","RShiftVI","RShiftVL", --- old/src/share/vm/opto/c2_globals.hpp 2015-03-30 19:42:56.000000000 -0700 +++ new/src/share/vm/opto/c2_globals.hpp 2015-03-30 19:42:56.000000000 -0700 @@ -324,6 +324,9 @@ develop(bool, SuperWordRTDepCheck, false, \ "Enable runtime dependency checks.") \ \ + product(bool, SuperWordReductions, true, \ + "Enable reductions support in superword.") \ + 
\ notproduct(bool, TraceSuperWord, false, \ "Trace superword transforms") \ \ --- old/src/share/vm/opto/classes.hpp 2015-03-30 19:42:57.000000000 -0700 +++ new/src/share/vm/opto/classes.hpp 2015-03-30 19:42:57.000000000 -0700 @@ -266,9 +266,13 @@ macro(AddVB) macro(AddVS) macro(AddVI) +macro(AddReductionVI) macro(AddVL) +macro(AddReductionVL) macro(AddVF) +macro(AddReductionVF) macro(AddVD) +macro(AddReductionVD) macro(SubVB) macro(SubVS) macro(SubVI) @@ -277,8 +281,11 @@ macro(SubVD) macro(MulVS) macro(MulVI) +macro(MulReductionVI) macro(MulVF) +macro(MulReductionVF) macro(MulVD) +macro(MulReductionVD) macro(DivVF) macro(DivVD) macro(LShiftCntV) --- old/src/share/vm/opto/compile.cpp 2015-03-30 19:42:57.000000000 -0700 +++ new/src/share/vm/opto/compile.cpp 2015-03-30 19:42:57.000000000 -0700 @@ -3049,6 +3049,15 @@ case Op_StoreVector: break; + case Op_AddReductionVI: + case Op_AddReductionVL: + case Op_AddReductionVF: + case Op_AddReductionVD: + case Op_MulReductionVI: + case Op_MulReductionVF: + case Op_MulReductionVD: + break; + case Op_PackB: case Op_PackS: case Op_PackI: --- old/src/share/vm/opto/loopTransform.cpp 2015-03-30 19:42:58.000000000 -0700 +++ new/src/share/vm/opto/loopTransform.cpp 2015-03-30 19:42:58.000000000 -0700 @@ -1524,6 +1524,44 @@ } } +void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) { + if (SuperWordReductions == false) return; + + CountedLoopNode* loop_head = loop->_head->as_CountedLoop(); + if (loop_head->unrolled_count() > 1) { + return; + } + + Node* trip_phi = loop_head->phi(); + for (DUIterator_Fast imax, i = loop_head->fast_outs(imax); i < imax; i++) { + Node* phi = loop_head->fast_out(i); + if (phi->is_Phi() && phi->outcnt() > 0 && phi != trip_phi) { + // For definitions which are loop inclusive and not tripcounts. 
+ Node* def_node = phi->in(LoopNode::LoopBackControl); + + if (def_node != NULL) { + Node* n_ctrl = get_ctrl(def_node); + if (n_ctrl != NULL && loop->is_member(get_loop(n_ctrl))) { + // Now test it to see if it fits the standard pattern for a reduction operator. + int opc = def_node->Opcode(); + if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type())) { + if (!def_node->is_reduction()) { // Not marked yet + // To be a reduction, the arithmetic node must have the phi as input and provide a def to it + for (unsigned j = 1; j < def_node->req(); j++) { + Node* in = def_node->in(j); + if (in == phi) { + def_node->add_flag(Node::Flag_is_reduction); + break; + } + } + } + } + } + } + } + } +} + //------------------------------dominates_backedge--------------------------------- // Returns true if ctrl is executed on every complete iteration bool IdealLoopTree::dominates_backedge(Node* ctrl) { @@ -2361,8 +2399,10 @@ // an even number of trips). If we are peeling, we might enable some RCE // and we'd rather unroll the post-RCE'd loop SO... do not unroll if // peeling. - if (should_unroll && !should_peel) - phase->do_unroll(this,old_new, true); + if (should_unroll && !should_peel) { + phase->mark_reductions(this); + phase->do_unroll(this, old_new, true); + } // Adjust the pre-loop limits to align the main body // iterations. --- old/src/share/vm/opto/loopnode.hpp 2015-03-30 19:42:59.000000000 -0700 +++ new/src/share/vm/opto/loopnode.hpp 2015-03-30 19:42:59.000000000 -0700 @@ -872,6 +872,9 @@ // Unroll the loop body one step - make each trip do 2 iterations. 
void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip ); + // Mark vector reduction candidates before loop unrolling + void mark_reductions( IdealLoopTree *loop ); + // Return true if exp is a constant times an induction var bool is_scaled_iv(Node* exp, Node* iv, int* p_scale); --- old/src/share/vm/opto/node.hpp 2015-03-30 19:42:59.000000000 -0700 +++ new/src/share/vm/opto/node.hpp 2015-03-30 19:42:59.000000000 -0700 @@ -673,7 +673,8 @@ Flag_avoid_back_to_back_before = Flag_may_be_short_branch << 1, Flag_avoid_back_to_back_after = Flag_avoid_back_to_back_before << 1, Flag_has_call = Flag_avoid_back_to_back_after << 1, - Flag_is_expensive = Flag_has_call << 1, + Flag_is_reduction = Flag_has_call << 1, + Flag_is_expensive = Flag_is_reduction << 1, _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination }; @@ -701,6 +702,10 @@ const jushort flags() const { return _flags; } + void add_flag(jushort fl) { init_flags(fl); } + + void remove_flag(jushort fl) { clear_flag(fl); } + // Return a dense integer opcode number virtual int Opcode() const; @@ -852,6 +857,10 @@ // The node is expensive: the best control is set during loop opts bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != NULL; } + // An arithmetic node which accumulates data in a loop. + // It must have the loop's phi as input and provide a def to the phi. + bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; } + //----------------- Optimization // Get the worst-case Type output for this Node.
--- old/src/share/vm/opto/superword.cpp 2015-03-30 19:43:00.000000000 -0700 +++ new/src/share/vm/opto/superword.cpp 2015-03-30 19:43:00.000000000 -0700 @@ -65,7 +65,8 @@ _lpt(NULL), // loop tree node _lp(NULL), // LoopNode _bb(NULL), // basic block - _iv(NULL) // induction var + _iv(NULL), // induction var + _race_possible(false) // cases where SDMU is true {} //------------------------------transform_loop--------------------------- @@ -145,7 +146,6 @@ void SuperWord::SLP_extract() { // Ready the block - if (!construct_bb()) return; // Exit if no interesting nodes or complex graph. @@ -640,7 +640,7 @@ } if (isomorphic(s1, s2)) { - if (independent(s1, s2)) { + if (independent(s1, s2) || reduction(s1, s2)) { if (!exists_at(s1, 0) && !exists_at(s2, 1)) { if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) { int s1_align = alignment(s1); @@ -718,6 +718,28 @@ return independent_path(shallow, deep); } +//------------------------------reduction--------------------------- +// Is there a data path between s1 and s2, and are both nodes reductions?
+bool SuperWord::reduction(Node* s1, Node* s2) { + bool retValue = false; + int d1 = depth(s1); + int d2 = depth(s2); + if (d1 + 1 == d2) { + if (s1->is_reduction() && s2->is_reduction()) { + // This is an ordered set, so s1 should define s2 + for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { + Node* t1 = s1->fast_out(i); + if (t1 == s2) { + // both nodes are reductions and connected + retValue = true; + } + } + } + } + + return retValue; +} + //------------------------------independent_path------------------------------ // Helper for independent bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) { @@ -761,6 +783,7 @@ void SuperWord::extend_packlist() { bool changed; do { + packset_sort(_packset.length()); changed = false; for (int i = 0; i < _packset.length(); i++) { Node_List* p = _packset.at(i); @@ -769,6 +792,13 @@ } } while (changed); + if (_race_possible) { + for (int i = 0; i < _packset.length(); i++) { + Node_List* p = _packset.at(i); + order_def_uses(p); + } + } + #ifndef PRODUCT if (TraceSuperWord) { tty->print_cr("\nAfter extend_packlist"); @@ -825,10 +855,12 @@ int align = alignment(s1); int savings = -1; + int num_s1_uses = 0; Node* u1 = NULL; Node* u2 = NULL; for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { Node* t1 = s1->fast_out(i); + num_s1_uses++; if (!in_bb(t1)) continue; for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) { Node* t2 = s2->fast_out(j); @@ -845,6 +877,9 @@ } } } + if (num_s1_uses > 1) { + _race_possible = true; + } if (savings >= 0) { Node_List* pair = new Node_List(); pair->push(u1); @@ -856,9 +891,64 @@ return changed; } +//------------------------------order_def_uses--------------------------- +// For extended packsets, ordinally arrange uses packset by major component +void SuperWord::order_def_uses(Node_List* p) { + Node* s1 = p->at(0); + + if (s1->is_Store()) return; + + // reductions are always managed beforehand + if (s1->is_reduction()) return; + + 
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { + Node* t1 = s1->fast_out(i); + + // Only allow operand swap on commuting operations + if (!t1->is_Add() && !t1->is_Mul()) { + break; + } + + // Now find t1's packset + Node_List* p2 = NULL; + for (int j = 0; j < _packset.length(); j++) { + p2 = _packset.at(j); + Node* first = p2->at(0); + if (t1 == first) { + break; + } + p2 = NULL; + } + // Arrange all sub components by the major component + if (p2 != NULL) { + for (uint j = 1; j < p->size(); j++) { + Node* d1 = p->at(j); + Node* u1 = p2->at(j); + opnd_positions_match(s1, t1, d1, u1); + } + } + } +} + //---------------------------opnd_positions_match------------------------- // Is the use of d1 in u1 at the same operand position as d2 in u2? bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { + // check reductions to see if they are marshalled to represent the reduction + // operator in a specified opnd + if (u1->is_reduction() && u2->is_reduction()) { + // ensure reductions have phis and reduction definitions feeding the 1st operand + Node* first = u1->in(2); + if (first->is_Phi() || first->is_reduction()) { + u1->swap_edges(1, 2); + } + // ensure reductions have phis and reduction definitions feeding the 1st operand + first = u2->in(2); + if (first->is_Phi() || first->is_reduction()) { + u2->swap_edges(1, 2); + } + return true; + } + uint ct = u1->req(); if (ct != u2->req()) return false; uint i1 = 0; @@ -940,7 +1030,8 @@ for (int i = 0; i < _packset.length(); i++) { Node_List* p1 = _packset.at(i); if (p1 == NULL) continue; - for (int j = 0; j < _packset.length(); j++) { + // Because of sorting we can start at i + 1 + for (int j = i + 1; j < _packset.length(); j++) { Node_List* p2 = _packset.at(j); if (p2 == NULL) continue; if (i == j) continue; @@ -1067,8 +1158,19 @@ //------------------------------implemented--------------------------- // Can code be generated for pack p? 
bool SuperWord::implemented(Node_List* p) { + bool retValue = false; Node* p0 = p->at(0); - return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0)); + if (p0 != NULL) { + int opc = p0->Opcode(); + uint size = p->size(); + if (p0->is_reduction()) { + const Type *arith_type = p0->bottom_type(); + retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); + } else { + retValue = VectorNode::implemented(opc, size, velt_basic_type(p0)); + } + } + return retValue; } //------------------------------same_inputs-------------------------- @@ -1102,6 +1204,18 @@ if (!is_vector_use(p0, i)) return false; } + // Check if reductions are connected + if (p0->is_reduction()) { + Node* second_in = p0->in(2); + Node_List* second_pk = my_pack(second_in); + if (second_pk == NULL) { + // Remove reduction flag if no parent pack, it is not profitable + p0->remove_flag(Node::Flag_is_reduction); + return false; + } else if (second_pk->size() != p->size()) { + return false; + } + } if (VectorNode::is_shift(p0)) { // For now, return false if shift count is vector or not scalar promotion // case (different shift counts) because it is not supported yet. 
@@ -1123,6 +1237,9 @@ for (uint k = 0; k < use->req(); k++) { Node* n = use->in(k); if (def == n) { + // reductions can be loop carried dependences + if (def->is_reduction() && use->is_Phi()) + continue; if (!is_vector_use(use, k)) { return false; } @@ -1407,16 +1524,33 @@ vlen_in_bytes = vn->as_StoreVector()->memory_size(); } else if (n->req() == 3) { // Promote operands to vector - Node* in1 = vector_opd(p, 1); + Node* in1 = NULL; + bool node_isa_reduction = n->is_reduction(); + if (node_isa_reduction) { + // the input to the first reduction operation is retained + in1 = low_adr->in(1); + } else { + in1 = vector_opd(p, 1); + } Node* in2 = vector_opd(p, 2); - if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) { + if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) { // Move invariant vector input into second position to avoid register spilling. Node* tmp = in1; in1 = in2; in2 = tmp; } - vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); + if (node_isa_reduction) { + const Type *arith_type = n->bottom_type(); + vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type()); + if (in2->is_Load()) { + vlen_in_bytes = in2->as_LoadVector()->memory_size(); + } else { + vlen_in_bytes = in2->as_Vector()->length_in_bytes(); + } + } else { + vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); + vlen_in_bytes = vn->as_Vector()->length_in_bytes(); + } } else { ShouldNotReachHere(); } @@ -1556,6 +1690,8 @@ _n_idx_list.pop(); Node* def = use->in(idx); + if (def->is_reduction()) continue; + // Insert extract operation _igvn.hash_delete(def); int def_pos = alignment(def) / data_size(def); @@ -1576,6 +1712,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) { Node_List* u_pk = my_pack(use); if (u_pk == NULL) return false; + if (use->is_reduction()) return true; Node* def = use->in(u_idx); Node_List* d_pk = 
my_pack(def); if (d_pk == NULL) { @@ -1613,7 +1750,7 @@ // by the visited and post_visited sets, // and count number of nodes in block. int bb_ct = 0; - for (uint i = 0; i < lpt()->_body.size(); i++ ) { + for (uint i = 0; i < lpt()->_body.size(); i++) { Node *n = lpt()->_body.at(i); set_bb_idx(n, i); // Create a temporary map if (in_bb(n)) { @@ -1674,6 +1811,7 @@ // Do a depth first walk over out edges int rpo_idx = bb_ct - 1; int size; + int reduction_uses = 0; while ((size = _stk.length()) > 0) { Node* n = _stk.top(); // Leave node on stack if (!visited_test_set(n)) { @@ -1685,7 +1823,23 @@ if (in_bb(use) && !visited_test(use) && // Don't go around backedge (!use->is_Phi() || n == entry)) { - _stk.push(use); + if (use->is_reduction()) { + // First see if we can map the reduction on the given system we are on, then + // make a data entry operation for each reduction we see. + const Type *arith_type = use->bottom_type(); + int vopc = ReductionNode::opcode(use->Opcode(), arith_type->basic_type()); + if (vopc != use->Opcode()) { + if (Matcher::match_rule_supported(vopc)) { + _stk.push(use); + reduction_uses++; + } else { + // failed a support issue + return false; + } + } + } else { + _stk.push(use); + } } } if (_stk.length() == size) { @@ -1708,7 +1862,8 @@ set_bb_idx(n, j); } - initialize_bb(); // Ensure extra info is allocated. + // Ensure extra info is allocated. 
+ initialize_bb(); #ifndef PRODUCT if (TraceSuperWord) { @@ -1726,7 +1881,7 @@ } #endif assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found"); - return (_mem_slice_head.length() > 0) || (_data_entry.length() > 0); + return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0); } //------------------------------initialize_bb--------------------------- @@ -1959,6 +2114,28 @@ _packset.remove_at(pos); } +void SuperWord::packset_sort(int n) +{ + // simple bubble sort, so that the cost is only O(n) when the packset is already sorted + while (n != 0) { + bool swapped = false; + for (int i = 1; i < n; i++) { + Node_List* q_low = _packset.at(i-1); + Node_List* q_i = _packset.at(i); + + // only swap when we find something to swap + if (alignment(q_low->at(0)) > alignment(q_i->at(0))) { + Node_List* t = q_i; + *(_packset.adr_at(i)) = q_low; + *(_packset.adr_at(i-1)) = q_i; + swapped = true; + } + } + if (swapped == false) break; + n--; + } +} + //------------------------------executed_first--------------------------- // Return the node executed first in pack p. Uses the RPO block list // to determine order. --- old/src/share/vm/opto/superword.hpp 2015-03-30 19:43:01.000000000 -0700 +++ new/src/share/vm/opto/superword.hpp 2015-03-30 19:43:01.000000000 -0700 @@ -249,6 +249,7 @@ LoopNode* _lp; // Current LoopNode Node* _bb; // Current basic block PhiNode* _iv; // Induction var + bool _race_possible; // In cases where SDMU is true // Accessors Arena* arena() { return _arena; } @@ -337,6 +338,8 @@ bool isomorphic(Node* s1, Node* s2); // Is there no data path from s1 to s2 or s2 to s1? bool independent(Node* s1, Node* s2); + // Is there a data path between s1 and s2 and both are reductions?
+ bool reduction(Node* s1, Node* s2); // Helper for independent bool independent_path(Node* shallow, Node* deep, uint dp=0); void set_alignment(Node* s1, Node* s2, int align); @@ -347,6 +350,8 @@ bool follow_use_defs(Node_List* p); // Extend the packset by visiting uses of nodes in pack p bool follow_def_uses(Node_List* p); + // For extended packsets, ordinally arrange uses packset by major component + void order_def_uses(Node_List* p); // Estimate the savings from executing s1 and s2 as a pack int est_savings(Node* s1, Node* s2); int adjacent_profit(Node* s1, Node* s2); @@ -419,9 +424,12 @@ void print_bb(); void print_stmt(Node* s); char* blank(uint depth); + + void packset_sort(int n); }; + //------------------------------SWPointer--------------------------- // Information about an address for dependence checking and vector alignment class SWPointer VALUE_OBJ_CLASS_SPEC { --- old/src/share/vm/opto/vectornode.cpp 2015-03-30 19:43:01.000000000 -0700 +++ new/src/share/vm/opto/vectornode.cpp 2015-03-30 19:43:01.000000000 -0700 @@ -250,7 +250,6 @@ int vopc = VectorNode::opcode(opc, bt); // This method should not be called for unimplemented vectors. 
guarantee(vopc > 0, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc])); - switch (vopc) { case Op_AddVB: return new AddVBNode(n1, n2, vt); case Op_AddVS: return new AddVSNode(n1, n2, vt); @@ -441,3 +440,72 @@ return NULL; } +int ReductionNode::opcode(int opc, BasicType bt) { + int vopc = opc; + switch (opc) { + case Op_AddI: + assert(bt == T_INT, "must be"); + vopc = Op_AddReductionVI; + break; + case Op_AddL: + assert(bt == T_LONG, "must be"); + vopc = Op_AddReductionVL; + break; + case Op_AddF: + assert(bt == T_FLOAT, "must be"); + vopc = Op_AddReductionVF; + break; + case Op_AddD: + assert(bt == T_DOUBLE, "must be"); + vopc = Op_AddReductionVD; + break; + case Op_MulI: + assert(bt == T_INT, "must be"); + vopc = Op_MulReductionVI; + break; + case Op_MulF: + assert(bt == T_FLOAT, "must be"); + vopc = Op_MulReductionVF; + break; + case Op_MulD: + assert(bt == T_DOUBLE, "must be"); + vopc = Op_MulReductionVD; + break; + // TODO: add MulL for targets that support it + default: + break; + } + return vopc; +} + +// Return the appropriate reduction node. +ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) { + + int vopc = opcode(opc, bt); + + // This method should not be called for unimplemented vectors. 
+ guarantee(vopc != opc, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc])); + + switch (vopc) { + case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2); + case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2); + case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2); + case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2); + case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2); + case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2); + case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2); + } + fatal(err_msg_res("Missed vector creation for '%s'", NodeClassNames[vopc])); + return NULL; +} + +bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { + if (is_java_primitive(bt) && + (vlen > 1) && is_power_of_2(vlen) && + Matcher::vector_size_supported(bt, vlen)) { + int vopc = ReductionNode::opcode(opc, bt); + return vopc != opc && Matcher::match_rule_supported(vopc); + } + return false; +} + --- old/src/share/vm/opto/vectornode.hpp 2015-03-30 19:43:02.000000000 -0700 +++ new/src/share/vm/opto/vectornode.hpp 2015-03-30 19:43:02.000000000 -0700 @@ -90,6 +90,37 @@ virtual int Opcode() const; }; +//------------------------------ReductionNode------------------------------------ +// Perform reduction of a vector +class ReductionNode : public Node { + public: + ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2) {} + + static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); + static int opcode(int opc, BasicType bt); + static bool implemented(int opc, uint vlen, BasicType bt); +}; + +//------------------------------AddReductionVINode-------------------------------------- +// Vector add int as a reduction +class AddReductionVINode : public ReductionNode { +public: + AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const 
Type* bottom_type() const { return TypeInt::INT; } + virtual uint ideal_reg() const { return Op_RegI; } +}; + +//------------------------------AddReductionVLNode-------------------------------------- +// Vector add long as a reduction +class AddReductionVLNode : public ReductionNode { +public: + AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return TypeLong::LONG; } + virtual uint ideal_reg() const { return Op_RegL; } +}; + //------------------------------AddVLNode-------------------------------------- // Vector add long class AddVLNode : public VectorNode { @@ -106,6 +137,16 @@ virtual int Opcode() const; }; +//------------------------------AddReductionVFNode-------------------------------------- +// Vector add float as a reduction +class AddReductionVFNode : public ReductionNode { +public: + AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return Type::FLOAT; } + virtual uint ideal_reg() const { return Op_RegF; } +}; + //------------------------------AddVDNode-------------------------------------- // Vector add double class AddVDNode : public VectorNode { @@ -114,6 +155,16 @@ virtual int Opcode() const; }; +//------------------------------AddReductionVDNode-------------------------------------- +// Vector add double as a reduction +class AddReductionVDNode : public ReductionNode { +public: + AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return Type::DOUBLE; } + virtual uint ideal_reg() const { return Op_RegD; } +}; + //------------------------------SubVBNode-------------------------------------- // Vector subtract byte class SubVBNode : public VectorNode { @@ -178,6 +229,16 @@ virtual int Opcode() const; }; 
+//------------------------------MulReductionVINode-------------------------------------- +// Vector multiply int as a reduction +class MulReductionVINode : public ReductionNode { +public: + MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return TypeInt::INT; } + virtual uint ideal_reg() const { return Op_RegI; } +}; + //------------------------------MulVFNode-------------------------------------- // Vector multiply float class MulVFNode : public VectorNode { @@ -186,6 +247,16 @@ virtual int Opcode() const; }; +//------------------------------MulReductionVFNode-------------------------------------- +// Vector multiply float as a reduction +class MulReductionVFNode : public ReductionNode { +public: + MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return Type::FLOAT; } + virtual uint ideal_reg() const { return Op_RegF; } +}; + //------------------------------MulVDNode-------------------------------------- // Vector multiply double class MulVDNode : public VectorNode { @@ -194,6 +265,16 @@ virtual int Opcode() const; }; +//------------------------------MulReductionVDNode-------------------------------------- +// Vector multiply double as a reduction +class MulReductionVDNode : public ReductionNode { +public: + MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual const Type* bottom_type() const { return Type::DOUBLE; } + virtual uint ideal_reg() const { return Op_RegD; } +}; + //------------------------------DivVFNode-------------------------------------- // Vector divide float class DivVFNode : public VectorNode { --- old/src/share/vm/runtime/vmStructs.cpp 2015-03-30 19:43:03.000000000 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2015-03-30 19:43:02.000000000 -0700 
@@ -1982,13 +1982,18 @@ declare_c2_type(PowDNode, Node) \ declare_c2_type(ReverseBytesINode, Node) \ declare_c2_type(ReverseBytesLNode, Node) \ + declare_c2_type(ReductionNode, Node) \ declare_c2_type(VectorNode, Node) \ declare_c2_type(AddVBNode, VectorNode) \ declare_c2_type(AddVSNode, VectorNode) \ declare_c2_type(AddVINode, VectorNode) \ + declare_c2_type(AddReductionVINode, ReductionNode) \ declare_c2_type(AddVLNode, VectorNode) \ + declare_c2_type(AddReductionVLNode, ReductionNode) \ declare_c2_type(AddVFNode, VectorNode) \ + declare_c2_type(AddReductionVFNode, ReductionNode) \ declare_c2_type(AddVDNode, VectorNode) \ + declare_c2_type(AddReductionVDNode, ReductionNode) \ declare_c2_type(SubVBNode, VectorNode) \ declare_c2_type(SubVSNode, VectorNode) \ declare_c2_type(SubVINode, VectorNode) \ @@ -1997,8 +2002,11 @@ declare_c2_type(SubVDNode, VectorNode) \ declare_c2_type(MulVSNode, VectorNode) \ declare_c2_type(MulVINode, VectorNode) \ + declare_c2_type(MulReductionVINode, ReductionNode) \ declare_c2_type(MulVFNode, VectorNode) \ + declare_c2_type(MulReductionVFNode, ReductionNode) \ declare_c2_type(MulVDNode, VectorNode) \ + declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \ declare_c2_type(DivVDNode, VectorNode) \ declare_c2_type(LShiftVBNode, VectorNode) \ --- /dev/null 2015-03-30 19:43:03.000000000 -0700 +++ new/test/compiler/loopopts/superword/ProdRed_Double.java 2015-03-30 19:43:03.000000000 -0700 @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8074981 + * @summary Add C2 x86 Superword support for scalar product reduction optimizations : double test + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Double + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Double + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Double + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Double + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Double + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Double + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Double + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Double +
*/ + +public class ProdRed_Double +{ + public static void main(String[] args) throws Exception { + double[] a = new double[256*1024]; + double[] b = new double[256*1024]; + prodReductionInit(a,b); + double valid = 20000; + double total = 0; + for(int j = 0; j < 20000; j++) { + total = j + 1; + total = prodReductionImplement(a,b, total); + } + if(total == valid) { + System.out.println("Success"); + } else { + System.out.println("Invalid sum of elements variable in total: " + total); + System.out.println("Expected value = " + valid); + throw new Exception("Failed"); + } + } + + public static void prodReductionInit(double[] a, double[] b) + { + for(int i = 0; i < a.length; i++) + { + a[i] = i + 2; + b[i] = i + 1; + } + } + + public static double prodReductionImplement(double[] a, double[] b, double total) + { + for(int i = 0; i < a.length; i++) + { + total *= a[i] - b[i]; + } + return total; + } + +} --- /dev/null 2015-03-30 19:43:04.000000000 -0700 +++ new/test/compiler/loopopts/superword/ProdRed_Float.java 2015-03-30 19:43:04.000000000 -0700 @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8074981 + * @summary Add C2 x86 Superword support for scalar product reduction optimizations : float test + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Float + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Float + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Float + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Float + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Float + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Float + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Float + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Float + */ + +public class ProdRed_Float +{ + public static void main(String[] args) throws Exception { + float[] a = new float[256*1024]; + float[] b = new float[256*1024]; + prodReductionInit(a,b); + float valid = 20000; + float total = 0; + for(int j = 0; j < 20000; j++) { + total = j + 1; + total = prodReductionImplement(a,b, total); + } + if(total == valid) { + System.out.println("Success"); + } else { + System.out.println("Invalid sum of elements variable in total: " + total); + System.out.println("Expected value = " + valid); + throw new Exception("Failed"); 
+ } + } + + public static void prodReductionInit(float[] a, float[] b) + { + for(int i = 0; i < a.length; i++) + { + a[i] = i + 2; + b[i] = i + 1; + } + } + + public static float prodReductionImplement(float[] a, float[] b, float total) + { + for(int i = 0; i < a.length; i++) + { + total *= a[i] - b[i]; + } + return total; + } + +} --- /dev/null 2015-03-30 19:43:04.000000000 -0700 +++ new/test/compiler/loopopts/superword/ProdRed_Int.java 2015-03-30 19:43:04.000000000 -0700 @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 ProdRed_Int
+ */
+
+public class ProdRed_Int // Checks an int product-reduction loop against a precomputed expected value.
+{
+  public static void main(String[] args) throws Exception { // Builds the inputs, runs the reduction repeatedly, and throws if the final value differs from valid.
+    int[] a = new int[256*1024];
+    int[] b = new int[256*1024];
+    prodReductionInit(a,b);
+    int valid = -100663295; // expected final value of total after all iterations below
+    int total = 1; // starting value of the running product
+    for(int j = 0; j < 20000; j++) { // total is fed back in, so the product accumulates across all 20000 calls
+      total = prodReductionImplement(a,b,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total); // NOTE(review): message text says sum although this test multiplies; wording only
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed"); // the thrown exception is how the failure is signalled to the caller of main
+    }
+  }
+
+  public static void prodReductionInit(int[] a, int[] b) // Fills the inputs: a[i] = i + 2 and b[i] = i + 1 for every index of a.
+  {
+    for(int i = 0; i < a.length; i++) // bound comes from a only; b is indexed with the same i
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  public static int prodReductionImplement(int[] a, int[] b, int total) // Multiplies total by (a[i] + b[i]) for every index of a and returns the result.
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] + b[i]; // int product reduction into total; NOTE(review): presumably overflow wrap-around is relied on, since valid is negative while the factors set up by prodReductionInit are positive -- confirm
+    }
+    return total;
+  }
+
+}
--- /dev/null 2015-03-30 19:43:05.000000000 -0700
+++ new/test/compiler/loopopts/superword/SumRed_Double.java 2015-03-30 19:43:05.000000000 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Double
+ */
+
+public class SumRed_Double // Checks a double sum-reduction loop against a precomputed expected value.
+{
+  public static void main(String[] args) throws Exception { // Builds the inputs, runs the reduction repeatedly, and throws if the final value differs from valid.
+    double[] a = new double[256*1024];
+    double[] b = new double[256*1024];
+    double[] c = new double[256*1024];
+    double[] d = new double[256*1024]; // not passed to sumReductionInit; written by sumReductionImplement on every call
+    sumReductionInit(a,b,c);
+    double total = 0; // starting value of the running sum
+    double valid = 3.602859086516129E20; // expected final value of total after all iterations below
+    for(int j = 0; j < 20000; j++) { // total is fed back in, so the sum accumulates across all 20000 calls
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) { // exact comparison with no tolerance; NOTE(review): presumably the result must be bit-identical with and without SuperWordReductions -- confirm
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed"); // the thrown exception is how the failure is signalled to the caller of main
+    }
+  }
+
+  public static void sumReductionInit( // Fills a, b and c from the loop indices; with the single j == 0 pass below every element is assigned its own index i.
+    double[] a,
+    double[] b,
+    double[] c)
+  {
+    for(int j = 0; j < 1; j++) // executes exactly once, with j == 0
+    {
+      for(int i = 0; i < a.length; i++) // bound comes from a only; b and c are indexed with the same i
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static double sumReductionImplement( // Stores a[i] * b[i] + a[i] * c[i] + b[i] * c[i] into d[i], adds each d[i] to total, and returns total.
+    double[] a,
+    double[] b,
+    double[] c,
+    double[] d,
+    double total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // side effect: d is overwritten for every index of a
+      total += d[i]; // sum reduction into total, in index order
+    }
+    return total;
+  }
+
+}
--- /dev/null 2015-03-30 19:43:05.000000000 -0700
+++ new/test/compiler/loopopts/superword/SumRed_Float.java 2015-03-30 19:43:05.000000000 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Float
+ */
+
+public class SumRed_Float // Checks a float sum-reduction loop against a precomputed expected value.
+{
+  public static void main(String[] args) throws Exception { // Builds the inputs, runs the reduction repeatedly, and throws if the final value differs from valid.
+    float[] a = new float[256*1024];
+    float[] b = new float[256*1024];
+    float[] c = new float[256*1024];
+    float[] d = new float[256*1024]; // not passed to sumReductionInit; written by sumReductionImplement on every call
+    sumReductionInit(a,b,c);
+    float total = 0; // starting value of the running sum
+    float valid = (float)4.611686E18; // expected final value of total; the double literal is narrowed to float before comparing
+    for(int j = 0; j < 20000; j++) { // total is fed back in, so the sum accumulates across all 20000 calls
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) { // exact comparison with no tolerance; NOTE(review): presumably the result must be bit-identical with and without SuperWordReductions -- confirm
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed"); // the thrown exception is how the failure is signalled to the caller of main
+    }
+  }
+
+  public static void sumReductionInit( // Fills a, b and c from the loop indices; with the single j == 0 pass below every element is assigned its own index i.
+    float[] a,
+    float[] b,
+    float[] c)
+  {
+    for(int j = 0; j < 1; j++) // executes exactly once, with j == 0
+    {
+      for(int i = 0; i < a.length; i++) // bound comes from a only; b and c are indexed with the same i
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static float sumReductionImplement( // Stores a[i] * b[i] + a[i] * c[i] + b[i] * c[i] into d[i], adds each d[i] to total, and returns total.
+    float[] a,
+    float[] b,
+    float[] c,
+    float[] d,
+    float total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // side effect: d is overwritten for every index of a
+      total += d[i]; // sum reduction into total, in index order
+    }
+    return total;
+  }
+
+}
--- /dev/null 2015-03-30 19:43:06.000000000 -0700
+++ new/test/compiler/loopopts/superword/SumRed_Int.java 2015-03-30 19:43:06.000000000 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 SumRed_Int
+ */
+
+public class SumRed_Int // Checks an int sum-reduction loop against a precomputed expected value.
+{
+  public static void main(String[] args) throws Exception { // Builds the inputs, runs the reduction repeatedly, and throws if the final value differs from valid.
+    int[] a = new int[256*1024];
+    int[] b = new int[256*1024];
+    int[] c = new int[256*1024];
+    int[] d = new int[256*1024]; // not passed to sumReductionInit; written by sumReductionImplement on every call
+    sumReductionInit(a,b,c);
+    int total = 0; // starting value of the running sum
+    int valid = -1673527296; // expected final value of total after all iterations below
+    for(int j = 0; j < 20000; j++) { // total is fed back in, so the sum accumulates across all 20000 calls
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed"); // the thrown exception is how the failure is signalled to the caller of main
+    }
+  }
+
+  public static void sumReductionInit( // Fills a, b and c from the loop indices; with the single j == 0 pass below every element is assigned its own index i.
+    int[] a,
+    int[] b,
+    int[] c)
+  {
+    for(int j = 0; j < 1; j++) // executes exactly once, with j == 0
+    {
+      for(int i = 0; i < a.length; i++) // bound comes from a only; b and c are indexed with the same i
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static int sumReductionImplement( // Stores a[i] * b[i] + a[i] * c[i] + b[i] * c[i] into d[i], adds each d[i] to total, and returns total.
+    int[] a,
+    int[] b,
+    int[] c,
+    int[] d,
+    int total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // side effect: d is overwritten for every index of a
+      total += d[i]; // int sum reduction into total; NOTE(review): presumably overflow wrap-around is relied on, since valid is negative -- confirm
+    }
+    return total;
+  }
+
+}