--- old/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	2016-02-11 11:52:15.611511000 -0800
+++ new/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	2016-02-11 11:52:15.354485300 -0800
@@ -54,6 +54,7 @@
 define_pd_global(intx, InteriorEntryAlignment, 16);
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit, 60);
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize, 2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize, 64*K);
--- old/src/cpu/ppc/vm/c2_globals_ppc.hpp	2016-02-11 11:52:17.107660600 -0800
+++ new/src/cpu/ppc/vm/c2_globals_ppc.hpp	2016-02-11 11:52:16.884638300 -0800
@@ -54,6 +54,7 @@
 define_pd_global(bool, UseTLAB, true);
 define_pd_global(bool, ResizeTLAB, true);
 define_pd_global(intx, LoopUnrollLimit, 60);
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.
--- old/src/cpu/sparc/vm/c2_globals_sparc.hpp	2016-02-11 11:52:18.506800500 -0800
+++ new/src/cpu/sparc/vm/c2_globals_sparc.hpp	2016-02-11 11:52:18.283778200 -0800
@@ -52,6 +52,7 @@
 define_pd_global(bool, UseTLAB, true);
 define_pd_global(bool, ResizeTLAB, true);
 define_pd_global(intx, LoopUnrollLimit, 60); // Design center runs on 1.3.1
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 define_pd_global(intx, MinJumpTableSize, 5);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- old/src/cpu/x86/vm/assembler_x86.cpp	2016-02-11 11:52:19.893939200 -0800
+++ new/src/cpu/x86/vm/assembler_x86.cpp	2016-02-11 11:52:19.628912700 -0800
@@ -2361,7 +2361,7 @@
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2398,7 +2398,7 @@
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
   assert(UseAVX > 0, "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2486,7 +2486,7 @@
 void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2515,7 +2515,7 @@
 void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2640,7 +2640,7 @@
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2649,7 +2649,7 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2668,7 +2668,7 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2677,7 +2677,7 @@
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2782,7 +2782,7 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2791,7 +2791,7 @@
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2800,7 +2800,7 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2809,7 +2809,7 @@
 
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3993,7 +3993,7 @@
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4002,7 +4002,7 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4011,7 +4011,7 @@
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4024,7 +4024,7 @@
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4078,7 +4078,7 @@
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4087,7 +4087,7 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4096,7 +4096,7 @@
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4105,7 +4105,7 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4293,7 +4293,7 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4303,7 +4303,7 @@
 
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4313,7 +4313,7 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4323,7 +4323,7 @@
 
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4333,7 +4333,7 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4343,7 +4343,7 @@
 
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4353,7 +4353,7 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4363,7 +4363,7 @@
 
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4373,7 +4373,7 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4383,7 +4383,7 @@
 
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4393,7 +4393,7 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4403,7 +4403,7 @@
 
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4413,7 +4413,7 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4423,7 +4423,7 @@
 
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4433,7 +4433,7 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4443,7 +4443,7 @@
 
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -5901,7 +5901,7 @@
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5911,7 +5911,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -5922,7 +5922,7 @@
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5932,7 +5932,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6027,7 +6027,7 @@
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6036,7 +6036,7 @@
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
--- old/src/cpu/x86/vm/assembler_x86.hpp	2016-02-11 11:52:21.614111200 -0800
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2016-02-11 11:52:21.354085200 -0800
@@ -2044,11 +2044,11 @@
 class InstructionAttr {
 public:
   InstructionAttr(
-    int vector_len,
-    bool rex_vex_w,
-    bool legacy_mode,
-    bool no_reg_mask,
-    bool uses_vl)
+    int vector_len,     // The length of the vector to be applied in encoding - for both AVX and EVEX
+    bool rex_vex_w,     // Width of data: false for 32-bit or smaller data, true for 64-bit or specially defined data
+    bool legacy_mode,   // If true, this instruction is encoded as AVX or earlier; if false, EVEX encoding may be chosen
+    bool no_reg_mask,   // If true, k0 is used when EVEX encoding is chosen; otherwise k1 is used under the same condition
+    bool uses_vl)       // This instruction may have legacy constraints based on vector length for EVEX
     :
     _avx_vector_len(vector_len),
     _rex_vex_w(rex_vex_w),
--- old/src/cpu/x86/vm/c2_globals_x86.hpp	2016-02-11 11:52:23.134263200 -0800
+++ new/src/cpu/x86/vm/c2_globals_x86.hpp	2016-02-11 11:52:22.903240100 -0800
@@ -46,6 +46,7 @@
 define_pd_global(intx, ConditionalMoveLimit, 3);
 define_pd_global(intx, FreqInlineSize, 325);
 define_pd_global(intx, MinJumpTableSize, 10);
+define_pd_global(intx, LoopPercentProfileLimit, 30);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE, 13);
 define_pd_global(intx, FLOATPRESSURE, 14);
--- old/src/share/vm/opto/c2_globals.hpp	2016-02-11 11:52:24.477397500 -0800
+++ new/src/share/vm/opto/c2_globals.hpp	2016-02-11 11:52:24.239373700 -0800
@@ -182,6 +182,10 @@
           "Unroll loop bodies with node count less than this")             \
           range(0, max_jint / 4)                                           \
                                                                            \
+  product_pd(intx, LoopPercentProfileLimit,                                \
+          "Unroll loop bodies with % node count of profile limit")         \
+          range(10, 100)                                                   \
+                                                                           \
   product(intx, LoopMaxUnroll, 16,                                         \
           "Maximum number of unrolls for main loop")                       \
           range(0, max_jint)                                               \
--- old/src/share/vm/opto/loopTransform.cpp	2016-02-11 11:52:25.914541200 -0800
+++ new/src/share/vm/opto/loopTransform.cpp	2016-02-11 11:52:25.683518100 -0800
@@ -666,7 +666,8 @@
     if (future_unroll_ct > LoopMaxUnroll) return false;
   } else {
     // obey user constraints on vector mapped loops with additional unrolling applied
-    if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
+    int unroll_constraint = (cl->slp_max_unroll()) ? cl->slp_max_unroll() : 1;
+    if ((future_unroll_ct / unroll_constraint) > LoopMaxUnroll) return false;
   }
 
   // Check for initial stride being a small enough constant
@@ -689,7 +690,7 @@
   // Progress defined as current size less than 20% larger than previous size.
   if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
-      (future_unroll_ct - 1) * 10.0 > cl->profile_trip_cnt() &&
+      (future_unroll_ct - 1) * (100 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
       1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
     return false;
   }
 
@@ -1260,6 +1261,146 @@
   loop->record_for_igvn();
 }
 
+//------------------------------insert_vector_post_loop------------------------
+// Insert a copy of the atomic unrolled vectorized main loop as a post loop;
+// unroll_policy has already informed us that more unrolling is about to happen to
+// the main loop. The resultant post loop will serve as a vectorized drain loop.
+void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process vectorized main loops
+  if (!cl->is_vectorized_loop() || !cl->is_main_loop()) return;
+
+  int slp_max_unroll_factor = cl->slp_max_unroll();
+  int cur_unroll = cl->unrolled_count();
+
+  if (slp_max_unroll_factor == 0) return;
+
+  // only process atomic unroll vector loops (not super unrolled after vectorization)
+  if (cur_unroll != slp_max_unroll_factor) return;
+
+  // we only ever process this one time
+  if (cl->has_atomic_post_loop()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostVector ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+  uint dd_main_head = dom_depth(main_head);
+  uint max = main_head->outcnt();
+
+  // mark this loop as processed
+  main_head->mark_has_atomic_post_loop();
+
+  Node *pre_header = main_head->in(LoopNode::EntryControl);
+  Node *init = main_head->init_trip();
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+  Node *stride = main_end->stride();
+  Node *cmp = main_end->cmp_node();
+  BoolTest::mask b_test = main_end->test_trip();
+
+  //------------------------------
+  // Step A: Create a new post-Loop.
+  Node* main_exit = main_end->proj_out(false);
+  assert(main_exit->Opcode() == Op_IfFalse, "");
+  int dd_main_exit = dom_depth(main_exit);
+
+  // Step A1: Clone the loop body of main. The clone becomes the vector post-loop.
+  // The main loop pre-header illegally has 2 control users (old & new loops).
+  clone_loop(loop, old_new, dd_main_exit);
+  assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
+  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head->set_normal_loop();
+  post_head->set_post_loop(main_head);
+
+  // Reduce the post-loop trip count.
+  CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
+  // Build the main-loop normal exit.
+  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
+  _igvn.register_new_node_with_optimizer(new_main_exit);
+  set_idom(new_main_exit, main_end, dd_main_exit);
+  set_loop(new_main_exit, loop->_parent);
+
+  // Step A2: Build a zero-trip guard for the vector post-loop. After leaving the
+  // main-loop, the vector post-loop may not execute at all. We 'opaque' the incr
+  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // the exit value (via additional unrolling) so we cannot constant-fold away the zero
+  // trip guard until all unrolling is done.
+  Node *zer_opaq = new Opaque1Node(C, incr);
+  Node *zer_cmp = new CmpINode(zer_opaq, limit);
+  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  register_new_node(zer_opaq, new_main_exit);
+  register_new_node(zer_cmp, new_main_exit);
+  register_new_node(zer_bol, new_main_exit);
+
+  // Build the IfNode
+  IfNode *zer_iff = new IfNode(new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN);
+  _igvn.register_new_node_with_optimizer(zer_iff);
+  set_idom(zer_iff, new_main_exit, dd_main_exit);
+  set_loop(zer_iff, loop->_parent);
+
+  // Plug in the false-path, taken if we need to skip vector post-loop
+  _igvn.replace_input_of(main_exit, 0, zer_iff);
+  set_idom(main_exit, zer_iff, dd_main_exit);
+  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
+  // Make the true-path, must enter the vector post loop
+  Node *zer_taken = new IfTrueNode(zer_iff);
+  _igvn.register_new_node_with_optimizer(zer_taken);
+  set_idom(zer_taken, zer_iff, dd_main_exit);
+  set_loop(zer_taken, loop->_parent);
+  // Plug in the true path
+  _igvn.hash_delete(post_head);
+  post_head->set_req(LoopNode::EntryControl, zer_taken);
+  set_idom(post_head, zer_taken, dd_main_exit);
+
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
+  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // fall-out values of the main-loop.
+  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
+    Node* main_phi = main_head->fast_out(i);
+    if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0) {
+      Node *cur_phi = old_new[main_phi->_idx];
+      Node *fallnew = clone_up_backedge_goo(main_head->back_control(),
+                                            post_head->init_control(),
+                                            main_phi->in(LoopNode::LoopBackControl),
+                                            visited, clones);
+      _igvn.hash_delete(cur_phi);
+      cur_phi->set_req(LoopNode::EntryControl, fallnew);
+    }
+  }
+
+  // CastII for the new post loop:
+  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
+  assert(inserted, "no castII inserted");
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops. They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
+
+  // Now force out all loop-invariant dominating tests. The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
 //------------------------------is_invariant-----------------------------
 // Return true if n is invariant
 bool IdealLoopTree::is_invariant(Node* n) const {
@@ -2598,6 +2739,9 @@
   // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
   // peeling.
   if (should_unroll && !should_peel) {
+    if (SuperWordLoopUnrollAnalysis) {
+      phase->insert_vector_post_loop(this, old_new);
+    }
     phase->do_unroll(this, old_new, true);
   }
 
--- old/src/share/vm/opto/loopnode.hpp	2016-02-11 11:52:27.375687300 -0800
+++ new/src/share/vm/opto/loopnode.hpp	2016-02-11 11:52:27.132663000 -0800
@@ -67,7 +67,9 @@
          HasReductions=128,
          WasSlpAnalyzed=256,
          PassedSlpAnalysis=512,
-         DoUnrollOnly=1024 };
+         DoUnrollOnly=1024,
+         VectorizedLoop=2048,
+         HasAtomicPostLoop=4096 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
 
@@ -86,6 +88,8 @@
   void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
   void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
   void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
+  void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
+  void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -221,6 +225,8 @@
   int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
+  int is_vectorized_loop () const { return (_loop_flags & VectorizedLoop) == VectorizedLoop; }
+  int has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
   int main_idx() const { return _main_idx; }
@@ -893,6 +899,8 @@
   // Add pre and post loops around the given loop.  These loops are used
   // during RCE, unrolling and aligning loops.
   void insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only );
+  // Add a vector post loop between a vector main loop and the current post loop
+  void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
   // If Node n lives in the back_ctrl block, we clone a private version of n
   // in preheader_ctrl block and return that, otherwise return n.
   Node *clone_up_backedge_goo( Node *back_ctrl, Node *preheader_ctrl, Node *n, VectorSet &visited, Node_Stack &clones );
--- old/src/share/vm/opto/superword.cpp	2016-02-11 11:52:28.787828500 -0800
+++ new/src/share/vm/opto/superword.cpp	2016-02-11 11:52:28.555805300 -0800
@@ -2253,6 +2253,9 @@
       C->set_major_progress();
     }
     cl->mark_do_unroll_only();
+    if (do_reserve_copy()) {
+      cl->mark_loop_vectorized();
+    }
   }
 }
 
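Note on the unroll-policy change in loopTransform.cpp: the hard-coded profile
factor of 10.0 is replaced by one derived from the new LoopPercentProfileLimit
flag (10 on aarch64/ppc/sparc, 30 on x86), so x86 permits further unrolling at
proportionally smaller profiled trip counts. Below is a minimal standalone C++
sketch of the patched test, not HotSpot code: the helper name
profile_blocks_unroll is illustrative, the CountedLoopNode plumbing is elided,
and the flag is modeled as a constant set to the x86 default.

    #include <cstdio>

    // Assumed x86 default from c2_globals_x86.hpp; the other ports use 10,
    // which reproduces the old fixed factor of 10.
    static const long LoopPercentProfileLimit = 30;

    // Mirrors the patched condition in IdealLoopTree::policy_unroll():
    // returns true when the profiled trip count is too small to justify
    // unrolling the body future_unroll_ct times.
    static bool profile_blocks_unroll(int future_unroll_ct, float profile_trip_cnt) {
      // As in the patch, 100 / LoopPercentProfileLimit is integer division:
      // a limit of 30 yields a factor of 3 (the old code always used 10.0).
      return (future_unroll_ct - 1) * (100 / LoopPercentProfileLimit) > profile_trip_cnt;
    }

    int main() {
      printf("%d\n", profile_blocks_unroll(8, 16.0f)); // 1: (8-1)*3 = 21 > 16, block further unrolling
      printf("%d\n", profile_blocks_unroll(8, 64.0f)); // 0: 21 <= 64, profile permits more unrolling
      return 0;
    }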