--- old/src/cpu/aarch64/vm/aarch64.ad 2016-03-01 02:14:09.680434152 +0300 +++ new/src/cpu/aarch64/vm/aarch64.ad 2016-03-01 02:14:09.632434377 +0300 @@ -3425,9 +3425,6 @@ // false => size gets scaled to BytesPerLong, ok. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 18 * BytesPerLong; - // Use conditional move (CMOVL) const int Matcher::long_cmove_cost() { // long cmoves are no more expensive than int cmoves --- old/src/cpu/aarch64/vm/globals_aarch64.hpp 2016-03-01 02:14:09.924433010 +0300 +++ new/src/cpu/aarch64/vm/globals_aarch64.hpp 2016-03-01 02:14:09.880433215 +0300 @@ -76,6 +76,8 @@ // avoid biased locking while we are bootstrapping the aarch64 build define_pd_global(bool, UseBiasedLocking, false); +define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong); + #if defined(COMPILER1) || defined(COMPILER2) define_pd_global(intx, InlineSmallCode, 1000); #endif --- old/src/cpu/ppc/vm/globals_ppc.hpp 2016-03-01 02:14:10.092432223 +0300 +++ new/src/cpu/ppc/vm/globals_ppc.hpp 2016-03-01 02:14:10.048432428 +0300 @@ -76,6 +76,8 @@ define_pd_global(bool, CompactStrings, true); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + // Platform dependent flag handling: flags only defined on this platform. #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ --- old/src/cpu/ppc/vm/ppc.ad 2016-03-01 02:14:10.264431417 +0300 +++ new/src/cpu/ppc/vm/ppc.ad 2016-03-01 02:14:10.216431641 +0300 @@ -2137,8 +2137,6 @@ return decode; } */ -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; // false => size gets scaled to BytesPerLong, ok. const bool Matcher::init_array_count_is_in_bytes = false; --- old/src/cpu/sparc/vm/globals_sparc.hpp 2016-03-01 02:14:10.496430332 +0300 +++ new/src/cpu/sparc/vm/globals_sparc.hpp 2016-03-01 02:14:10.452430537 +0300 @@ -90,6 +90,8 @@ define_pd_global(bool, CompactStrings, true); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ product(intx, UseVIS, 99, \ --- old/src/cpu/sparc/vm/sparc.ad 2016-03-01 02:14:10.668429526 +0300 +++ new/src/cpu/sparc/vm/sparc.ad 2016-03-01 02:14:10.620429750 +0300 @@ -1980,9 +1980,6 @@ // No scaling for the parameter the ClearArray node. const bool Matcher::init_array_count_is_in_bytes = true; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // No additional cost for CMOVL. const int Matcher::long_cmove_cost() { return 0; } --- old/src/cpu/x86/vm/globals_x86.hpp 2016-03-01 02:14:10.888428495 +0300 +++ new/src/cpu/x86/vm/globals_x86.hpp 2016-03-01 02:14:10.844428703 +0300 @@ -97,6 +97,8 @@ define_pd_global(bool, PreserveFramePointer, false); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ develop(bool, IEEEPrecision, true, \ --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-03-01 02:14:11.092427541 +0300 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-03-01 02:14:11.036427804 +0300 @@ -7198,21 +7198,45 @@ } -void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) { +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) { // cnt - number of qwords (8-byte words). // base - start address, qword aligned. + // is_large - if optimizers know cnt is larger than InitArrayShortSize assert(base==rdi, "base register must be edi for rep stos"); assert(tmp==rax, "tmp register must be eax for rep stos"); assert(cnt==rcx, "cnt register must be ecx for rep stos"); + assert(InitArrayShortSize % BytesPerLong == 0, + "InitArrayShortSize should be the multiple of BytesPerLong"); + Label SHORT, LONG, DONE; + + if (!is_large) { + cmpptr(cnt, InitArrayShortSize/BytesPerLong); + jcc(Assembler::greater, LONG); + + NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM + + // Use individual pointer-sized stores for small counts: + bind(SHORT); + testptr(cnt, cnt); + jcc(Assembler::equal, DONE); + decrement(cnt); + movptr(Address(base, cnt, Address::times_ptr), 0); + jmp(SHORT); + } + + // Use longer rep-prefixed ops for non-small counts: + bind(LONG); xorptr(tmp, tmp); if (UseFastStosb) { - shlptr(cnt,3); // convert to number of bytes + shlptr(cnt, 3); // convert to number of bytes rep_stosb(); } else { - NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM + NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM rep_stos(); } + + bind(DONE); } #ifdef COMPILER2 --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-03-01 02:14:11.428425968 +0300 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-03-01 02:14:11.352426325 +0300 @@ -1284,8 +1284,9 @@ // C2 compiled method's prolog code. void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b); - // clear memory of size 'cnt' qwords, starting at 'base'. - void clear_mem(Register base, Register cnt, Register rtmp); + // clear memory of size 'cnt' qwords, starting at 'base'; + // if 'is_large' is set, do not try to produce short loop + void clear_mem(Register base, Register cnt, Register rtmp, bool is_large); #ifdef COMPILER2 void string_indexof_char(Register str1, Register cnt1, Register ch, Register result, --- old/src/cpu/x86/vm/x86_32.ad 2016-03-01 02:14:11.816424152 +0300 +++ new/src/cpu/x86/vm/x86_32.ad 2016-03-01 02:14:11.764424396 +0300 @@ -1420,9 +1420,6 @@ // The ecx parameter to rep stos for the ClearArray node is in dwords. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // Needs 2 CMOV's for longs. const int Matcher::long_cmove_cost() { return 1; } @@ -11369,27 +11366,53 @@ // ======================================================================= // fast clearing of an array instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ - predicate(!UseFastStosb); + predicate(!UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "XOR EAX,EAX\t# ClearArray:\n\t" + "SHL ECX,1\t# Convert doublewords to words\n\t" + "REP STOS\t# store EAX into [EDI++] while ECX--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(!UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,1\t# Convert doublewords to words\n\t" "REP STOS\t# store EAX into [EDI++] while ECX--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe( pipe_slow ); %} instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ - predicate(UseFastStosb); + predicate(UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "XOR EAX,EAX\t# ClearArray:\n\t" + "SHL ECX,3\t# Convert doublewords to bytes\n\t" + "REP STOSB\t# store EAX into [EDI++] while ECX--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_fast_stosb_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,3\t# Convert doublewords to bytes\n\t" "REP STOSB\t# store EAX into [EDI++] while ECX--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe( pipe_slow ); %} --- old/src/cpu/x86/vm/x86_64.ad 2016-03-01 02:14:12.292421923 +0300 +++ new/src/cpu/x86/vm/x86_64.ad 2016-03-01 02:14:12.220422261 +0300 @@ -1637,9 +1637,6 @@ // The ecx parameter to rep stosq for the ClearArray node is in words. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // No additional cost for CMOVL. const int Matcher::long_cmove_cost() { return 0; } @@ -10460,14 +10457,29 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(!UseFastStosb); + predicate(!UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "rep stosq\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe(pipe_slow); +%} + +instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, + rFlagsReg cr) +%{ + predicate(!UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "xorq rax, rax\t# ClearArray:\n\t" "rep stosq\t# Store rax to *rdi++ while rcx--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe(pipe_slow); %} @@ -10475,14 +10487,29 @@ instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(UseFastStosb); + predicate(UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "shlq rcx,3\t# Convert doublewords to bytes\n\t" + "rep stosb\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_fast_stosb_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, + rFlagsReg cr) +%{ + predicate(UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "xorq rax, rax\t# ClearArray:\n\t" "shlq rcx,3\t# Convert doublewords to bytes\n\t" "rep stosb\t# Store rax to *rdi++ while rcx--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe( pipe_slow ); %} --- old/src/share/vm/opto/matcher.hpp 2016-03-01 02:14:12.728419881 +0300 +++ new/src/share/vm/opto/matcher.hpp 2016-03-01 02:14:12.684420089 +0300 @@ -399,10 +399,6 @@ // Optional scaling for the parameter to the ClearArray/CopyArray node. static const bool init_array_count_is_in_bytes; - // Threshold small size (in bytes) for a ClearArray/CopyArray node. - // Anything this size or smaller may get converted to discrete scalar stores. - static const int init_array_short_size; - // Some hardware needs 2 CMOV's for longs. static const int long_cmove_cost(); --- old/src/share/vm/opto/memnode.cpp 2016-03-01 02:14:12.936418908 +0300 +++ new/src/share/vm/opto/memnode.cpp 2016-03-01 02:14:12.860419264 +0300 @@ -2732,6 +2732,9 @@ //------------------------------Idealize--------------------------------------- // Clearing a short array is faster with stores Node *ClearArrayNode::Ideal(PhaseGVN *phase, bool can_reshape){ + // Already know this is a large node, do not try to ideal it + if (_is_large) return NULL; + const int unit = BytesPerLong; const TypeX* t = phase->type(in(2))->isa_intptr_t(); if (!t) return NULL; @@ -2744,8 +2747,11 @@ // (see jck test stmt114.stmt11402.val). if (size <= 0 || size % unit != 0) return NULL; intptr_t count = size / unit; - // Length too long; use fast hardware clear - if (size > Matcher::init_array_short_size) return NULL; + // Length too long; communicate this to matchers and assemblers. + // Assemblers are responsible to produce fast hardware clears for it. + if (size > InitArrayShortSize) { + return new ClearArrayNode(in(0), in(1), in(2), in(3), true); + } Node *mem = in(1); if( phase->type(mem)==Type::TOP ) return NULL; Node *adr = in(3); @@ -3892,7 +3898,7 @@ zeroes_done, zeroes_needed, phase); zeroes_done = zeroes_needed; - if (zsize > Matcher::init_array_short_size && ++big_init_gaps > 2) + if (zsize > InitArrayShortSize && ++big_init_gaps > 2) do_zeroing = false; // leave the hole, next time } } --- old/src/share/vm/opto/memnode.hpp 2016-03-01 02:14:13.272417336 +0300 +++ new/src/share/vm/opto/memnode.hpp 2016-03-01 02:14:13.196417692 +0300 @@ -1013,9 +1013,11 @@ //------------------------------ClearArray------------------------------------- class ClearArrayNode: public Node { +private: + bool _is_large; public: - ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base ) - : Node(ctrl,arymem,word_cnt,base) { + ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base, bool is_large = false) + : Node(ctrl,arymem,word_cnt,base), _is_large(is_large) { init_class_id(Class_ClearArray); } virtual int Opcode() const; @@ -1026,6 +1028,7 @@ virtual Node* Identity(PhaseGVN* phase); virtual Node *Ideal(PhaseGVN *phase, bool can_reshape); virtual uint match_edge(uint idx) const; + bool is_large() const { return _is_large; } // Clear the given area of an object or array. // The start offset must always be aligned mod BytesPerInt. --- old/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp 2016-03-01 02:14:13.572415932 +0300 +++ new/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp 2016-03-01 02:14:13.512416210 +0300 @@ -354,6 +354,14 @@ return Flag::SUCCESS; } +Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose) { + if (value % BytesPerLong != 0) { + return Flag::VIOLATES_CONSTRAINT; + } else { + return Flag::SUCCESS; + } +} + #ifdef COMPILER2 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose) { if (InteriorEntryAlignment > CodeEntryAlignment) { --- old/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp 2016-03-01 02:14:13.852414620 +0300 +++ new/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp 2016-03-01 02:14:13.780414956 +0300 @@ -62,6 +62,8 @@ Flag::Error TypeProfileLevelConstraintFunc(uintx value, bool verbose); +Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose); + #ifdef COMPILER2 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose); --- old/src/share/vm/runtime/globals.hpp 2016-03-01 02:14:14.204412972 +0300 +++ new/src/share/vm/runtime/globals.hpp 2016-03-01 02:14:14.128413327 +0300 @@ -4162,6 +4162,13 @@ "in the loaded class C. " \ "Check (3) is available only in debug builds.") \ \ + develop_pd(intx, InitArrayShortSize, \ + "Threshold small size (in bytes) for clearing arrays. " \ + "Anything this size or smaller may get converted to discrete " \ + "scalar stores. ") \ + range(0, max_intx) \ + constraint(InitArrayShortSizeConstraintFunc, AfterErgo) \ + \ diagnostic(bool, CompilerDirectivesIgnoreCompileCommands, false, \ "Disable backwards compatibility for compile commands.") \ \