--- old/src/cpu/aarch64/vm/aarch64.ad 2016-02-29 14:28:32.554729826 +0300 +++ new/src/cpu/aarch64/vm/aarch64.ad 2016-02-29 14:28:32.510730035 +0300 @@ -3425,9 +3425,6 @@ // false => size gets scaled to BytesPerLong, ok. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 18 * BytesPerLong; - // Use conditional move (CMOVL) const int Matcher::long_cmove_cost() { // long cmoves are no more expensive than int cmoves --- old/src/cpu/aarch64/vm/globals_aarch64.hpp 2016-02-29 14:28:32.798728668 +0300 +++ new/src/cpu/aarch64/vm/globals_aarch64.hpp 2016-02-29 14:28:32.754728877 +0300 @@ -76,6 +76,8 @@ // avoid biased locking while we are bootstrapping the aarch64 build define_pd_global(bool, UseBiasedLocking, false); +define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong); + #if defined(COMPILER1) || defined(COMPILER2) define_pd_global(intx, InlineSmallCode, 1000); #endif --- old/src/cpu/ppc/vm/globals_ppc.hpp 2016-02-29 14:28:32.958727910 +0300 +++ new/src/cpu/ppc/vm/globals_ppc.hpp 2016-02-29 14:28:32.918728100 +0300 @@ -76,6 +76,8 @@ define_pd_global(bool, CompactStrings, true); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + // Platform dependent flag handling: flags only defined on this platform. #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ --- old/src/cpu/ppc/vm/ppc.ad 2016-02-29 14:28:33.126727113 +0300 +++ new/src/cpu/ppc/vm/ppc.ad 2016-02-29 14:28:33.082727322 +0300 @@ -2137,8 +2137,6 @@ return decode; } */ -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; // false => size gets scaled to BytesPerLong, ok. const bool Matcher::init_array_count_is_in_bytes = false; --- old/src/cpu/sparc/vm/globals_sparc.hpp 2016-02-29 14:28:33.350726051 +0300 +++ new/src/cpu/sparc/vm/globals_sparc.hpp 2016-02-29 14:28:33.306726260 +0300 @@ -90,6 +90,8 @@ define_pd_global(bool, CompactStrings, true); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ product(intx, UseVIS, 99, \ --- old/src/cpu/sparc/vm/sparc.ad 2016-02-29 14:28:33.514725273 +0300 +++ new/src/cpu/sparc/vm/sparc.ad 2016-02-29 14:28:33.470725482 +0300 @@ -1980,9 +1980,6 @@ // No scaling for the parameter the ClearArray node. const bool Matcher::init_array_count_is_in_bytes = true; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // No additional cost for CMOVL. const int Matcher::long_cmove_cost() { return 0; } --- old/src/cpu/x86/vm/globals_x86.hpp 2016-02-29 14:28:33.738724210 +0300 +++ new/src/cpu/x86/vm/globals_x86.hpp 2016-02-29 14:28:33.694724420 +0300 @@ -97,6 +97,8 @@ define_pd_global(bool, PreserveFramePointer, false); +define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); + #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \ \ develop(bool, IEEEPrecision, true, \ --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-02-29 14:28:33.914723376 +0300 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2016-02-29 14:28:33.862723623 +0300 @@ -7198,21 +7198,45 @@ } -void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) { +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) { // cnt - number of qwords (8-byte words). // base - start address, qword aligned. + // is_large - if optimizers know cnt is larger than InitArrayShortSize assert(base==rdi, "base register must be edi for rep stos"); assert(tmp==rax, "tmp register must be eax for rep stos"); assert(cnt==rcx, "cnt register must be ecx for rep stos"); + assert(InitArrayShortSize % BytesPerLong == 0, + "InitArrayShortSize should be the multiple of BytesPerLong"); + Label SHORT, LONG, DONE; + + if (!is_large) { + cmpptr(cnt, InitArrayShortSize/BytesPerLong); + jcc(Assembler::greater, LONG); + + NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM + + // Use individual pointer-sized stores for small counts: + bind(SHORT); + testptr(cnt, cnt); + jcc(Assembler::equal, DONE); + decrement(cnt); + movptr(Address(base, cnt, Address::times_ptr), 0); + jmp(SHORT); + } + + // Use longer rep-prefixed ops for non-small counts: + bind(LONG); xorptr(tmp, tmp); if (UseFastStosb) { - shlptr(cnt,3); // convert to number of bytes + shlptr(cnt, 3); // convert to number of bytes rep_stosb(); } else { - NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM + NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM rep_stos(); } + + bind(DONE); } #ifdef COMPILER2 --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-02-29 14:28:34.238721840 +0300 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2016-02-29 14:28:34.170722161 +0300 @@ -1284,8 +1284,9 @@ // C2 compiled method's prolog code. void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b); - // clear memory of size 'cnt' qwords, starting at 'base'. - void clear_mem(Register base, Register cnt, Register rtmp); + // clear memory of size 'cnt' qwords, starting at 'base'; + // if 'is_large' is set, do not try to produce short loop + void clear_mem(Register base, Register cnt, Register rtmp, bool is_large); #ifdef COMPILER2 void string_indexof_char(Register str1, Register cnt1, Register ch, Register result, --- old/src/cpu/x86/vm/x86_32.ad 2016-02-29 14:28:34.462720778 +0300 +++ new/src/cpu/x86/vm/x86_32.ad 2016-02-29 14:28:34.414721006 +0300 @@ -1420,9 +1420,6 @@ // The ecx parameter to rep stos for the ClearArray node is in dwords. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // Needs 2 CMOV's for longs. const int Matcher::long_cmove_cost() { return 1; } --- old/src/cpu/x86/vm/x86_64.ad 2016-02-29 14:28:34.766719337 +0300 +++ new/src/cpu/x86/vm/x86_64.ad 2016-02-29 14:28:34.694719677 +0300 @@ -1637,9 +1637,6 @@ // The ecx parameter to rep stosq for the ClearArray node is in words. const bool Matcher::init_array_count_is_in_bytes = false; -// Threshold size for cleararray. -const int Matcher::init_array_short_size = 8 * BytesPerLong; - // No additional cost for CMOVL. const int Matcher::long_cmove_cost() { return 0; } @@ -10460,14 +10457,29 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(!UseFastStosb); + predicate(!UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "rep stosq\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe(pipe_slow); +%} + +instruct rep_stos_known_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, + rFlagsReg cr) +%{ + predicate(!UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "xorq rax, rax\t# ClearArray:\n\t" "rep stosq\t# Store rax to *rdi++ while rcx--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe(pipe_slow); %} @@ -10475,14 +10487,29 @@ instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(UseFastStosb); + predicate(UseFastStosb && !((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + format %{ "xorq rax, rax\t# ClearArray:\n\t" + "shlq rcx,3\t# Convert doublewords to bytes\n\t" + "rep stosb\t# Store rax to *rdi++ while rcx--" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_fast_stosb_known_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, + rFlagsReg cr) +%{ + predicate(UseFastStosb && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); format %{ "xorq rax, rax\t# ClearArray:\n\t" "shlq rcx,3\t# Convert doublewords to bytes\n\t" "rep stosb\t# Store rax to *rdi++ while rcx--" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); %} ins_pipe( pipe_slow ); %} --- old/src/share/vm/opto/matcher.hpp 2016-02-29 14:28:35.146717535 +0300 +++ new/src/share/vm/opto/matcher.hpp 2016-02-29 14:28:35.074717875 +0300 @@ -399,10 +399,6 @@ // Optional scaling for the parameter to the ClearArray/CopyArray node. static const bool init_array_count_is_in_bytes; - // Threshold small size (in bytes) for a ClearArray/CopyArray node. - // Anything this size or smaller may get converted to discrete scalar stores. - static const int init_array_short_size; - // Some hardware needs 2 CMOV's for longs. static const int long_cmove_cost(); --- old/src/share/vm/opto/memnode.cpp 2016-02-29 14:28:35.398716339 +0300 +++ new/src/share/vm/opto/memnode.cpp 2016-02-29 14:28:35.330716662 +0300 @@ -2732,6 +2732,9 @@ //------------------------------Idealize--------------------------------------- // Clearing a short array is faster with stores Node *ClearArrayNode::Ideal(PhaseGVN *phase, bool can_reshape){ + // Already know this is a large node, do not try to ideal it + if (_is_large) return NULL; + const int unit = BytesPerLong; const TypeX* t = phase->type(in(2))->isa_intptr_t(); if (!t) return NULL; @@ -2744,8 +2747,11 @@ // (see jck test stmt114.stmt11402.val). if (size <= 0 || size % unit != 0) return NULL; intptr_t count = size / unit; - // Length too long; use fast hardware clear - if (size > Matcher::init_array_short_size) return NULL; + // Length too long; communicate this to matchers and assemblers. + // Assemblers are responsible to produce fast hardware clears for it. + if (size > InitArrayShortSize) { + return new ClearArrayNode(in(0), in(1), in(2), in(3), true); + } Node *mem = in(1); if( phase->type(mem)==Type::TOP ) return NULL; Node *adr = in(3); @@ -3892,7 +3898,7 @@ zeroes_done, zeroes_needed, phase); zeroes_done = zeroes_needed; - if (zsize > Matcher::init_array_short_size && ++big_init_gaps > 2) + if (zsize > InitArrayShortSize && ++big_init_gaps > 2) do_zeroing = false; // leave the hole, next time } } --- old/src/share/vm/opto/memnode.hpp 2016-02-29 14:28:35.750714670 +0300 +++ new/src/share/vm/opto/memnode.hpp 2016-02-29 14:28:35.674715029 +0300 @@ -1013,9 +1013,11 @@ //------------------------------ClearArray------------------------------------- class ClearArrayNode: public Node { +private: + bool _is_large; public: - ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base ) - : Node(ctrl,arymem,word_cnt,base) { + ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base, bool is_large = false) + : Node(ctrl,arymem,word_cnt,base), _is_large(is_large) { init_class_id(Class_ClearArray); } virtual int Opcode() const; @@ -1026,6 +1028,7 @@ virtual Node* Identity(PhaseGVN* phase); virtual Node *Ideal(PhaseGVN *phase, bool can_reshape); virtual uint match_edge(uint idx) const; + bool is_large() const { return _is_large; } // Clear the given area of an object or array. // The start offset must always be aligned mod BytesPerInt. --- old/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp 2016-02-29 14:28:36.030713341 +0300 +++ new/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp 2016-02-29 14:28:35.954713703 +0300 @@ -354,6 +354,14 @@ return Flag::SUCCESS; } +Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose) { + if (value % BytesPerLong != 0) { + return Flag::VIOLATES_CONSTRAINT; + } else { + return Flag::SUCCESS; + } +} + #ifdef COMPILER2 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose) { if (InteriorEntryAlignment > CodeEntryAlignment) { --- old/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp 2016-02-29 14:28:36.278712167 +0300 +++ new/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp 2016-02-29 14:28:36.206712507 +0300 @@ -62,6 +62,8 @@ Flag::Error TypeProfileLevelConstraintFunc(uintx value, bool verbose); +Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose); + #ifdef COMPILER2 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose); --- old/src/share/vm/runtime/globals.hpp 2016-02-29 14:28:36.514711047 +0300 +++ new/src/share/vm/runtime/globals.hpp 2016-02-29 14:28:36.458711313 +0300 @@ -4162,6 +4162,13 @@ "in the loaded class C. " \ "Check (3) is available only in debug builds.") \ \ + product_pd(intx, InitArrayShortSize, \ + "Threshold small size (in bytes) for clearing arrays. " \ + "Anything this size or smaller may get converted to discrete " \ + "scalar stores. ") \ + range(0, max_intx) \ + constraint(InitArrayShortSizeConstraintFunc, AfterErgo) \ + \ diagnostic(bool, CompilerDirectivesIgnoreCompileCommands, false, \ "Disable backwards compatibility for compile commands.") \ \