--- old/src/cpu/sparc/vm/assembler_sparc.hpp Tue Aug 16 10:00:26 2011 +++ new/src/cpu/sparc/vm/assembler_sparc.hpp Tue Aug 16 10:00:26 2011 @@ -886,7 +886,11 @@ enum ASIs { // page 72, v9 ASI_PRIMARY = 0x80, - ASI_PRIMARY_LITTLE = 0x88 + ASI_PRIMARY_LITTLE = 0x88, + // Block initializing store + ASI_ST_BLKINIT_PRIMARY = 0xE2, + // Most-Recently-Used (MRU) BIS variant + ASI_ST_BLKINIT_MRU_PRIMARY = 0xF2 // add more from book as needed }; --- old/src/cpu/sparc/vm/sparc.ad Tue Aug 16 10:00:27 2011 +++ new/src/cpu/sparc/vm/sparc.ad Tue Aug 16 10:00:27 2011 @@ -471,9 +471,6 @@ source %{ #define __ _masm. -// Block initializing store -#define ASI_BLK_INIT_QUAD_LDD_P 0xE2 - // tertiary op of a LoadP or StoreP encoding #define REGP_OP true @@ -2819,10 +2816,10 @@ Register nof_bytes_arg = reg_to_register_object($cnt$$reg); Register nof_bytes_tmp = reg_to_register_object($temp$$reg); Register base_pointer_arg = reg_to_register_object($base$$reg); - + Label loop; __ mov(nof_bytes_arg, nof_bytes_tmp); - + // Loop and clear, walking backwards through the array. // nof_bytes_tmp (if >0) is always the number of bytes to zero __ bind(loop); @@ -6269,6 +6266,7 @@ instruct prefetchr( memory mem ) %{ match( PrefetchRead mem ); ins_cost(MEMORY_REF_COST); + size(4); format %{ "PREFETCH $mem,0\t! Prefetch read-many" %} opcode(Assembler::prefetch_op3); @@ -6277,9 +6275,9 @@ %} instruct prefetchw( memory mem ) %{ - predicate(AllocatePrefetchStyle != 3 ); match( PrefetchWrite mem ); ins_cost(MEMORY_REF_COST); + size(4); format %{ "PREFETCH $mem,2\t! Prefetch write-many (and read)" %} opcode(Assembler::prefetch_op3); @@ -6287,24 +6285,61 @@ ins_pipe(iload_mem); %} -// Use BIS instruction to prefetch. -instruct prefetchw_bis( memory mem ) %{ - predicate(AllocatePrefetchStyle == 3); - match( PrefetchWrite mem ); +// Prefetch instructions for allocation. + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr == 0); + match( PrefetchAllocation mem ); ins_cost(MEMORY_REF_COST); + size(4); - format %{ "STXA G0,$mem\t! // Block initializing store" %} + format %{ "PREFETCH $mem,2\t! Prefetch allocation" %} + opcode(Assembler::prefetch_op3); + ins_encode( form3_mem_prefetch_write( mem ) ); + ins_pipe(iload_mem); +%} + +// Use BIS instruction to prefetch for allocation. +// Could fault, need space at the end of TLAB. +instruct prefetchAlloc_bis( iRegP dst ) %{ + predicate(AllocatePrefetchInstr == 1); + match( PrefetchAllocation dst ); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "STXA [$dst]\t! // Prefetch allocation using BIS" %} ins_encode %{ - Register base = as_Register($mem$$base); - int disp = $mem$$disp; - if (disp != 0) { - __ add(base, AllocatePrefetchStepSize, base); - } - __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P); + __ stxa(G0, $dst$$Register, G0, Assembler::ASI_ST_BLKINIT_PRIMARY); %} ins_pipe(istore_mem_reg); %} +// Next code is used for finding next cache line address to prefetch. + +instruct cacheLineAdr32( iRegP dst, iRegP src, immI13 mask ) %{ + match(Set dst (CastX2P (AndI (CastP2X src) mask))); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "AND $src,$mask,$dst\t! next cache line address" %} + ins_encode %{ + __ and3($src$$Register, $mask$$constant, $dst$$Register); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct cacheLineAdr64( iRegP dst, iRegP src, immL13 mask ) %{ + match(Set dst (CastX2P (AndL (CastP2X src) mask))); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "AND $src,$mask,$dst\t! next cache line address" %} + ins_encode %{ + __ and3($src$$Register, $mask$$constant, $dst$$Register); + %} + ins_pipe(ialu_reg_imm); +%} + //----------Store Instructions------------------------------------------------- // Store Byte instruct storeB(memory mem, iRegI src) %{ --- old/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Aug 16 10:00:28 2011 +++ new/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Aug 16 10:00:28 2011 @@ -44,20 +44,31 @@ PrefetchScanIntervalInBytes = prefetch_scan_interval_in_bytes(); PrefetchFieldsAhead = prefetch_fields_ahead(); + assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 1, "invalid value"); + if( AllocatePrefetchInstr < 0 ) AllocatePrefetchInstr = 0; + if( AllocatePrefetchInstr > 1 ) AllocatePrefetchInstr = 0; + // Allocation prefetch settings - intx cache_line_size = L1_data_cache_line_size(); + intx cache_line_size = prefetch_data_size(); if( cache_line_size > AllocatePrefetchStepSize ) AllocatePrefetchStepSize = cache_line_size; - if( FLAG_IS_DEFAULT(AllocatePrefetchLines) ) - AllocatePrefetchLines = 3; // Optimistic value - assert( AllocatePrefetchLines > 0, "invalid value"); - if( AllocatePrefetchLines < 1 ) // set valid value in product VM - AllocatePrefetchLines = 1; // Conservative value + assert(AllocatePrefetchLines > 0, "invalid value"); + if( AllocatePrefetchLines < 1 ) // set valid value in product VM + AllocatePrefetchLines = 3; + assert(AllocateInstancePrefetchLines > 0, "invalid value"); + if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM + AllocateInstancePrefetchLines = 1; + AllocatePrefetchDistance = allocate_prefetch_distance(); AllocatePrefetchStyle = allocate_prefetch_style(); - assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value"); + assert((AllocatePrefetchDistance % AllocatePrefetchStepSize) == 0 && + (AllocatePrefetchDistance > 0), "invalid value"); + if ((AllocatePrefetchDistance % AllocatePrefetchStepSize) != 0 || + (AllocatePrefetchDistance <= 0)) { + AllocatePrefetchDistance = AllocatePrefetchStepSize; + } if (AllocatePrefetchStyle == 3 && !has_blk_init()) { warning("BIS instructions are not available on this CPU"); @@ -99,19 +110,42 @@ FLAG_SET_DEFAULT(InteriorEntryAlignment, 4); } if (is_niagara_plus()) { - if (has_blk_init() && AllocatePrefetchStyle > 0 && - FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { - // Use BIS instruction for allocation prefetch. - FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3); + if (has_blk_init() && UseTLAB && + FLAG_IS_DEFAULT(AllocatePrefetchInstr)) { + // Use BIS instruction for TLAB allocation prefetch. + FLAG_SET_ERGO(intx, AllocatePrefetchInstr, 1); + if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) { + FLAG_SET_ERGO(intx, AllocatePrefetchStyle, 3); + } if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { - // Use smaller prefetch distance on N2 with BIS - FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64); + // Use smaller prefetch distance with BIS + FLAG_SET_ERGO(intx, AllocatePrefetchDistance, 64); } } + if (is_T4()) { + // Double number of prefetched cache lines on T4 + // since L2 cache line size is smaller (32 bytes). + if (FLAG_IS_DEFAULT(AllocatePrefetchLines)) { + FLAG_SET_ERGO(intx, AllocatePrefetchLines, AllocatePrefetchLines*2); + } + if (FLAG_IS_DEFAULT(AllocateInstancePrefetchLines)) { + FLAG_SET_ERGO(intx, AllocateInstancePrefetchLines, AllocateInstancePrefetchLines*2); + } + } if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) { // Use different prefetch distance without BIS - FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); + FLAG_SET_ERGO(intx, AllocatePrefetchDistance, 256); } + if (AllocatePrefetchInstr == 1) { + // Need a space at the end of TLAB for BIS since it + // will fault when accessing memory outside of heap. + + // +1 for rounding up to next cache line, +1 to be safe + int lines = AllocatePrefetchLines + 2; + int step_size = AllocatePrefetchStepSize; + int distance = AllocatePrefetchDistance; + _reserve_for_allocation_prefetch = (distance + step_size*lines)/(int)HeapWordSize; + } } #endif } @@ -185,14 +219,20 @@ #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { - tty->print("Allocation: "); + tty->print("Allocation"); if (AllocatePrefetchStyle <= 0) { - tty->print_cr("no prefetching"); + tty->print_cr(": no prefetching"); } else { + tty->print(" prefetching: "); + if (AllocatePrefetchInstr == 0) { + tty->print("PREFETCH"); + } else if (AllocatePrefetchInstr == 1) { + tty->print("BIS"); + } if (AllocatePrefetchLines > 1) { - tty->print_cr("PREFETCH %d, %d lines of size %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); + tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); } else { - tty->print_cr("PREFETCH %d, one line", AllocatePrefetchDistance); + tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize); } } if (PrefetchCopyIntervalInBytes > 0) { --- old/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Aug 16 10:00:28 2011 +++ new/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Aug 16 10:00:28 2011 @@ -121,6 +121,7 @@ // Returns true if the platform is in the niagara line (T series) // and newer than the niagara1. static bool is_niagara_plus() { return is_T_family(_features) && !is_T1_model(_features); } + static bool is_T4() { return is_T_family(_features) && has_cbcond(); } // Fujitsu SPARC64 static bool is_sparc64() { return (_features & sparc64_family_m) != 0; } @@ -130,13 +131,17 @@ static bool has_fast_fxtof() { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); } static bool has_fast_idiv() { return is_niagara_plus() || is_sparc64(); } + // T4 and newer Sparc have fast RDPC instruction. - static bool has_fast_rdpc() { return is_niagara_plus() && has_cbcond(); } + static bool has_fast_rdpc() { return is_T4(); } + // T4 and newer Sparc have Most-Recently-Used (MRU) BIS. + static bool has_mru_blk_init() { return has_blk_init() && is_T4(); } + static const char* cpu_features() { return _features_str; } - static intx L1_data_cache_line_size() { - return 64; // default prefetch block size on sparc + static intx prefetch_data_size() { + return is_T4() ? 32 : 64; // default prefetch block size on sparc } // Prefetch --- old/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 16 10:00:29 2011 +++ new/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 16 10:00:29 2011 @@ -2315,7 +2315,7 @@ } void Assembler::prefetchr(Address src) { - NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support")); + assert(VM_Version::supports_3dnow_prefetch(), "must support"); InstructionMark im(this); prefetch_prefix(src); emit_byte(0x0D); @@ -2347,7 +2347,7 @@ } void Assembler::prefetchw(Address src) { - NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support")); + assert(VM_Version::supports_3dnow_prefetch(), "must support"); InstructionMark im(this); prefetch_prefix(src); emit_byte(0x0D); --- old/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 16 10:00:29 2011 +++ new/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 16 10:00:29 2011 @@ -557,14 +557,16 @@ if( !supports_sse() && supports_3dnow_prefetch() ) AllocatePrefetchInstr = 3; // Allocation prefetch settings - intx cache_line_size = L1_data_cache_line_size(); + intx cache_line_size = prefetch_data_size(); if( cache_line_size > AllocatePrefetchStepSize ) AllocatePrefetchStepSize = cache_line_size; - if( FLAG_IS_DEFAULT(AllocatePrefetchLines) ) - AllocatePrefetchLines = 3; // Optimistic value + assert(AllocatePrefetchLines > 0, "invalid value"); - if( AllocatePrefetchLines < 1 ) // set valid value in product VM - AllocatePrefetchLines = 1; // Conservative value + if( AllocatePrefetchLines < 1 ) // set valid value in product VM + AllocatePrefetchLines = 3; + assert(AllocateInstancePrefetchLines > 0, "invalid value"); + if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM + AllocateInstancePrefetchLines = 1; AllocatePrefetchDistance = allocate_prefetch_distance(); AllocatePrefetchStyle = allocate_prefetch_style(); @@ -601,10 +603,11 @@ tty->print_cr("Logical CPUs per core: %u", logical_processors_per_package()); tty->print_cr("UseSSE=%d",UseSSE); - tty->print("Allocation: "); + tty->print("Allocation"); if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) { - tty->print_cr("no prefetching"); + tty->print_cr(": no prefetching"); } else { + tty->print(" prefetching: "); if (UseSSE == 0 && supports_3dnow_prefetch()) { tty->print("PREFETCHW"); } else if (UseSSE >= 1) { @@ -619,9 +622,9 @@ } } if (AllocatePrefetchLines > 1) { - tty->print_cr(" %d, %d lines with step %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); + tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize); } else { - tty->print_cr(" %d, one line", AllocatePrefetchDistance); + tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize); } } --- old/src/cpu/x86/vm/vm_version_x86.hpp Tue Aug 16 10:00:30 2011 +++ new/src/cpu/x86/vm/vm_version_x86.hpp Tue Aug 16 10:00:30 2011 @@ -419,7 +419,7 @@ return result; } - static intx L1_data_cache_line_size() { + static intx prefetch_data_size() { intx result = 0; if (is_intel()) { result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1); --- old/src/cpu/x86/vm/x86_32.ad Tue Aug 16 10:00:30 2011 +++ new/src/cpu/x86/vm/x86_32.ad Tue Aug 16 10:00:30 2011 @@ -7325,8 +7325,9 @@ ins_cost(100); format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %} - opcode(0x0F, 0x0d); /* Opcode 0F 0d /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchr($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7336,8 +7337,9 @@ ins_cost(100); format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7347,8 +7349,9 @@ ins_cost(100); format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7358,8 +7361,9 @@ ins_cost(100); format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem)); + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -7374,46 +7378,86 @@ %} instruct prefetchw( memory mem ) %{ - predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || AllocatePrefetchInstr==3); + predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch()); match( PrefetchWrite mem ); ins_cost(100); format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + ins_encode %{ + __ prefetchw($mem$$Address); + %} ins_pipe(ialu_mem); %} instruct prefetchwNTA( memory mem ) %{ - predicate(UseSSE>=1 && AllocatePrefetchInstr==0); + predicate(UseSSE>=1); match(PrefetchWrite mem); ins_cost(100); format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT0( memory mem ) %{ +// Prefetch instructions for allocation. + +instruct prefetchAlloc0( memory mem ) %{ + predicate(UseSSE==0 && AllocatePrefetchInstr!=3); + match(PrefetchAllocation mem); + ins_cost(0); + size(0); + format %{ "Prefetch allocation (non-SSE is empty encoding)" %} + ins_encode(); + ins_pipe(empty); +%} + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr==3); + match( PrefetchAllocation mem ); + ins_cost(100); + + format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %} + ins_encode %{ + __ prefetchw($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +instruct prefetchAllocNTA( memory mem ) %{ + predicate(UseSSE>=1 && AllocatePrefetchInstr==0); + match(PrefetchAllocation mem); + ins_cost(100); + + format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +instruct prefetchAllocT0( memory mem ) %{ predicate(UseSSE>=1 && AllocatePrefetchInstr==1); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(100); - format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem)); + format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %} + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT2( memory mem ) %{ +instruct prefetchAllocT2( memory mem ) %{ predicate(UseSSE>=1 && AllocatePrefetchInstr==2); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(100); - format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem)); + format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %} + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} --- old/src/cpu/x86/vm/x86_64.ad Tue Aug 16 10:00:31 2011 +++ new/src/cpu/x86/vm/x86_64.ad Tue Aug 16 10:00:31 2011 @@ -6617,8 +6617,9 @@ ins_cost(125); format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + ins_encode %{ + __ prefetchr($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6628,8 +6629,9 @@ ins_cost(125); format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6639,8 +6641,9 @@ ins_cost(125); format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} @@ -6650,52 +6653,70 @@ ins_cost(125); format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem)); + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchw( memory mem ) %{ - predicate(AllocatePrefetchInstr==3); +instruct prefetchwNTA( memory mem ) %{ match(PrefetchWrite mem); ins_cost(125); - format %{ "PREFETCHW $mem\t# Prefetch into level 1 cache and mark modified" %} - opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwNTA( memory mem ) %{ +// Prefetch instructions for allocation. + +instruct prefetchAlloc( memory mem ) %{ + predicate(AllocatePrefetchInstr==3); + match(PrefetchAllocation mem); + ins_cost(125); + + format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %} + ins_encode %{ + __ prefetchw($mem$$Address); + %} + ins_pipe(ialu_mem); +%} + +instruct prefetchAllocNTA( memory mem ) %{ predicate(AllocatePrefetchInstr==0); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem)); + format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %} + ins_encode %{ + __ prefetchnta($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT0( memory mem ) %{ +instruct prefetchAllocT0( memory mem ) %{ predicate(AllocatePrefetchInstr==1); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHT0 $mem\t# Prefetch to level 1 and 2 caches for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem)); + format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %} + ins_encode %{ + __ prefetcht0($mem$$Address); + %} ins_pipe(ialu_mem); %} -instruct prefetchwT2( memory mem ) %{ +instruct prefetchAllocT2( memory mem ) %{ predicate(AllocatePrefetchInstr==2); - match(PrefetchWrite mem); + match(PrefetchAllocation mem); ins_cost(125); - format %{ "PREFETCHT2 $mem\t# Prefetch to level 2 cache for write" %} - opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */ - ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem)); + format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %} + ins_encode %{ + __ prefetcht2($mem$$Address); + %} ins_pipe(ialu_mem); %} --- old/src/share/vm/adlc/formssel.cpp Tue Aug 16 10:00:32 2011 +++ new/src/share/vm/adlc/formssel.cpp Tue Aug 16 10:00:32 2011 @@ -3391,7 +3391,9 @@ "ClearArray" }; int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*); - if( strcmp(_opType,"PrefetchRead")==0 || strcmp(_opType,"PrefetchWrite")==0 ) + if( strcmp(_opType,"PrefetchRead")==0 || + strcmp(_opType,"PrefetchWrite")==0 || + strcmp(_opType,"PrefetchAllocation")==0 ) return 1; if( _lChild ) { const char *opType = _lChild->_opType; --- old/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 16 10:00:32 2011 +++ new/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 16 10:00:32 2011 @@ -124,16 +124,7 @@ // Reserve space at the end of TLAB static size_t end_reserve() { int reserve_size = typeArrayOopDesc::header_size(T_INT); - if (AllocatePrefetchStyle == 3) { - // BIS is used to prefetch - we need a space for it. - // +1 for rounding up to next cache line +1 to be safe - int lines = AllocatePrefetchLines + 2; - int step_size = AllocatePrefetchStepSize; - int distance = AllocatePrefetchDistance; - int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize; - reserve_size = MAX2(reserve_size, prefetch_end); - } - return reserve_size; + return MAX2(reserve_size, VM_Version::reserve_for_allocation_prefetch()); } static size_t alignment_reserve() { return align_object_size(end_reserve()); } static size_t alignment_reserve_in_bytes() { return alignment_reserve() * HeapWordSize; } --- old/src/share/vm/opto/classes.hpp Tue Aug 16 10:00:33 2011 +++ new/src/share/vm/opto/classes.hpp Tue Aug 16 10:00:33 2011 @@ -196,6 +196,7 @@ macro(PopCountI) macro(PopCountL) macro(PowD) +macro(PrefetchAllocation) macro(PrefetchRead) macro(PrefetchWrite) macro(Proj) --- old/src/share/vm/opto/macro.cpp Tue Aug 16 10:00:34 2011 +++ new/src/share/vm/opto/macro.cpp Tue Aug 16 10:00:33 2011 @@ -1590,7 +1590,7 @@ prefetch_adr = new (C, 4) AddPNode( old_pf_wm, new_pf_wmt, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr ); transform_later(prefetch); distance += step_size; i_o = prefetch; @@ -1611,13 +1611,14 @@ contended_phi_rawmem = pf_phi_rawmem; i_o = pf_phi_abio; } else if( UseTLAB && AllocatePrefetchStyle == 3 ) { - // Insert a prefetch for each allocation only on the fast-path + // Insert a prefetch for each allocation. + // This code is used for Sparc with BIS. Node *pf_region = new (C, 3) RegionNode(3); Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY, TypeRawPtr::BOTTOM ); - // Generate several prefetch instructions only for arrays. - uint lines = (length != NULL) ? AllocatePrefetchLines : 1; + // Generate several prefetch instructions. + uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines; uint step_size = AllocatePrefetchStepSize; uint distance = AllocatePrefetchDistance; @@ -1634,7 +1635,7 @@ transform_later(cache_adr); // Prefetch - Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr ); + Node *prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, cache_adr ); prefetch->set_req(0, needgc_false); transform_later(prefetch); contended_phi_rawmem = prefetch; @@ -1644,7 +1645,7 @@ prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, prefetch_adr ); transform_later(prefetch); distance += step_size; contended_phi_rawmem = prefetch; @@ -1653,8 +1654,8 @@ // Insert a prefetch for each allocation only on the fast-path Node *prefetch_adr; Node *prefetch; - // Generate several prefetch instructions only for arrays. - uint lines = (length != NULL) ? AllocatePrefetchLines : 1; + // Generate several prefetch instructions. + uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines; uint step_size = AllocatePrefetchStepSize; uint distance = AllocatePrefetchDistance; for ( uint i = 0; i < lines; i++ ) { @@ -1661,7 +1662,7 @@ prefetch_adr = new (C, 4) AddPNode( old_eden_top, new_eden_top, _igvn.MakeConX(distance) ); transform_later(prefetch_adr); - prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr ); + prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr ); // Do not let it float too high, since if eden_top == eden_end, // both might be null. if( i == 0 ) { // Set control for first prefetch, next follows it --- old/src/share/vm/opto/matcher.cpp Tue Aug 16 10:00:34 2011 +++ new/src/share/vm/opto/matcher.cpp Tue Aug 16 10:00:34 2011 @@ -826,6 +826,7 @@ switch (n->Opcode()) { case Op_PrefetchRead: case Op_PrefetchWrite: + case Op_PrefetchAllocation: nidx = Compile::AliasIdxRaw; nat = TypeRawPtr::BOTTOM; break; --- old/src/share/vm/opto/memnode.hpp Tue Aug 16 10:00:35 2011 +++ new/src/share/vm/opto/memnode.hpp Tue Aug 16 10:00:35 2011 @@ -1278,6 +1278,16 @@ virtual int Opcode() const; virtual uint ideal_reg() const { return NotAMachineReg; } virtual uint match_edge(uint idx) const { return idx==2; } + virtual const Type *bottom_type() const { return Type::ABIO; } +}; + +// Allocation prefetch which may fault, TLAB size have to be adjusted. +class PrefetchAllocationNode : public Node { +public: + PrefetchAllocationNode(Node *mem, Node *adr) : Node(0,mem,adr) {} + virtual int Opcode() const; + virtual uint ideal_reg() const { return NotAMachineReg; } + virtual uint match_edge(uint idx) const { return idx==2; } virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; } }; --- old/src/share/vm/runtime/globals.hpp Tue Aug 16 10:00:35 2011 +++ new/src/share/vm/runtime/globals.hpp Tue Aug 16 10:00:35 2011 @@ -2897,9 +2897,12 @@ product(intx, AllocatePrefetchDistance, -1, \ "Distance to prefetch ahead of allocation pointer") \ \ - product(intx, AllocatePrefetchLines, 1, \ - "Number of lines to prefetch ahead of allocation pointer") \ + product(intx, AllocatePrefetchLines, 3, \ + "Number of lines to prefetch ahead of array allocation pointer") \ \ + product(intx, AllocateInstancePrefetchLines, 1, \ + "Number of lines to prefetch ahead of instance allocation pointer") \ + \ product(intx, AllocatePrefetchStepSize, 16, \ "Step size in bytes of sequential prefetch instructions") \ \ --- old/src/share/vm/runtime/vm_version.cpp Tue Aug 16 10:00:36 2011 +++ new/src/share/vm/runtime/vm_version.cpp Tue Aug 16 10:00:36 2011 @@ -46,6 +46,7 @@ const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string(); bool Abstract_VM_Version::_supports_cx8 = false; unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U; +int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0; #ifndef HOTSPOT_RELEASE_VERSION #error HOTSPOT_RELEASE_VERSION must be defined --- old/src/share/vm/runtime/vm_version.hpp Tue Aug 16 10:00:37 2011 +++ new/src/share/vm/runtime/vm_version.hpp Tue Aug 16 10:00:36 2011 @@ -44,6 +44,7 @@ static bool _initialized; static int _parallel_worker_threads; static bool _parallel_worker_threads_initialized; + static int _reserve_for_allocation_prefetch; static unsigned int nof_parallel_worker_threads(unsigned int num, unsigned int dem, @@ -77,6 +78,12 @@ return _logical_processors_per_package; } + // Need a space at the end of TLAB for prefetch instructions + // which may fault when accessing memory outside of heap. + static int reserve_for_allocation_prefetch() { + return _reserve_for_allocation_prefetch; + } + // ARCH specific policy for the BiasedLocking static bool use_biased_locking() { return true; }