--- old/./.hgtags Fri May 31 16:03:48 2013 +++ new/./.hgtags Fri May 31 16:03:47 2013 @@ -501,3 +501,5 @@ 69fecd3e06892e95a32ce4c27f85b1d61e946fc8 hs24-b45 43fd44b89792fcc931569218dce51df4c2856a17 jdk7u40-b26 e50c5a1869b1f629508780eda1592674177a9f91 hs24-b46 +f2614c006bb73eabf8fdf8027b042b98149d06a7 jdk7u40-b27 +64aaeeee0a10e568f2234d91dca91608412ae5d1 hs24-b47 --- old/make/hotspot_version Fri May 31 16:03:51 2013 +++ new/make/hotspot_version Fri May 31 16:03:51 2013 @@ -35,7 +35,7 @@ HS_MAJOR_VER=24 HS_MINOR_VER=0 -HS_BUILD_NUMBER=46 +HS_BUILD_NUMBER=47 JDK_MAJOR_VER=1 JDK_MINOR_VER=7 --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri May 31 16:03:55 2013 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri May 31 16:03:55 2013 @@ -1527,7 +1527,7 @@ __ movptr(elem_klass, elem_klass_addr); // query the object klass generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp, &L_store_element, NULL); - // (On fall-through, we have failed the element type check.) + // (On fall-through, we have failed the element type check.) // ======== end loop ======== // It was a real error; we must depend on the caller to finish the job. @@ -1534,20 +1534,22 @@ // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops. // Emit GC store barriers for the oops we have copied (length_arg + count), // and report their number to the caller. + assert_different_registers(to, count, rax); + Label L_post_barrier; __ addl(count, length_arg); // transfers = (length - remaining) __ movl2ptr(rax, count); // save the value - __ notptr(rax); // report (-1^K) to caller - __ movptr(to, to_arg); // reload - assert_different_registers(to, count, rax); - gen_write_ref_array_post_barrier(to, count); - __ jmpb(L_done); + __ notptr(rax); // report (-1^K) to caller (does not affect flags) + __ jccb(Assembler::notZero, L_post_barrier); + __ jmp(L_done); // K == 0, nothing was copied, skip post barrier // Come here on success only. __ BIND(L_do_card_marks); + __ xorptr(rax, rax); // return 0 on success __ movl2ptr(count, length_arg); - __ movptr(to, to_arg); // reload + + __ BIND(L_post_barrier); + __ movptr(to, to_arg); // reload gen_write_ref_array_post_barrier(to, count); - __ xorptr(rax, rax); // return 0 on success // Common exit point (success or failure). __ BIND(L_done); --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri May 31 16:03:59 2013 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri May 31 16:03:58 2013 @@ -1245,27 +1245,28 @@ // // Input: // start - register containing starting address of destination array - // end - register containing ending address of destination array + // count - elements count // scratch - scratch register // // The input registers are overwritten. - // The ending address is inclusive. - void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) { - assert_different_registers(start, end, scratch); + // + void gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) { + assert_different_registers(start, count, scratch); BarrierSet* bs = Universe::heap()->barrier_set(); switch (bs->kind()) { case BarrierSet::G1SATBCT: case BarrierSet::G1SATBCTLogging: - { - __ pusha(); // push registers (overkill) - // must compute element count unless barrier set interface is changed (other platforms supply count) - assert_different_registers(start, end, scratch); - __ lea(scratch, Address(end, BytesPerHeapOop)); - __ subptr(scratch, start); // subtract start to get #bytes - __ shrptr(scratch, LogBytesPerHeapOop); // convert to element count - __ mov(c_rarg0, start); - __ mov(c_rarg1, scratch); + __ pusha(); // push registers (overkill) + if (c_rarg0 == count) { // On win64 c_rarg0 == rcx + assert_different_registers(c_rarg1, start); + __ mov(c_rarg1, count); + __ mov(c_rarg0, start); + } else { + assert_different_registers(c_rarg0, count); + __ mov(c_rarg0, start); + __ mov(c_rarg1, count); + } __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2); __ popa(); } @@ -1277,22 +1278,16 @@ assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); Label L_loop; + const Register end = count; - __ shrptr(start, CardTableModRefBS::card_shift); - __ addptr(end, BytesPerHeapOop); - __ shrptr(end, CardTableModRefBS::card_shift); - __ subptr(end, start); // number of bytes to copy + __ leaq(end, Address(start, count, TIMES_OOP, 0)); // end == start+count*oop_size + __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive + __ shrptr(start, CardTableModRefBS::card_shift); + __ shrptr(end, CardTableModRefBS::card_shift); + __ subptr(end, start); // end --> cards count - intptr_t disp = (intptr_t) ct->byte_map_base; - if (Assembler::is_simm32(disp)) { - Address cardtable(noreg, noreg, Address::no_scale, disp); - __ lea(scratch, cardtable); - } else { - ExternalAddress cardtable((address)disp); - __ lea(scratch, cardtable); - } - - const Register count = end; // 'end' register contains bytes count now + int64_t disp = (int64_t) ct->byte_map_base; + __ mov64(scratch, disp); __ addptr(start, scratch); __ BIND(L_loop); __ movb(Address(start, count, Address::times_1), 0); @@ -1944,8 +1939,7 @@ __ BIND(L_exit); if (is_oop) { - __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4)); - gen_write_ref_array_post_barrier(saved_to, end_to, rax); + gen_write_ref_array_post_barrier(saved_to, dword_count, rax); } restore_arg_regs(); inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free @@ -2040,12 +2034,10 @@ // Copy in multi-bytes chunks copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); - __ bind(L_exit); - if (is_oop) { - Register end_to = rdx; - __ leaq(end_to, Address(to, dword_count, Address::times_4, -4)); - gen_write_ref_array_post_barrier(to, end_to, rax); - } + __ BIND(L_exit); + if (is_oop) { + gen_write_ref_array_post_barrier(to, dword_count, rax); + } restore_arg_regs(); inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 @@ -2083,6 +2075,7 @@ const Register end_from = from; // source array end address const Register end_to = rcx; // destination array end address const Register saved_to = to; + const Register saved_count = r11; // End pointers are inclusive, and if count is not zero they point // to the last unit copied: end_to[0] := end_from[0] @@ -2100,6 +2093,8 @@ // r9 and r10 may be used to save non-volatile registers // 'from', 'to' and 'qword_count' are now valid if (is_oop) { + // Save to and count for store barrier + __ movptr(saved_count, qword_count); // no registers are destroyed by this call gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized); } @@ -2132,7 +2127,7 @@ if (is_oop) { __ BIND(L_exit); - gen_write_ref_array_post_barrier(saved_to, end_to, rax); + gen_write_ref_array_post_barrier(saved_to, saved_count, rax); } restore_arg_regs(); if (is_oop) { @@ -2215,8 +2210,7 @@ if (is_oop) { __ BIND(L_exit); - __ lea(rcx, Address(to, saved_count, Address::times_8, -8)); - gen_write_ref_array_post_barrier(to, rcx, rax); + gen_write_ref_array_post_barrier(to, saved_count, rax); } restore_arg_regs(); if (is_oop) { @@ -2403,21 +2397,21 @@ // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. // Emit GC store barriers for the oops we have copied (r14 + rdx), // and report their number to the caller. - assert_different_registers(rax, r14_length, count, to, end_to, rcx); - __ lea(end_to, to_element_addr); - __ addptr(end_to, -heapOopSize); // make an inclusive end pointer - gen_write_ref_array_post_barrier(to, end_to, rscratch1); - __ movptr(rax, r14_length); // original oops - __ addptr(rax, count); // K = (original - remaining) oops - __ notptr(rax); // report (-1^K) to caller - __ jmp(L_done); + assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); + Label L_post_barrier; + __ addptr(r14_length, count); // K = (original - remaining) oops + __ movptr(rax, r14_length); // save the value + __ notptr(rax); // report (-1^K) to caller (does not affect flags) + __ jccb(Assembler::notZero, L_post_barrier); + __ jmp(L_done); // K == 0, nothing was copied, skip post barrier // Come here on success only. __ BIND(L_do_card_marks); - __ addptr(end_to, -heapOopSize); // make an inclusive end pointer - gen_write_ref_array_post_barrier(to, end_to, rscratch1); - __ xorptr(rax, rax); // return 0 on success + __ xorptr(rax, rax); // return 0 on success + __ BIND(L_post_barrier); + gen_write_ref_array_post_barrier(to, r14_length, rscratch1); + // Common exit point (success or failure). __ BIND(L_done); __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); --- old/src/os/linux/vm/os_linux.cpp Fri May 31 16:04:02 2013 +++ new/src/os/linux/vm/os_linux.cpp Fri May 31 16:04:01 2013 @@ -138,6 +138,7 @@ Mutex* os::Linux::_createThread_lock = NULL; pthread_t os::Linux::_main_thread; int os::Linux::_page_size = -1; +const int os::Linux::_vm_default_page_size = (8 * K); bool os::Linux::_is_floating_stack = false; bool os::Linux::_is_NPTL = false; bool os::Linux::_supports_fast_thread_cpu_time = false; @@ -4385,6 +4386,15 @@ Linux::clock_init(); initial_time_count = os::elapsed_counter(); pthread_mutex_init(&dl_mutex, NULL); + + // If the pagesize of the VM is greater than 8K determine the appropriate + // number of initial guard pages. The user can change this with the + // command line arguments, if needed. + if (vm_page_size() > (int)Linux::vm_default_page_size()) { + StackYellowPages = 1; + StackRedPages = 1; + StackShadowPages = round_to((StackShadowPages*Linux::vm_default_page_size()), vm_page_size()) / vm_page_size(); + } } // To install functions for atexit system call @@ -4438,8 +4448,8 @@ // Add in 2*BytesPerWord times page size to account for VM stack during // class initialization depending on 32 or 64 bit VM. os::Linux::min_stack_allowed = MAX2(os::Linux::min_stack_allowed, - (size_t)(StackYellowPages+StackRedPages+StackShadowPages+ - 2*BytesPerWord COMPILER2_PRESENT(+1)) * Linux::page_size()); + (size_t)(StackYellowPages+StackRedPages+StackShadowPages) * Linux::page_size() + + (2*BytesPerWord COMPILER2_PRESENT(+1)) * Linux::vm_default_page_size()); size_t threadStackSizeInBytes = ThreadStackSize * K; if (threadStackSizeInBytes != 0 && --- old/src/os/linux/vm/os_linux.hpp Fri May 31 16:04:05 2013 +++ new/src/os/linux/vm/os_linux.hpp Fri May 31 16:04:05 2013 @@ -70,6 +70,7 @@ static pthread_t _main_thread; static Mutex* _createThread_lock; static int _page_size; + static const int _vm_default_page_size; static julong available_memory(); static julong physical_memory() { return _physical_memory; } @@ -116,6 +117,8 @@ static int page_size(void) { return _page_size; } static void set_page_size(int val) { _page_size = val; } + static int vm_default_page_size(void) { return _vm_default_page_size; } + static address ucontext_get_pc(ucontext_t* uc); static intptr_t* ucontext_get_sp(ucontext_t* uc); static intptr_t* ucontext_get_fp(ucontext_t* uc); --- old/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri May 31 16:04:07 2013 +++ new/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri May 31 16:04:07 2013 @@ -129,7 +129,7 @@ // The alignment used for eden and survivors within the young gen // and for boundary between young gen and old gen. - size_t intra_heap_alignment() const { return 64 * K; } + size_t intra_heap_alignment() const { return 64 * K * HeapWordSize; } size_t capacity() const; size_t used() const; --- old/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp Fri May 31 16:04:09 2013 +++ new/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp Fri May 31 16:04:09 2013 @@ -64,14 +64,26 @@ #include // All sizes are in HeapWords. -const size_t ParallelCompactData::Log2RegionSize = 9; // 512 words +const size_t ParallelCompactData::Log2RegionSize = 16; // 64K words const size_t ParallelCompactData::RegionSize = (size_t)1 << Log2RegionSize; const size_t ParallelCompactData::RegionSizeBytes = RegionSize << LogHeapWordSize; const size_t ParallelCompactData::RegionSizeOffsetMask = RegionSize - 1; const size_t ParallelCompactData::RegionAddrOffsetMask = RegionSizeBytes - 1; -const size_t ParallelCompactData::RegionAddrMask = ~RegionAddrOffsetMask; +const size_t ParallelCompactData::RegionAddrMask = ~RegionAddrOffsetMask; +const size_t ParallelCompactData::Log2BlockSize = 7; // 128 words +const size_t ParallelCompactData::BlockSize = (size_t)1 << Log2BlockSize; +const size_t ParallelCompactData::BlockSizeBytes = + BlockSize << LogHeapWordSize; +const size_t ParallelCompactData::BlockSizeOffsetMask = BlockSize - 1; +const size_t ParallelCompactData::BlockAddrOffsetMask = BlockSizeBytes - 1; +const size_t ParallelCompactData::BlockAddrMask = ~BlockAddrOffsetMask; + +const size_t ParallelCompactData::BlocksPerRegion = RegionSize / BlockSize; +const size_t ParallelCompactData::Log2BlocksPerRegion = + Log2RegionSize - Log2BlockSize; + const ParallelCompactData::RegionData::region_sz_t ParallelCompactData::RegionData::dc_shift = 27; @@ -379,6 +391,10 @@ _region_vspace = 0; _region_data = 0; _region_count = 0; + + _block_vspace = 0; + _block_data = 0; + _block_count = 0; } bool ParallelCompactData::initialize(MemRegion covered_region) @@ -392,8 +408,7 @@ assert((region_size & RegionSizeOffsetMask) == 0, "region size not a multiple of RegionSize"); - bool result = initialize_region_data(region_size); - + bool result = initialize_region_data(region_size) && initialize_block_data(); return result; } @@ -438,17 +453,36 @@ return false; } +bool ParallelCompactData::initialize_block_data() +{ + assert(_region_count != 0, "region data must be initialized first"); + const size_t count = _region_count << Log2BlocksPerRegion; + _block_vspace = create_vspace(count, sizeof(BlockData)); + if (_block_vspace != 0) { + _block_data = (BlockData*)_block_vspace->reserved_low_addr(); + _block_count = count; + return true; + } + return false; +} + void ParallelCompactData::clear() { memset(_region_data, 0, _region_vspace->committed_size()); + memset(_block_data, 0, _block_vspace->committed_size()); } void ParallelCompactData::clear_range(size_t beg_region, size_t end_region) { assert(beg_region <= _region_count, "beg_region out of range"); assert(end_region <= _region_count, "end_region out of range"); + assert(RegionSize % BlockSize == 0, "RegionSize not a multiple of BlockSize"); const size_t region_cnt = end_region - beg_region; memset(_region_data + beg_region, 0, region_cnt * sizeof(RegionData)); + + const size_t beg_block = beg_region * BlocksPerRegion; + const size_t block_cnt = region_cnt * BlocksPerRegion; + memset(_block_data + beg_block, 0, block_cnt * sizeof(BlockData)); } HeapWord* ParallelCompactData::partial_obj_end(size_t region_idx) const @@ -727,45 +761,44 @@ HeapWord* ParallelCompactData::calc_new_pointer(HeapWord* addr) { assert(addr != NULL, "Should detect NULL oop earlier"); - assert(PSParallelCompact::gc_heap()->is_in(addr), "addr not in heap"); -#ifdef ASSERT - if (PSParallelCompact::mark_bitmap()->is_unmarked(addr)) { - gclog_or_tty->print_cr("calc_new_pointer:: addr " PTR_FORMAT, addr); - } -#endif - assert(PSParallelCompact::mark_bitmap()->is_marked(addr), "obj not marked"); + assert(PSParallelCompact::gc_heap()->is_in(addr), "not in heap"); + assert(PSParallelCompact::mark_bitmap()->is_marked(addr), "not marked"); // Region covering the object. - size_t region_index = addr_to_region_idx(addr); - const RegionData* const region_ptr = region(region_index); - HeapWord* const region_addr = region_align_down(addr); - - assert(addr < region_addr + RegionSize, "Region does not cover object"); - assert(addr_to_region_ptr(region_addr) == region_ptr, "sanity check"); - + RegionData* const region_ptr = addr_to_region_ptr(addr); HeapWord* result = region_ptr->destination(); - // If all the data in the region is live, then the new location of the object - // can be calculated from the destination of the region plus the offset of the - // object in the region. + // If the entire Region is live, the new location is region->destination + the + // offset of the object within in the Region. + + // Run some performance tests to determine if this special case pays off. It + // is worth it for pointers into the dense prefix. If the optimization to + // avoid pointer updates in regions that only point to the dense prefix is + // ever implemented, this should be revisited. if (region_ptr->data_size() == RegionSize) { - result += pointer_delta(addr, region_addr); - DEBUG_ONLY(PSParallelCompact::check_new_location(addr, result);) + result += region_offset(addr); return result; } - // The new location of the object is - // region destination + - // size of the partial object extending onto the region + - // sizes of the live objects in the Region that are to the left of addr - const size_t partial_obj_size = region_ptr->partial_obj_size(); - HeapWord* const search_start = region_addr + partial_obj_size; + // Otherwise, the new location is region->destination + block offset + the + // number of live words in the Block that are (a) to the left of addr and (b) + // due to objects that start in the Block. - const ParMarkBitMap* bitmap = PSParallelCompact::mark_bitmap(); - size_t live_to_left = bitmap->live_words_in_range(search_start, oop(addr)); + // Fill in the block table if necessary. This is unsynchronized, so multiple + // threads may fill the block table for a region (harmless, since it is + // idempotent). + if (!region_ptr->blocks_filled()) { + PSParallelCompact::fill_blocks(addr_to_region_idx(addr)); + region_ptr->set_blocks_filled(); + } - result += partial_obj_size + live_to_left; - DEBUG_ONLY(PSParallelCompact::check_new_location(addr, result);) + HeapWord* const search_start = block_align_down(addr); + const size_t block_offset = addr_to_block_ptr(addr)->offset(); + + const ParMarkBitMap* bitmap = PSParallelCompact::mark_bitmap(); + const size_t live = bitmap->live_words_in_range(search_start, oop(addr)); + result += block_offset + live; + DEBUG_ONLY(PSParallelCompact::check_new_location(addr, result)); return result; } @@ -780,7 +813,7 @@ return updated_klass; } -#ifdef ASSERT +#ifdef ASSERT void ParallelCompactData::verify_clear(const PSVirtualSpace* vspace) { const size_t* const beg = (const size_t*)vspace->committed_low_addr(); @@ -793,16 +826,10 @@ void ParallelCompactData::verify_clear() { verify_clear(_region_vspace); + verify_clear(_block_vspace); } #endif // #ifdef ASSERT -#ifdef NOT_PRODUCT -ParallelCompactData::RegionData* debug_region(size_t region_index) { - ParallelCompactData& sd = PSParallelCompact::summary_data(); - return sd.region(region_index); -} -#endif - STWGCTimer PSParallelCompact::_gc_timer; ParallelOldTracer PSParallelCompact::_gc_tracer; elapsedTimer PSParallelCompact::_accumulated_time; @@ -1997,11 +2024,6 @@ maximum_heap_compaction); } -bool ParallelCompactData::region_contains(size_t region_index, HeapWord* addr) { - size_t addr_region_index = addr_to_region_idx(addr); - return region_index == addr_region_index; -} - // This method contains no policy. You should probably // be calling invoke() instead. bool PSParallelCompact::invoke_no_policy(bool maximum_heap_compaction) { @@ -2661,6 +2683,41 @@ } } +#ifdef ASSERT +// Write a histogram of the number of times the block table was filled for a +// region. +void PSParallelCompact::write_block_fill_histogram(outputStream* const out) +{ + if (!TraceParallelOldGCCompactionPhase) return; + + typedef ParallelCompactData::RegionData rd_t; + ParallelCompactData& sd = summary_data(); + + for (unsigned int id = old_space_id; id < last_space_id; ++id) { + MutableSpace* const spc = _space_info[id].space(); + if (spc->bottom() != spc->top()) { + const rd_t* const beg = sd.addr_to_region_ptr(spc->bottom()); + HeapWord* const top_aligned_up = sd.region_align_up(spc->top()); + const rd_t* const end = sd.addr_to_region_ptr(top_aligned_up); + + size_t histo[5] = { 0, 0, 0, 0, 0 }; + const size_t histo_len = sizeof(histo) / sizeof(size_t); + const size_t region_cnt = pointer_delta(end, beg, sizeof(rd_t)); + + for (const rd_t* cur = beg; cur < end; ++cur) { + ++histo[MIN2(cur->blocks_filled_count(), histo_len - 1)]; + } + out->print("%u %-4s" SIZE_FORMAT_W(5), id, space_names[id], region_cnt); + for (size_t i = 0; i < histo_len; ++i) { + out->print(" " SIZE_FORMAT_W(5) " %5.1f%%", + histo[i], 100.0 * histo[i] / region_cnt); + } + out->cr(); + } + } +} +#endif // #ifdef ASSERT + void PSParallelCompact::compact() { // trace("5"); GCTraceTime tm("compaction phase", print_phases(), true, &_gc_timer); @@ -2702,6 +2759,8 @@ update_deferred_objects(cm, SpaceId(id)); } } + + DEBUG_ONLY(write_block_fill_histogram(gclog_or_tty)); } #ifdef ASSERT @@ -3371,6 +3430,57 @@ } while (true); } +void PSParallelCompact::fill_blocks(size_t region_idx) +{ + // Fill in the block table elements for the specified region. Each block + // table element holds the number of live words in the region that are to the + // left of the first object that starts in the block. Thus only blocks in + // which an object starts need to be filled. + // + // The algorithm scans the section of the bitmap that corresponds to the + // region, keeping a running total of the live words. When an object start is + // found, if it's the first to start in the block that contains it, the + // current total is written to the block table element. + const size_t Log2BlockSize = ParallelCompactData::Log2BlockSize; + const size_t Log2RegionSize = ParallelCompactData::Log2RegionSize; + const size_t RegionSize = ParallelCompactData::RegionSize; + + ParallelCompactData& sd = summary_data(); + const size_t partial_obj_size = sd.region(region_idx)->partial_obj_size(); + if (partial_obj_size >= RegionSize) { + return; // No objects start in this region. + } + + // Ensure the first loop iteration decides that the block has changed. + size_t cur_block = sd.block_count(); + + const ParMarkBitMap* const bitmap = mark_bitmap(); + + const size_t Log2BitsPerBlock = Log2BlockSize - LogMinObjAlignment; + assert((size_t)1 << Log2BitsPerBlock == + bitmap->words_to_bits(ParallelCompactData::BlockSize), "sanity"); + + size_t beg_bit = bitmap->words_to_bits(region_idx << Log2RegionSize); + const size_t range_end = beg_bit + bitmap->words_to_bits(RegionSize); + size_t live_bits = bitmap->words_to_bits(partial_obj_size); + beg_bit = bitmap->find_obj_beg(beg_bit + live_bits, range_end); + while (beg_bit < range_end) { + const size_t new_block = beg_bit >> Log2BitsPerBlock; + if (new_block != cur_block) { + cur_block = new_block; + sd.block(cur_block)->set_offset(bitmap->bits_to_words(live_bits)); + } + + const size_t end_bit = bitmap->find_obj_end(beg_bit, range_end); + if (end_bit < range_end - 1) { + live_bits += end_bit - beg_bit + 1; + beg_bit = bitmap->find_obj_beg(end_bit + 1, range_end); + } else { + return; + } + } +} + void PSParallelCompact::move_and_update(ParCompactionManager* cm, SpaceId space_id) { const MutableSpace* sp = space(space_id); --- old/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.hpp Fri May 31 16:04:11 2013 +++ new/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.hpp Fri May 31 16:04:11 2013 @@ -223,6 +223,17 @@ // Mask for the bits in a pointer to get the address of the start of a region. static const size_t RegionAddrMask; + static const size_t Log2BlockSize; + static const size_t BlockSize; + static const size_t BlockSizeBytes; + + static const size_t BlockSizeOffsetMask; + static const size_t BlockAddrOffsetMask; + static const size_t BlockAddrMask; + + static const size_t BlocksPerRegion; + static const size_t Log2BlocksPerRegion; + class RegionData { public: @@ -275,6 +286,12 @@ inline uint destination_count() const; inline uint destination_count_raw() const; + // Whether the block table for this region has been filled. + inline bool blocks_filled() const; + + // Number of times the block table was filled. + DEBUG_ONLY(inline size_t blocks_filled_count() const;) + // The location of the java heap data that corresponds to this region. inline HeapWord* data_location() const; @@ -299,6 +316,7 @@ void set_partial_obj_size(size_t words) { _partial_obj_size = (region_sz_t) words; } + inline void set_blocks_filled(); inline void set_destination_count(uint count); inline void set_live_obj_size(size_t words); @@ -331,7 +349,11 @@ HeapWord* _partial_obj_addr; region_sz_t _partial_obj_size; region_sz_t volatile _dc_and_los; + bool _blocks_filled; + #ifdef ASSERT + size_t _blocks_filled_count; // Number of block table fills. + // These enable optimizations that are only partially implemented. Use // debug builds to prevent the code fragments from breaking. HeapWord* _data_location; @@ -340,11 +362,26 @@ #ifdef ASSERT public: - uint _pushed; // 0 until region is pushed onto a worker's stack + uint _pushed; // 0 until region is pushed onto a stack private: #endif }; + // "Blocks" allow shorter sections of the bitmap to be searched. Each Block + // holds an offset, which is the amount of live data in the Region to the left + // of the first live object that starts in the Block. + class BlockData + { + public: + typedef unsigned short int blk_ofs_t; + + blk_ofs_t offset() const { return _offset; } + void set_offset(size_t val) { _offset = (blk_ofs_t)val; } + + private: + blk_ofs_t _offset; + }; + public: ParallelCompactData(); bool initialize(MemRegion covered_region); @@ -355,8 +392,9 @@ inline RegionData* region(size_t region_idx) const; inline size_t region(const RegionData* const region_ptr) const; - // Returns true if the given address is contained within the region - bool region_contains(size_t region_index, HeapWord* addr); + size_t block_count() const { return _block_count; } + inline BlockData* block(size_t block_idx) const; + inline size_t block(const BlockData* block_ptr) const; void add_obj(HeapWord* addr, size_t len); void add_obj(oop p, size_t len) { add_obj((HeapWord*)p, len); } @@ -396,11 +434,24 @@ inline HeapWord* region_align_up(HeapWord* addr) const; inline bool is_region_aligned(HeapWord* addr) const; + // Analogous to region_offset() for blocks. + size_t block_offset(const HeapWord* addr) const; + size_t addr_to_block_idx(const HeapWord* addr) const; + size_t addr_to_block_idx(const oop obj) const { + return addr_to_block_idx((HeapWord*) obj); + } + inline BlockData* addr_to_block_ptr(const HeapWord* addr) const; + inline HeapWord* block_to_addr(size_t block) const; + inline size_t region_to_block_idx(size_t region) const; + + inline HeapWord* block_align_down(HeapWord* addr) const; + inline HeapWord* block_align_up(HeapWord* addr) const; + inline bool is_block_aligned(HeapWord* addr) const; + // Return the address one past the end of the partial object. HeapWord* partial_obj_end(size_t region_idx) const; - // Return the new location of the object p after the - // the compaction. + // Return the location of the object after compaction. HeapWord* calc_new_pointer(HeapWord* addr); HeapWord* calc_new_pointer(oop p) { @@ -416,6 +467,7 @@ #endif // #ifdef ASSERT private: + bool initialize_block_data(); bool initialize_region_data(size_t region_size); PSVirtualSpace* create_vspace(size_t count, size_t element_size); @@ -428,6 +480,10 @@ PSVirtualSpace* _region_vspace; RegionData* _region_data; size_t _region_count; + + PSVirtualSpace* _block_vspace; + BlockData* _block_data; + size_t _block_count; }; inline uint @@ -442,7 +498,29 @@ return destination_count_raw() >> dc_shift; } +inline bool +ParallelCompactData::RegionData::blocks_filled() const +{ + return _blocks_filled; +} + +#ifdef ASSERT +inline size_t +ParallelCompactData::RegionData::blocks_filled_count() const +{ + return _blocks_filled_count; +} +#endif // #ifdef ASSERT + inline void +ParallelCompactData::RegionData::set_blocks_filled() +{ + _blocks_filled = true; + // Debug builds count the number of times the table was filled. + DEBUG_ONLY(Atomic::inc_ptr(&_blocks_filled_count)); +} + +inline void ParallelCompactData::RegionData::set_destination_count(uint count) { assert(count <= (dc_completed >> dc_shift), "count too large"); @@ -536,6 +614,12 @@ return pointer_delta(region_ptr, _region_data, sizeof(RegionData)); } +inline ParallelCompactData::BlockData* +ParallelCompactData::block(size_t n) const { + assert(n < block_count(), "bad arg"); + return _block_data + n; +} + inline size_t ParallelCompactData::region_offset(const HeapWord* addr) const { @@ -602,6 +686,63 @@ return region_offset(addr) == 0; } +inline size_t +ParallelCompactData::block_offset(const HeapWord* addr) const +{ + assert(addr >= _region_start, "bad addr"); + assert(addr <= _region_end, "bad addr"); + return (size_t(addr) & BlockAddrOffsetMask) >> LogHeapWordSize; +} + +inline size_t +ParallelCompactData::addr_to_block_idx(const HeapWord* addr) const +{ + assert(addr >= _region_start, "bad addr"); + assert(addr <= _region_end, "bad addr"); + return pointer_delta(addr, _region_start) >> Log2BlockSize; +} + +inline ParallelCompactData::BlockData* +ParallelCompactData::addr_to_block_ptr(const HeapWord* addr) const +{ + return block(addr_to_block_idx(addr)); +} + +inline HeapWord* +ParallelCompactData::block_to_addr(size_t block) const +{ + assert(block < _block_count, "block out of range"); + return _region_start + (block << Log2BlockSize); +} + +inline size_t +ParallelCompactData::region_to_block_idx(size_t region) const +{ + return region << Log2BlocksPerRegion; +} + +inline HeapWord* +ParallelCompactData::block_align_down(HeapWord* addr) const +{ + assert(addr >= _region_start, "bad addr"); + assert(addr < _region_end + RegionSize, "bad addr"); + return (HeapWord*)(size_t(addr) & BlockAddrMask); +} + +inline HeapWord* +ParallelCompactData::block_align_up(HeapWord* addr) const +{ + assert(addr >= _region_start, "bad addr"); + assert(addr <= _region_end, "bad addr"); + return block_align_down(addr + BlockSizeOffsetMask); +} + +inline bool +ParallelCompactData::is_block_aligned(HeapWord* addr) const +{ + return block_offset(addr) == 0; +} + // Abstract closure for use with ParMarkBitMap::iterate(), which will invoke the // do_addr() method. // @@ -779,6 +920,7 @@ // Convenient access to type names. typedef ParMarkBitMap::idx_t idx_t; typedef ParallelCompactData::RegionData RegionData; + typedef ParallelCompactData::BlockData BlockData; typedef enum { perm_space_id, old_space_id, eden_space_id, @@ -987,6 +1129,8 @@ // Serial code executed in preparation for the compaction phase. static void compact_prologue(); + DEBUG_ONLY(static void write_block_fill_histogram(outputStream* const out);) + // Move objects to new locations. static void compact_perm(ParCompactionManager* cm); static void compact(); @@ -1158,6 +1302,9 @@ fill_region(cm, region); } + // Fill in the block table for the specified region. + static void fill_blocks(size_t region_idx); + // Update the deferred objects in the space. static void update_deferred_objects(ParCompactionManager* cm, SpaceId id); --- old/src/share/vm/opto/loopnode.hpp Fri May 31 16:04:13 2013 +++ new/src/share/vm/opto/loopnode.hpp Fri May 31 16:04:13 2013 @@ -961,7 +961,7 @@ // Has use internal to the vector set (ie. not in a phi at the loop head) bool has_use_internal_to_set( Node* n, VectorSet& vset, IdealLoopTree *loop ); // clone "n" for uses that are outside of loop - void clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, Node_List& worklist ); + int clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, Node_List& worklist ); // clone "n" for special uses that are in the not_peeled region void clone_for_special_use_inside_loop( IdealLoopTree *loop, Node* n, VectorSet& not_peel, Node_List& sink_list, Node_List& worklist ); --- old/src/share/vm/opto/loopopts.cpp Fri May 31 16:04:15 2013 +++ new/src/share/vm/opto/loopopts.cpp Fri May 31 16:04:15 2013 @@ -1926,8 +1926,8 @@ //------------------------------ clone_for_use_outside_loop ------------------------------------- // clone "n" for uses that are outside of loop -void PhaseIdealLoop::clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, Node_List& worklist ) { - +int PhaseIdealLoop::clone_for_use_outside_loop( IdealLoopTree *loop, Node* n, Node_List& worklist ) { + int cloned = 0; assert(worklist.size() == 0, "should be empty"); for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) { Node* use = n->fast_out(j); @@ -1947,6 +1947,7 @@ // clone "n" and insert it between the inputs of "n" and the use outside the loop Node* n_clone = n->clone(); _igvn.replace_input_of(use, j, n_clone); + cloned++; Node* use_c; if (!use->is_Phi()) { use_c = has_ctrl(use) ? get_ctrl(use) : use->in(0); @@ -1964,6 +1965,7 @@ } #endif } + return cloned; } @@ -2482,6 +2484,7 @@ // Evacuate nodes in peel region into the not_peeled region if possible uint new_phi_cnt = 0; + uint cloned_for_outside_use = 0; for (i = 0; i < peel_list.size();) { Node* n = peel_list.at(i); #if !defined(PRODUCT) @@ -2500,8 +2503,7 @@ // if not pinned and not a load (which maybe anti-dependent on a store) // and not a CMove (Matcher expects only bool->cmove). if ( n->in(0) == NULL && !n->is_Load() && !n->is_CMove() ) { - clone_for_use_outside_loop( loop, n, worklist ); - + cloned_for_outside_use += clone_for_use_outside_loop( loop, n, worklist ); sink_list.push(n); peel >>= n->_idx; // delete n from peel set. not_peel <<= n->_idx; // add n to not_peel set. @@ -2538,6 +2540,12 @@ // Inhibit more partial peeling on this loop assert(!head->is_partial_peel_loop(), "not partial peeled"); head->mark_partial_peel_failed(); + if (cloned_for_outside_use > 0) { + // Terminate this round of loop opts because + // the graph outside this loop was changed. + C->set_major_progress(); + return true; + } return false; } --- old/src/share/vm/prims/jvmtiImpl.cpp Fri May 31 16:04:17 2013 +++ new/src/share/vm/prims/jvmtiImpl.cpp Fri May 31 16:04:17 2013 @@ -372,9 +372,6 @@ case CLEAR_BREAKPOINT: _breakpoints->clear_at_safepoint(*_bp); break; - case CLEAR_ALL_BREAKPOINT: - _breakpoints->clearall_at_safepoint(); - break; default: assert(false, "Unknown operation"); } @@ -381,10 +378,8 @@ } void VM_ChangeBreakpoints::oops_do(OopClosure* f) { - // This operation keeps breakpoints alive - if (_breakpoints != NULL) { - _breakpoints->oops_do(f); - } + // The JvmtiBreakpoints in _breakpoints will be visited via + // JvmtiExport::oops_do. if (_bp != NULL) { _bp->oops_do(f); } @@ -445,16 +440,6 @@ } } -void JvmtiBreakpoints::clearall_at_safepoint() { - assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); - - int len = _bps.length(); - for (int i=0; i 0) { // substaction works with unsigned values + // OldGen is placed before youngger for ParallelOldGC. + upper_limit = low_limit + 21000000l; // +20971520 + } + // Each A[one_card] size is 512 bytes, + // it will take about 40000 allocations to trigger GC. + // cache[] has 8192 elements so GC should happen + // each 5th iteration. + for(long l = 0; l < 20; l++) { + fill_heap(); + if (debug) { + System.out.println("test oop_disjoint_arraycopy"); + } + testA_arraycopy(); + if (debug) { + System.out.println("test checkcast_arraycopy"); + } + testB_arraycopy(); + // Execute arraycopy to the topmost array in young gen + if (debug) { + int top_index = get_top_address(low_limit, upper_limit); + if (top_index >= 0) { + long addr = wb.getObjectAddress(cache[top_index]); + System.out.println("top_addr: 0x" + Long.toHexString(addr) + ", 0x" + Long.toHexString(addr + 512)); + } + } + } + } + static void fill_heap() { + for (int i = 0; i < cache.length; ++i) { + o = new Test8010927[one_card]; + System.arraycopy(masterA, 0, o, 0, masterA.length); + cache[i] = o; + } + for (long j = 0; j < 256; ++j) { + o = new Long[10000]; // to trigger GC + } + } + static void testA_arraycopy() { + for (int i = 0; i < cache.length; ++i) { + System.arraycopy(masterA, 0, cache[i], 0, masterA.length); + } + } + static void testB_arraycopy() { + for (int i = 0; i < cache.length; ++i) { + System.arraycopy(masterB, 0, cache[i], 0, masterB.length); + } + } + static int get_top_address(long min, long max) { + int index = -1; + long addr = min; + for (int i = 0; i < cache.length; ++i) { + long test = wb.getObjectAddress(cache[i]); + if (((test - addr) > 0) && ((max - test) > 0)) { // substaction works with unsigned values + addr = test; + index = i; + } + } + return index; + } +}