--- old/src/share/vm/gc_implementation/g1/concurrentMark.cpp 2011-10-03 15:48:19.522881583 -0700 +++ new/src/share/vm/gc_implementation/g1/concurrentMark.cpp 2011-10-03 15:48:19.309048776 -0700 @@ -473,6 +473,7 @@ _card_bm((rs.size() + CardTableModRefBS::card_size - 1) >> CardTableModRefBS::card_shift, false /* in_resource_area*/), + _prevMarkBitMap(&_markBitMap1), _nextMarkBitMap(&_markBitMap2), _at_least_one_mark_complete(false), @@ -502,7 +503,11 @@ _total_counting_time(0.0), _total_rs_scrub_time(0.0), - _parallel_workers(NULL) { + _parallel_workers(NULL), + + _count_card_bitmaps(NULL), + _count_marked_bytes(NULL) +{ CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel; if (verbose_level < no_verbose) { verbose_level = no_verbose; @@ -536,6 +541,11 @@ _tasks = NEW_C_HEAP_ARRAY(CMTask*, _max_task_num); _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_task_num); + _count_card_bitmaps = NEW_C_HEAP_ARRAY(BitMap, _max_task_num); + _count_marked_bytes = NEW_C_HEAP_ARRAY(size_t*, _max_task_num); + + BitMap::idx_t card_bm_size = _card_bm.size(); + // so that the assertion in MarkingTaskQueue::task_queue doesn't fail _active_tasks = _max_task_num; for (int i = 0; i < (int) _max_task_num; ++i) { @@ -545,6 +555,9 @@ _tasks[i] = new CMTask(i, this, task_queue, _task_queues); _accum_task_vtime[i] = 0.0; + + _count_card_bitmaps[i] = BitMap(card_bm_size, false); + _count_marked_bytes[i] = NEW_C_HEAP_ARRAY(size_t, max_regions); } if (ConcGCThreads > ParallelGCThreads) { @@ -666,6 +679,8 @@ // reset all the marking data structures and any necessary flags clear_marking_state(); + clear_all_count_data(); + if (verbose_low()) { gclog_or_tty->print_cr("[global] resetting"); } @@ -721,9 +736,17 @@ for (int i = 0; i < (int) _max_task_num; ++i) { delete _task_queues->queue(i); delete _tasks[i]; + + _count_card_bitmaps[i].resize(0, false); + FREE_C_HEAP_ARRAY(size_t, _count_marked_bytes[i]); } + delete _task_queues; - FREE_C_HEAP_ARRAY(CMTask*, _max_task_num); + FREE_C_HEAP_ARRAY(CMTask*, _tasks); + FREE_C_HEAP_ARRAY(double, _accum_task_vtime); + + FREE_C_HEAP_ARRAY(BitMap*, _count_card_bitmaps); + FREE_C_HEAP_ARRAY(size_t*, _count_marked_bytes); } // This closure is used to mark refs into the g1 generation @@ -942,7 +965,7 @@ } #endif // !PRODUCT -void ConcurrentMark::grayRoot(oop p) { +void ConcurrentMark::grayRoot(oop p, int worker_i) { HeapWord* addr = (HeapWord*) p; // We can't really check against _heap_start and _heap_end, since it // is possible during an evacuation pause with piggy-backed @@ -953,7 +976,10 @@ "address should be within the heap bounds"); if (!_nextMarkBitMap->isMarked(addr)) { - _nextMarkBitMap->parMark(addr); + if (_nextMarkBitMap->parMark(addr)) { + // Update the task specific count data for object p. + add_to_count_data_for(p, worker_i); + } } } @@ -1002,7 +1028,7 @@ } } -void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) { +void ConcurrentMark::markAndGrayObjectIfNecessary(oop p, int worker_i) { // The object is not marked by the caller. We need to at least mark // it and maybe push in on the stack. @@ -1011,6 +1037,9 @@ // We definitely need to mark it, irrespective whether we bail out // because we're done with marking. if (_nextMarkBitMap->parMark(addr)) { + // Update the task specific count data for object p + add_to_count_data_for(p, worker_i); + if (!concurrent_marking_in_progress() || !_should_gray_objects) { // If we're done with concurrent marking and we're waiting for // remark, then we're not pushing anything on the stack. @@ -1173,6 +1202,10 @@ gclog_or_tty->print_cr("\nRemark led to restart for overflow."); } } else { + // Aggregate the per-task counting data that we have accumulated + // while marking. + aggregate_all_count_data(); + SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set(); // We're done with marking. // This is the end of the marking cycle, we're expected all @@ -1212,41 +1245,42 @@ #define CARD_BM_TEST_MODE 0 +// Used to calculate the # live objects per region +// for verification purposes class CalcLiveObjectsClosure: public HeapRegionClosure { CMBitMapRO* _bm; ConcurrentMark* _cm; - bool _changed; - bool _yield; - size_t _words_done; + BitMap* _region_bm; + BitMap* _card_bm; + + size_t _tot_words_done; size_t _tot_live; size_t _tot_used; - size_t _regions_done; - double _start_vtime_sec; - BitMap* _region_bm; - BitMap* _card_bm; + size_t _region_marked_bytes; + intptr_t _bottom_card_num; - bool _final; void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) { - for (intptr_t i = start_card_num; i <= last_card_num; i++) { + BitMap::idx_t start_idx = start_card_num - _bottom_card_num; + BitMap::idx_t last_idx = last_card_num - _bottom_card_num; + + for (BitMap::idx_t i = start_idx; i <= last_idx; i += 1) { #if CARD_BM_TEST_MODE - guarantee(_card_bm->at(i - _bottom_card_num), "Should already be set."); + guarantee(_card_bm->at(i), "Should already be set."); #else - _card_bm->par_at_put(i - _bottom_card_num, 1); + _card_bm->par_at_put(i, 1); #endif } } public: - CalcLiveObjectsClosure(bool final, - CMBitMapRO *bm, ConcurrentMark *cm, + CalcLiveObjectsClosure(CMBitMapRO *bm, ConcurrentMark *cm, BitMap* region_bm, BitMap* card_bm) : - _bm(bm), _cm(cm), _changed(false), _yield(true), - _words_done(0), _tot_live(0), _tot_used(0), - _region_bm(region_bm), _card_bm(card_bm),_final(final), - _regions_done(0), _start_vtime_sec(0.0) + _bm(bm), _cm(cm), _region_bm(region_bm), _card_bm(card_bm), + _region_marked_bytes(0), _tot_words_done(0), + _tot_live(0), _tot_used(0) { _bottom_card_num = intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> @@ -1287,9 +1321,6 @@ } bool doHeapRegion(HeapRegion* hr) { - if (!_final && _regions_done == 0) { - _start_vtime_sec = os::elapsedVTime(); - } if (hr->continuesHumongous()) { // We will ignore these here and process them when their @@ -1303,48 +1334,40 @@ } HeapWord* nextTop = hr->next_top_at_mark_start(); - HeapWord* start = hr->top_at_conc_mark_count(); - assert(hr->bottom() <= start && start <= hr->end() && - hr->bottom() <= nextTop && nextTop <= hr->end() && - start <= nextTop, - "Preconditions."); - // Otherwise, record the number of word's we'll examine. + HeapWord* start = hr->bottom(); + + assert(start <= hr->end() && start <= nextTop && nextTop <= hr->end(), + "Preconditions."); + + // Record the number of word's we'll examine. size_t words_done = (nextTop - start); + // Find the first marked object at or after "start". start = _bm->getNextMarkedWordAddress(start, nextTop); + size_t marked_bytes = 0; + _region_marked_bytes = 0; // Below, the term "card num" means the result of shifting an address // by the card shift -- address 0 corresponds to card number 0. One // must subtract the card num of the bottom of the heap to obtain a // card table index. + // The first card num of the sequence of live cards currently being // constructed. -1 ==> no sequence. intptr_t start_card_num = -1; + // The last card num of the sequence of live cards currently being // constructed. -1 ==> no sequence. intptr_t last_card_num = -1; while (start < nextTop) { - if (_yield && _cm->do_yield_check()) { - // We yielded. It might be for a full collection, in which case - // all bets are off; terminate the traversal. - if (_cm->has_aborted()) { - _changed = false; - return true; - } else { - // Otherwise, it might be a collection pause, and the region - // we're looking at might be in the collection set. We'll - // abandon this region. - return false; - } - } oop obj = oop(start); int obj_sz = obj->size(); + // The card num of the start of the current object. intptr_t obj_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift); - HeapWord* obj_last = start + obj_sz - 1; intptr_t obj_last_card_num = intptr_t(uintptr_t(obj_last) >> CardTableModRefBS::card_shift); @@ -1377,78 +1400,440 @@ last_card_num = obj_last_card_num; marked_bytes += (size_t)obj_sz * HeapWordSize; + // Find the next marked object after this one. start = _bm->getNextMarkedWordAddress(start + 1, nextTop); - _changed = true; } + // Handle the last range, if any. if (start_card_num != -1) { mark_card_num_range(start_card_num, last_card_num); } - if (_final) { - // Mark the allocated-since-marking portion... - HeapWord* tp = hr->top(); - if (nextTop < tp) { - start_card_num = - intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift); - last_card_num = - intptr_t(uintptr_t(tp) >> CardTableModRefBS::card_shift); - mark_card_num_range(start_card_num, last_card_num); - // This definitely means the region has live objects. - set_bit_for_region(hr); - } + + // Mark the allocated-since-marking portion... + HeapWord* top = hr->top(); + if (nextTop < top) { + start_card_num = intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift); + last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift); + + mark_card_num_range(start_card_num, last_card_num); + + // This definitely means the region has live objects. + set_bit_for_region(hr); } - hr->add_to_marked_bytes(marked_bytes); // Update the live region bitmap. if (marked_bytes > 0) { set_bit_for_region(hr); } - hr->set_top_at_conc_mark_count(nextTop); + + // Set the marked bytes for the current region so that + // it can be queried by a calling verificiation routine + _region_marked_bytes = marked_bytes; + _tot_live += hr->next_live_bytes(); _tot_used += hr->used(); - _words_done = words_done; + _tot_words_done = words_done; - if (!_final) { - ++_regions_done; - if (_regions_done % 10 == 0) { - double end_vtime_sec = os::elapsedVTime(); - double elapsed_vtime_sec = end_vtime_sec - _start_vtime_sec; - if (elapsed_vtime_sec > (10.0 / 1000.0)) { - jlong sleep_time_ms = - (jlong) (elapsed_vtime_sec * _cm->cleanup_sleep_factor() * 1000.0); - os::sleep(Thread::current(), sleep_time_ms, false); - _start_vtime_sec = end_vtime_sec; - } + return false; + } + + size_t region_marked_bytes() const { return _region_marked_bytes; } + size_t tot_words_done() const { return _tot_words_done; } + size_t tot_live() const { return _tot_live; } + size_t tot_used() const { return _tot_used; } +}; + +// Aggregate the counting data that was constructed concurrently +// with marking. +class AddToMarkedBytesClosure: public HeapRegionClosure { + ConcurrentMark* _cm; + size_t _task_num; + size_t _max_task_num; + + bool _final; + +public: + AddToMarkedBytesClosure(ConcurrentMark *cm, + size_t task_num, + size_t max_task_num) : + _cm(cm), + _task_num(task_num), + _max_task_num(max_task_num), + _final(false) + { + assert(0 <= _task_num && _task_num < _max_task_num, "sanity"); + if ((_max_task_num - _task_num) == 1) { + // Last task + _final = true; + } + } + + bool doHeapRegion(HeapRegion* hr) { + // Adds the value in the counted marked bytes array for + // _task_num for region hr to the value cached in heap + // region itself. + // For the final task we also set the top at conc count + // for the region. + // The bits in the live region bitmap are set for regions + // that contain live data during the cleanup pause. + + if (hr->continuesHumongous()) { + // We will ignore these here and process them when their + // associated "starts humongous" region is processed. + // Note that we cannot rely on their associated + // "starts humongous" region to have their bit set to 1 + // since, due to the region chunking in the parallel region + // iteration, a "continues humongous" region might be visited + // before its associated "starts humongous". + return false; + } + + int hrs_index = hr->hrs_index(); + size_t* marked_bytes_array = _cm->count_marked_bytes_for(_task_num); + size_t marked_bytes = marked_bytes_array[hrs_index]; + hr->add_to_marked_bytes(marked_bytes); + + if (_final) { + HeapWord* ntams = hr->next_top_at_mark_start(); + HeapWord* start = hr->bottom(); + + assert(start <= ntams && ntams <= hr->top() && hr->top() <= hr->end(), + "Preconditions."); + + hr->set_top_at_conc_mark_count(ntams); + } + + return false; + } +}; + +void ConcurrentMark::aggregate_all_count_data() { + _card_bm.clear(); + + // Unions the per task card bitmaps into the global card bitmap, + // and aggregates the per task marked bytes for each region into + // the heap region itself. + + for (int i = 0; i < _max_task_num; i += 1) { + BitMap& task_card_bm = count_card_bitmap_for(i); + _card_bm.set_union(task_card_bm); + + // Update the marked bytes for each region + AddToMarkedBytesClosure cl(this, i, _max_task_num); + _g1h->heap_region_iterate(&cl); + } + + // We're done with the accumulated per-task concurrent + // counting data so let's clear it for the next marking. + clear_all_count_data(); +} + +// Final update of count data (during cleanup). +// Adds [top_at_count, NTAMS) to the marked bytes for each +// region. Sets the bits in the card bitmap corresponding +// to the interval [top_at_count, top], and sets the +// liveness bit for each region containing live data +// in the region bitmap. + +class FinalCountDataUpdateClosure: public HeapRegionClosure { + ConcurrentMark* _cm; + BitMap* _region_bm; + BitMap* _card_bm; + intptr_t _bottom_card_num; + + size_t _total_live_bytes; + size_t _total_used_bytes; + size_t _total_words_done; + + void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) { + BitMap::idx_t start_idx = start_card_num - _bottom_card_num; + BitMap::idx_t last_idx = last_card_num - _bottom_card_num; + + // Inclusive bit range [start_idx, last_idx]. par_at_put_range + // is exclusive so we have to also set the bit for last_idx. + // Passing last_idx+1 to the clear_range would work in + // most cases but could trip an OOB assertion. + + if ((last_idx - start_idx) > 0) { + _card_bm->par_at_put_range(start_idx, last_idx, true); + } + _card_bm->par_set_bit(last_idx); + } + + // It takes a region that's not empty (i.e., it has at least one + // live object in it and sets its corresponding bit on the region + // bitmap to 1. If the region is "starts humongous" it will also set + // to 1 the bits on the region bitmap that correspond to its + // associated "continues humongous" regions. + void set_bit_for_region(HeapRegion* hr) { + assert(!hr->continuesHumongous(), "should have filtered those out"); + + size_t index = hr->hrs_index(); + if (!hr->startsHumongous()) { + // Normal (non-humongous) case: just set the bit. + _region_bm->par_set_bit((BitMap::idx_t) index); + } else { + // Starts humongous case: calculate how many regions are part of + // this humongous region and then set the bit range. It might + // have been a bit more efficient to look at the object that + // spans these humongous regions to calculate their number from + // the object's size. However, it's a good idea to calculate + // this based on the metadata itself, and not the region + // contents, so that this code is not aware of what goes into + // the humongous regions (in case this changes in the future). + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + size_t end_index = index + 1; + while (end_index < g1h->n_regions()) { + HeapRegion* chr = g1h->region_at(end_index); + if (!chr->continuesHumongous()) break; + end_index += 1; } + _region_bm->par_at_put_range((BitMap::idx_t) index, + (BitMap::idx_t) end_index, true); } + } + + public: + FinalCountDataUpdateClosure(ConcurrentMark* cm, + BitMap* region_bm, + BitMap* card_bm) : + _cm(cm), _region_bm(region_bm), _card_bm(card_bm), + _total_words_done(0), _total_live_bytes(0), _total_used_bytes(0) + { + _bottom_card_num = + intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> + CardTableModRefBS::card_shift); + } + + bool doHeapRegion(HeapRegion* hr) { + + if (hr->continuesHumongous()) { + // We will ignore these here and process them when their + // associated "starts humongous" region is processed (see + // set_bit_for_heap_region()). Note that we cannot rely on their + // associated "starts humongous" region to have their bit set to + // 1 since, due to the region chunking in the parallel region + // iteration, a "continues humongous" region might be visited + // before its associated "starts humongous". + return false; + } + + HeapWord* start = hr->top_at_conc_mark_count(); + HeapWord* ntams = hr->next_top_at_mark_start(); + HeapWord* top = hr->top(); + + assert(hr->bottom() <= start && start <= hr->end() && + hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions."); + + size_t words_done = ntams - hr->bottom(); + + intptr_t start_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift); + intptr_t last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift); + + + if (start < ntams) { + // Region was changed between remark and cleanup pauses + // We need to add (ntams - start) to the marked bytes + // for this region, and set bits for the range + // [ card_num(start), card_num(ntams) ) in the + // card bitmap. + size_t live_bytes = (ntams - start) * HeapWordSize; + hr->add_to_marked_bytes(live_bytes); + + // Record the new top at conc count + hr->set_top_at_conc_mark_count(ntams); + + // The setting of the bits card bitmap takes place below + } + + // Mark the allocated-since-marking portion... + if (ntams < top) { + // This definitely means the region has live objects. + set_bit_for_region(hr); + } + + // Now set the bits for [start, top] + mark_card_num_range(start_card_num, last_card_num); + + // Set the bit for the region if it contains live data + if (hr->next_marked_bytes() > 0) { + set_bit_for_region(hr); + } + + _total_words_done += words_done; + _total_used_bytes += hr->used(); + _total_live_bytes += hr->next_marked_bytes(); return false; } - bool changed() { return _changed; } - void reset() { _changed = false; _words_done = 0; } - void no_yield() { _yield = false; } - size_t words_done() { return _words_done; } - size_t tot_live() { return _tot_live; } - size_t tot_used() { return _tot_used; } + size_t total_words_done() const { return _total_words_done; } + size_t total_live_bytes() const { return _total_live_bytes; } + size_t total_used_bytes() const { return _total_used_bytes; } }; +// Heap region closure used for verifying the counting data +// that was accumulated concurrently and aggregated during +// the remark pause. This closure is applied to the heap +// regions during the STW cleanup pause. -void ConcurrentMark::calcDesiredRegions() { - _region_bm.clear(); - _card_bm.clear(); - CalcLiveObjectsClosure calccl(false /*final*/, - nextMarkBitMap(), this, - &_region_bm, &_card_bm); - G1CollectedHeap *g1h = G1CollectedHeap::heap(); - g1h->heap_region_iterate(&calccl); +class VerifyLiveObjectDataHRClosure: public HeapRegionClosure { + ConcurrentMark* _cm; + CalcLiveObjectsClosure _calc_cl; + BitMap* _region_bm; // Region BM to be verified + BitMap* _card_bm; // Card BM to be verified + bool _verbose; // verbose output? - do { - calccl.reset(); - g1h->heap_region_iterate(&calccl); - } while (calccl.changed()); -} + BitMap* _exp_region_bm; // Expected Region BM values + BitMap* _exp_card_bm; // Expected card BM values + + intptr_t _bottom_card_num; // Used for calculatint bitmap indices + + int _failures; + +public: + VerifyLiveObjectDataHRClosure(ConcurrentMark* cm, + BitMap* region_bm, + BitMap* card_bm, + BitMap* exp_region_bm, + BitMap* exp_card_bm, + bool verbose) : + _cm(cm), + _calc_cl(_cm->nextMarkBitMap(), _cm, exp_region_bm, exp_card_bm), + _region_bm(region_bm), _card_bm(card_bm), _verbose(verbose), + _exp_region_bm(exp_region_bm), _exp_card_bm(exp_card_bm), + _failures(0) + { + _bottom_card_num = + intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> + CardTableModRefBS::card_shift); + } + + int failures() const { return _failures; } + + bool doHeapRegion(HeapRegion* hr) { + if (hr->continuesHumongous()) { + // We will ignore these here and process them when their + // associated "starts humongous" region is processed (see + // set_bit_for_heap_region()). Note that we cannot rely on their + // associated "starts humongous" region to have their bit set to + // 1 since, due to the region chunking in the parallel region + // iteration, a "continues humongous" region might be visited + // before its associated "starts humongous". + return false; + } + + // Call the CalcLiveObjectsClosure to walk the marking bitmap for + // this region and set the corresponding bits in the expected region + // and card bitmaps. + bool res = _calc_cl.doHeapRegion(hr); + assert(res == false, "should be continuing"); + + // Note that the calculated count data could be a subset of the + // count data that was accumlated during marking. See the comment + // in G1ParCopyHelper::copy_to_survivor space for an explanation + // why. + + if (_verbose) { + gclog_or_tty->print("Region %d: bottom: "PTR_FORMAT", ntams: " + PTR_FORMAT", top: "PTR_FORMAT", end: "PTR_FORMAT, + hr->hrs_index(), hr->bottom(), hr->next_top_at_mark_start(), + hr->top(), hr->end()); + gclog_or_tty->print_cr(", marked_bytes: calc/actual "SIZE_FORMAT"/"SIZE_FORMAT, + _calc_cl.region_marked_bytes(), + hr->next_marked_bytes()); + } + + // Verify that _top_at_conc_count == ntams + if (hr->top_at_conc_mark_count() != hr->next_top_at_mark_start()) { + if (_verbose) { + gclog_or_tty->print_cr("Region %d: top at conc count incorrect: expected " + PTR_FORMAT", actual: "PTR_FORMAT, + hr->hrs_index(), hr->next_top_at_mark_start(), + hr->top_at_conc_mark_count()); + } + _failures += 1; + } + + // Verify the marked bytes for this region. + size_t exp_marked_bytes = _calc_cl.region_marked_bytes(); + size_t act_marked_bytes = hr->next_marked_bytes(); + + // We're OK if actual marked bytes >= expected. + if (exp_marked_bytes > act_marked_bytes) { + if (_verbose) { + gclog_or_tty->print_cr("Region %d: marked bytes mismatch: expected: " + SIZE_FORMAT", actual: "SIZE_FORMAT, + hr->hrs_index(), exp_marked_bytes, act_marked_bytes); + } + _failures += 1; + } + + // Verify the bit, for this region, in the actual and expected + // (which was just calculated) region bit maps. + // We're not OK if the expected bit is set and the actual is not set. + BitMap::idx_t index = (BitMap::idx_t)hr->hrs_index(); + + bool expected = _exp_region_bm->at(index); + bool actual = _region_bm->at(index); + if (expected && !actual) { + if (_verbose) { + gclog_or_tty->print_cr("Region %d: region bitmap mismatch: expected: %d, actual: %d", + hr->hrs_index(), expected, actual); + } + _failures += 1; + } + + // Verify that the card bit maps for the cards spanned by the current + // region match. The set of offsets that have set bits in the expected + // bitmap should be a subset of the offsets with set bits from the actual + // calculated card bitmap. + // Again it's more important that if the expected bit is set then the + // actual bit be set. + intptr_t start_card_num = + intptr_t(uintptr_t(hr->bottom()) >> CardTableModRefBS::card_shift); + intptr_t top_card_num = + intptr_t(uintptr_t(hr->top()) >> CardTableModRefBS::card_shift); + + BitMap::idx_t start_idx = start_card_num - _bottom_card_num; + BitMap::idx_t end_idx = top_card_num - _bottom_card_num; + + for (BitMap::idx_t i = start_idx; i < end_idx; i+=1) { + expected = _exp_card_bm->at(i); + actual = _card_bm->at(i); + + if (expected && !actual) { + if (_verbose) { + gclog_or_tty->print_cr("Region %d: card bitmap mismatch at idx %d: expected: %d, actual: %d", + hr->hrs_index(), i, expected, actual); + } + _failures += 1; + } + } + if (_failures) { + // Stop iteration? + return true; + } + + return false; + } +}; + +class Mux2HRClosure: public HeapRegionClosure { + HeapRegionClosure* _cl1; + HeapRegionClosure* _cl2; + +public: + Mux2HRClosure(HeapRegionClosure *c1, HeapRegionClosure *c2) : _cl1(c1), _cl2(c2) { } + bool doHeapRegion(HeapRegion* hr) { + bool res1 = _cl1->doHeapRegion(hr); + bool res2 = _cl2->doHeapRegion(hr); + + // Only continue if both return false; + return res1 || res2; + } +}; class G1ParFinalCountTask: public AbstractGangTask { protected: @@ -1457,56 +1842,97 @@ size_t _n_workers; size_t *_live_bytes; size_t *_used_bytes; - BitMap* _region_bm; - BitMap* _card_bm; + + BitMap* _actual_region_bm; + BitMap* _actual_card_bm; + + BitMap _expected_region_bm; + BitMap _expected_card_bm; + + int _failures; + public: G1ParFinalCountTask(G1CollectedHeap* g1h, CMBitMap* bm, BitMap* region_bm, BitMap* card_bm) - : AbstractGangTask("G1 final counting"), _g1h(g1h), - _bm(bm), _region_bm(region_bm), _card_bm(card_bm) { + : AbstractGangTask("G1 final counting"), + _g1h(g1h), _bm(bm), + _actual_region_bm(region_bm), _actual_card_bm(card_bm), + _expected_region_bm(0, false), _expected_card_bm(0, false), + _failures(0) + { if (ParallelGCThreads > 0) { _n_workers = _g1h->workers()->total_workers(); } else { _n_workers = 1; } + _live_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers); _used_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers); + + if (VerifyDuringGC) { + _expected_card_bm.resize(_actual_card_bm->size(), false); + _expected_region_bm.resize(_actual_region_bm->size(), false); + } } ~G1ParFinalCountTask() { + if (VerifyDuringGC) { + _expected_region_bm.resize(0); + _expected_card_bm.resize(0); + } FREE_C_HEAP_ARRAY(size_t, _live_bytes); FREE_C_HEAP_ARRAY(size_t, _used_bytes); } void work(int i) { - CalcLiveObjectsClosure calccl(true /*final*/, - _bm, _g1h->concurrent_mark(), - _region_bm, _card_bm); - calccl.no_yield(); + + FinalCountDataUpdateClosure final_update_cl(_g1h->concurrent_mark(), + _actual_region_bm, _actual_card_bm); + + VerifyLiveObjectDataHRClosure verify_cl(_g1h->concurrent_mark(), + _actual_region_bm, _actual_card_bm, + &_expected_region_bm, + &_expected_card_bm, + true /* verbose */); + + Mux2HRClosure update_and_verify_cl(&final_update_cl, &verify_cl); + + HeapRegionClosure* hr_cl = &final_update_cl; + if (VerifyDuringGC) { + hr_cl = &update_and_verify_cl; + } + if (G1CollectedHeap::use_parallel_gc_threads()) { - _g1h->heap_region_par_iterate_chunked(&calccl, i, + _g1h->heap_region_par_iterate_chunked(hr_cl, i, HeapRegion::FinalCountClaimValue); } else { - _g1h->heap_region_iterate(&calccl); + _g1h->heap_region_iterate(hr_cl); } - assert(calccl.complete(), "Shouldn't have yielded!"); assert((size_t) i < _n_workers, "invariant"); - _live_bytes[i] = calccl.tot_live(); - _used_bytes[i] = calccl.tot_used(); + _live_bytes[i] = final_update_cl.total_live_bytes(); + _used_bytes[i] = final_update_cl.total_used_bytes(); + + if (VerifyDuringGC) { + _failures += verify_cl.failures(); + } } + size_t live_bytes() { size_t live_bytes = 0; for (size_t i = 0; i < _n_workers; ++i) live_bytes += _live_bytes[i]; return live_bytes; } + size_t used_bytes() { size_t used_bytes = 0; for (size_t i = 0; i < _n_workers; ++i) used_bytes += _used_bytes[i]; return used_bytes; } + + int failures() const { return _failures; } }; class G1ParNoteEndTask; @@ -1707,12 +2133,16 @@ HeapRegionRemSet::reset_for_cleanup_tasks(); + // Clear the global region bitmap - it will be filled as part + // of the final counting task. + _region_bm.clear(); + // Do counting once more with the world stopped for good measure. G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(), &_region_bm, &_card_bm); + if (G1CollectedHeap::use_parallel_gc_threads()) { - assert(g1h->check_heap_region_claim_values( - HeapRegion::InitialClaimValue), + assert(g1h->check_heap_region_claim_values(HeapRegion::InitialClaimValue), "sanity check"); int n_workers = g1h->workers()->total_workers(); @@ -1720,13 +2150,18 @@ g1h->workers()->run_task(&g1_par_count_task); g1h->set_par_threads(0); - assert(g1h->check_heap_region_claim_values( - HeapRegion::FinalCountClaimValue), + assert(g1h->check_heap_region_claim_values(HeapRegion::FinalCountClaimValue), "sanity check"); } else { g1_par_count_task.work(0); } + // Verify that there were no verification failures of + // the live counting data. + if (VerifyDuringGC) { + assert(g1_par_count_task.failures() == 0, "Unexpected failures"); + } + size_t known_garbage_bytes = g1_par_count_task.used_bytes() - g1_par_count_task.live_bytes(); g1p->set_known_garbage_bytes(known_garbage_bytes); @@ -1929,7 +2364,10 @@ G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm, CMBitMap* bitMap) : _g1(g1), _cm(cm), - _bitMap(bitMap) {} + _bitMap(bitMap) + { + assert(Thread::current()->is_VM_thread(), "otherwise fix worker id"); + } virtual void do_oop(narrowOop* p) { do_oop_work(p); } virtual void do_oop( oop* p) { do_oop_work(p); } @@ -1946,6 +2384,9 @@ if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) { _bitMap->mark(addr); + // Update the task specific count data for obj + _cm->add_to_count_data_for(obj, 0 /* worker_i */); + _cm->mark_stack_push(obj); } } @@ -2596,6 +3037,9 @@ // we need to mark it first if (_nextMarkBitMap->parMark(objAddr)) { + // Update the task specific count data for obj + add_to_count_data_for(obj, hr, 0 /* worker_i */); + // No OrderAccess:store_load() is needed. It is implicit in the // CAS done in parMark(objAddr) above HeapWord* finger = _finger; @@ -2842,6 +3286,188 @@ } } +// Clear the per-worker arrays used to store the per-region counting data +void ConcurrentMark::clear_all_count_data() { + assert(SafepointSynchronize::is_at_safepoint() || + !Universe::is_fully_initialized(), "must be"); + + int max_regions = _g1h->max_regions(); + + assert(_max_task_num != 0, "unitialized"); + assert(_count_card_bitmaps != NULL, "uninitialized"); + assert(_count_marked_bytes != NULL, "uninitialized"); + + for (int i = 0; i < _max_task_num; i += 1) { + BitMap& task_card_bm = count_card_bitmap_for(i); + size_t* marked_bytes_array = count_marked_bytes_for(i); + + assert(task_card_bm.size() == _card_bm.size(), "size mismatch"); + assert(marked_bytes_array != NULL, "uninitialized"); + + for (int j = 0; j < max_regions; j++) { + marked_bytes_array[j] = 0; + } + task_card_bm.clear(); + } +} + +// Adds the given region to the counting data structures +// for the given task id. +void ConcurrentMark::add_to_count_data_for(MemRegion mr, + HeapRegion* hr, + int worker_i) { + G1CollectedHeap* g1h = _g1h; + HeapWord* start = mr.start(); + HeapWord* last = mr.last(); + size_t index = hr->hrs_index(); + + assert(!hr->continuesHumongous(), "should not be HC region"); + assert(hr == g1h->heap_region_containing(start), "sanity"); + assert(hr == g1h->heap_region_containing(mr.last()), "sanity"); + assert(0 <= worker_i && worker_i < _max_task_num, "oob"); + + BitMap& task_card_bm = count_card_bitmap_for(worker_i); + size_t* marked_bytes_array = count_marked_bytes_for(worker_i); + + // Below, the term "card num" means the result of shifting an address + // by the card shift -- address 0 corresponds to card number 0. One + // must subtract the card num of the bottom of the heap to obtain a + // card table index. + + intptr_t start_card_num = + intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift); + intptr_t last_card_num = + intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift); + + intptr_t bottom_card_num = + intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> + CardTableModRefBS::card_shift); + + BitMap::idx_t start_idx = start_card_num - bottom_card_num; + BitMap::idx_t last_idx = last_card_num - bottom_card_num; + + // The card bitmap is task/worker specific => no need to use 'par' routines. + // Inclusive bit range [start_idx, last_idx]. set_range is exclusive + // so we have to also explicitly set the bit for last_idx. + // Passing last_idx+1 to the clear_range would work in most cases + // but could trip an OOB assertion. + + if ((last_idx - start_idx) > 0) { + task_card_bm.set_range(start_idx, last_idx); + } + task_card_bm.set_bit(last_idx); + + // Add to the task local marked bytes for this region. + marked_bytes_array[index] += mr.byte_size(); +} + +void ConcurrentMark::add_to_count_data_for(oop obj, HeapRegion* hr, int worker_i) { + MemRegion mr((HeapWord*)obj, obj->size()); + add_to_count_data_for(mr, hr, worker_i); +} + +void ConcurrentMark::add_to_count_data_for(MemRegion mr, int worker_i) { + HeapRegion* hr = _g1h->heap_region_containing(mr.start()); + add_to_count_data_for(mr, hr, worker_i); +} + +void ConcurrentMark::add_to_count_data_for(oop obj, int worker_i) { + MemRegion mr((HeapWord*)obj, obj->size()); + add_to_count_data_for(mr, worker_i); +} + +// Updates the counting data with liveness info recorded for a +// region (typically a GCLab). +void ConcurrentMark::add_to_count_data_for_region(MemRegion lab_mr, + BitMap* lab_card_bm, + intptr_t lab_bottom_card_num, + size_t lab_marked_bytes, + int worker_i) { + HeapRegion* hr = _g1h->heap_region_containing(lab_mr.start()); + + BitMap& task_card_bm = count_card_bitmap_for(worker_i); + size_t* marked_bytes_array = count_marked_bytes_for(worker_i); + + // Below, the term "card num" means the result of shifting an address + // by the card shift -- address 0 corresponds to card number 0. One + // must subtract the card num of the bottom of the heap to obtain a + // card table index. + + intptr_t heap_bottom_card_num = + intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> + CardTableModRefBS::card_shift); + + assert(intptr_t(uintptr_t(lab_mr.start()) >> CardTableModRefBS::card_shift) == lab_bottom_card_num, + "sanity"); + + // We have to map the indices of set bits in lab_card_bm, using + // lab_bottom_card_num, to indices the card bitmap for the given task. + + BitMap::idx_t end_idx = lab_card_bm->size(); + BitMap::idx_t start_idx = lab_card_bm->get_next_one_offset(0, end_idx); + while (start_idx < end_idx) { + assert(lab_card_bm->at(start_idx), "should be set"); + + intptr_t lab_card_num = lab_bottom_card_num + start_idx; + BitMap::idx_t card_bm_idx = lab_card_num - heap_bottom_card_num; + + task_card_bm.set_bit(card_bm_idx); + + // Get the offset of the next set bit + start_idx = lab_card_bm->get_next_one_offset(start_idx+1, end_idx); + } + + // Now add to the marked bytes + marked_bytes_array[hr->hrs_index()] += lab_marked_bytes; +} + +void ConcurrentMark::clear_count_data_for_heap_region(HeapRegion* hr) { + // Clears the count data for the given region from _all_ of + // the per-task counting data structures. + + MemRegion used_region = hr->used_region(); + HeapWord* start = used_region.start(); + HeapWord* last = used_region.last(); + size_t hr_index = hr->hrs_index(); + + intptr_t bottom_card_num = + intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> + CardTableModRefBS::card_shift); + + intptr_t start_card_num = + intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift); + intptr_t last_card_num = + intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift); + + BitMap::idx_t start_idx = start_card_num - bottom_card_num; + BitMap::idx_t last_idx = last_card_num - bottom_card_num; + + size_t used_region_bytes = used_region.byte_size(); + size_t marked_bytes = 0; + + for (int i=0; i < _max_task_num; i += 1) { + BitMap& task_card_bm = count_card_bitmap_for(i); + size_t* marked_bytes_array = count_marked_bytes_for(i); + + marked_bytes += marked_bytes_array[hr_index]; + // clear the amount of marked bytes in the task array for this + // region + marked_bytes_array[hr_index] = 0; + + // Clear the inclusive range [start_idx, last_idx] from the + // card bitmap. The clear_range routine is exclusive so we + // need to also explicitly clear the bit at last_idx. + // Passing last_idx+1 to the clear_range would work in + // most cases but could trip an OOB assertion. + + if ((last_idx - start_idx) > 0) { + task_card_bm.clear_range(start_idx, last_idx); + } + task_card_bm.clear_bit(last_idx); + } + // We could assert here that marked_bytes == used_region_bytes +} + void ConcurrentMark::print_stats() { if (verbose_stats()) { gclog_or_tty->print_cr("---------------------------------------------------------------------"); @@ -2947,6 +3573,9 @@ if (hr->in_collection_set()) { if (_g1h->is_obj_ill(obj)) { _bm->mark((HeapWord*)obj); + // Update the task specific count data for object + _cm->add_to_count_data_for(obj, hr, 0 /* worker_i */); + if (!push(obj)) { gclog_or_tty->print_cr("Setting abort in CSMarkOopClosure because push failed."); set_abort(); @@ -3028,14 +3657,18 @@ }; class ClearMarksInHRClosure: public HeapRegionClosure { + ConcurrentMark* _cm; CMBitMap* _bm; public: - ClearMarksInHRClosure(CMBitMap* bm): _bm(bm) { } + ClearMarksInHRClosure(ConcurrentMark* cm, CMBitMap* bm): + _cm(cm), _bm(bm) + { } bool doHeapRegion(HeapRegion* r) { if (!r->used_region().is_empty() && !r->evacuation_failed()) { - MemRegion usedMR = r->used_region(); _bm->clearRange(r->used_region()); + // Need to remove values from the count info + _cm->clear_count_data_for_heap_region(r); } return false; } @@ -3061,7 +3694,7 @@ double elapsed_time_ms = (end_time - start) * 1000.0; g1h->g1_policy()->record_mark_closure_time(elapsed_time_ms); - ClearMarksInHRClosure clr(nextMarkBitMap()); + ClearMarksInHRClosure clr(this, nextMarkBitMap()); g1h->collection_set_iterate(&clr); } @@ -3203,10 +3836,9 @@ (_init_times.sum() + _remark_times.sum() + _cleanup_times.sum())/1000.0); gclog_or_tty->print_cr(" Total concurrent time = %8.2f s " - "(%8.2f s marking, %8.2f s counting).", + "(%8.2f s marking).", cmThread()->vtime_accum(), - cmThread()->vtime_mark_accum(), - cmThread()->vtime_count_accum()); + cmThread()->vtime_mark_accum()); } void ConcurrentMark::print_worker_threads_on(outputStream* st) const { --- old/src/share/vm/gc_implementation/g1/concurrentMark.hpp 2011-10-03 15:48:21.013793770 -0700 +++ new/src/share/vm/gc_implementation/g1/concurrentMark.hpp 2011-10-03 15:48:20.810190638 -0700 @@ -428,7 +428,6 @@ WorkGangBarrierSync _first_overflow_barrier_sync; WorkGangBarrierSync _second_overflow_barrier_sync; - // this is set by any task, when an overflow on the global data // structures is detected. volatile bool _has_overflown; @@ -579,6 +578,23 @@ } } + // Live Data Counting data structures... + // These data structures are initialized at the start of + // marking. They are written to while marking is active. + // They are aggregated during remark; the aggregated values + // are then used to populate the _region_bm, _card_bm, and + // the total live bytes, which are then subsequently updated + // during cleanup. + + // An array of bitmaps (one bit map per task). Each bitmap + // is used to record the cards spanned by the live objects + // marked by that task/worker. + BitMap* _count_card_bitmaps; + + // Used to record the number of marked live bytes + // (for each region, by worker thread). + size_t** _count_marked_bytes; + public: // Manipulation of the global mark stack. // Notice that the first mark_stack_push is CAS-based, whereas the @@ -716,7 +732,8 @@ // grayed and it's MT-safe. Currently, we just mark it. But, in the // future, we can experiment with pushing it on the stack and we can // do this without changing G1CollectedHeap. - void grayRoot(oop p); + void grayRoot(oop p, int worker_i); + // It's used during evacuation pauses to gray a region, if // necessary, and it's MT-safe. It assumes that the caller has // marked any objects on that region. If _should_gray_objects is @@ -724,13 +741,14 @@ // pushed on the region stack, if it is located below the global // finger, otherwise we do nothing. void grayRegionIfNecessary(MemRegion mr); + // It's used during evacuation pauses to mark and, if necessary, // gray a single object and it's MT-safe. It assumes the caller did // not mark the object. If _should_gray_objects is true and we're // still doing concurrent marking, the objects is pushed on the // global stack, if it is located below the global finger, otherwise // we do nothing. - void markAndGrayObjectIfNecessary(oop p); + void markAndGrayObjectIfNecessary(oop p, int worker_i); // It iterates over the heap and for each object it comes across it // will dump the contents of its reference fields, as well as @@ -774,14 +792,16 @@ void checkpointRootsFinal(bool clear_all_soft_refs); void checkpointRootsFinalWork(); - void calcDesiredRegions(); void cleanup(); void completeCleanup(); // Mark in the previous bitmap. NB: this is usually read-only, so use // this carefully! void markPrev(oop p); + + // Clears the mark in the next bitmap for the given object. void clear(oop p); + // Clears marks for all objects in the given range, for both prev and // next bitmaps. NB: the previous bitmap is usually read-only, so use // this carefully! @@ -890,6 +910,70 @@ bool verbose_high() { return _MARKING_VERBOSE_ && _verbose_level >= high_verbose; } + + // Counting data structure accessors + + // Returns the card bitmap for a given task or worker id. + BitMap& count_card_bitmap_for(int worker_i) { + assert(0 <= worker_i && worker_i < _max_task_num, "oob"); + assert(_count_card_bitmaps != NULL, "uninitialized"); + BitMap& task_card_bm = _count_card_bitmaps[worker_i]; + assert(task_card_bm.size() == _card_bm.size(), "size mismatch"); + return task_card_bm; + } + + // Returns the array containing the marked bytes for each region, + // for the given worker or task id. + size_t* count_marked_bytes_for(int worker_i) { + assert(0 <= worker_i && worker_i < _max_task_num, "oob"); + assert(_count_marked_bytes != NULL, "uninitialized"); + size_t* marked_bytes_array = _count_marked_bytes[worker_i]; + assert(marked_bytes_array != NULL, "uninitialized"); + return marked_bytes_array; + } + + // Add the given region's size to the # marked bytes for + // the given worker and region. Set the bits in the + // worker's/task's card bitmap that are associated + // with the range of cards spanned by the region. + // Note the memory region could be just one object. + void add_to_count_data_for(MemRegion mr, int worker_i); + + // As above, but for a single object + void add_to_count_data_for(oop obj, int worker_i); + + // As above but when we have already been given the heap + // region. + void add_to_count_data_for(MemRegion mr, HeapRegion* hr, int worker_i); + + // As above, but for a single object + void add_to_count_data_for(oop obj, HeapRegion* hr, int worker_i); + + // Updates the counting data with liveness info recorded for a + // region (typically a GCLab). + void add_to_count_data_for_region(MemRegion lab_mr, + BitMap* lab_card_bm, + intptr_t lab_bottom_card_num, + size_t lab_marked_bytes, + int worker_i); + + // Clears the count data for the given region from _all_ of + // the per-task counting data structures. + void clear_count_data_for_heap_region(HeapRegion* hr); + +protected: + // Clear all the per-task bitmaps and arrays used to store the + // counting data. + void clear_all_count_data(); + + // Aggregates the counting data for each worker/task + // that was constructed while marking. Also sets + // the amount of marked bytes for each region and + // the top at concurrent mark count. + void aggregate_all_count_data(); + + // Verification routine + void verify_count_data(); }; // A class representing a marking task. --- old/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp 2011-10-03 15:48:22.121436352 -0700 +++ new/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp 2011-10-03 15:48:21.918989668 -0700 @@ -84,7 +84,7 @@ HeapWord* objAddr = (HeapWord*) obj; assert(obj->is_oop_or_null(true /* ignore mark word */), "Error"); - if (_g1h->is_in_g1_reserved(objAddr)) { + if (_g1h->is_in_g1_reserved(objAddr)) { assert(obj != NULL, "null check is implicit"); if (!_nextMarkBitMap->isMarked(objAddr)) { // Only get the containing region if the object is not marked on the @@ -99,6 +99,9 @@ // we need to mark it first if (_nextMarkBitMap->parMark(objAddr)) { + // Update the task specific count data with obj. + _cm->add_to_count_data_for(obj, hr, _task_id); + // No OrderAccess:store_load() is needed. It is implicit in the // CAS done in parMark(objAddr) above HeapWord* global_finger = _cm->finger(); --- old/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp 2011-10-03 15:48:23.198781459 -0700 +++ new/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp 2011-10-03 15:48:22.991541180 -0700 @@ -44,8 +44,7 @@ _started(false), _in_progress(false), _vtime_accum(0.0), - _vtime_mark_accum(0.0), - _vtime_count_accum(0.0) + _vtime_mark_accum(0.0) { create_and_start(); } @@ -147,40 +146,17 @@ } } } while (cm()->restart_for_overflow()); - double counting_start_time = os::elapsedVTime(); // YSR: These look dubious (i.e. redundant) !!! FIX ME slt()->manipulatePLL(SurrogateLockerThread::acquirePLL); slt()->manipulatePLL(SurrogateLockerThread::releaseAndNotifyPLL); - if (!cm()->has_aborted()) { - double count_start_sec = os::elapsedTime(); - if (PrintGC) { - gclog_or_tty->date_stamp(PrintGCDateStamps); - gclog_or_tty->stamp(PrintGCTimeStamps); - gclog_or_tty->print_cr("[GC concurrent-count-start]"); - } - - _sts.join(); - _cm->calcDesiredRegions(); - _sts.leave(); - - if (!cm()->has_aborted()) { - double count_end_sec = os::elapsedTime(); - if (PrintGC) { - gclog_or_tty->date_stamp(PrintGCDateStamps); - gclog_or_tty->stamp(PrintGCTimeStamps); - gclog_or_tty->print_cr("[GC concurrent-count-end, %1.7lf]", - count_end_sec - count_start_sec); - } - } - } double end_time = os::elapsedVTime(); - _vtime_count_accum += (end_time - counting_start_time); // Update the total virtual time before doing this, since it will try // to measure it to get the vtime for this marking. We purposely // neglect the presumably-short "completeCleanup" phase here. _vtime_accum = (end_time - _vtime_start); + if (!cm()->has_aborted()) { if (g1_policy->adaptive_young_list_length()) { double now = os::elapsedTime(); --- old/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp 2011-10-03 15:48:24.277644046 -0700 +++ new/src/share/vm/gc_implementation/g1/concurrentMarkThread.hpp 2011-10-03 15:48:24.074381633 -0700 @@ -40,7 +40,6 @@ double _vtime_accum; // Accumulated virtual time. double _vtime_mark_accum; - double _vtime_count_accum; public: virtual void run(); @@ -69,8 +68,6 @@ double vtime_accum(); // Marking virtual time so far double vtime_mark_accum(); - // Counting virtual time so far. - double vtime_count_accum() { return _vtime_count_accum; } ConcurrentMark* cm() { return _cm; } --- old/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp 2011-10-03 15:48:25.353517785 -0700 +++ new/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp 2011-10-03 15:48:25.142070296 -0700 @@ -3883,7 +3883,7 @@ assert(_cm->isPrevMarked(obj), "Should be marked!"); _prev_marked_bytes += (obj_size * HeapWordSize); if (_g1->mark_in_progress() && !_g1->is_obj_ill(obj)) { - _cm->markAndGrayObjectIfNecessary(obj); + _cm->markAndGrayObjectIfNecessary(obj, 0 /* worker_i */); } obj->set_mark(markOopDesc::prototype()); // While we were processing RSet buffers during the @@ -4005,7 +4005,8 @@ oop G1CollectedHeap::handle_evacuation_failure_par(OopsInHeapRegionClosure* cl, oop old, - bool should_mark_root) { + bool should_mark_root, + int worker_i) { assert(obj_in_cs(old), err_msg("obj: "PTR_FORMAT" should still be in the CSet", (HeapWord*) old)); @@ -4020,7 +4021,7 @@ // object is also responsible for marking the object. if (should_mark_root) { assert(!oopDesc::is_null(old), "shouldn't be"); - _cm->grayRoot(old); + _cm->grayRoot(old, worker_i); } if (_evac_failure_closure != cl) { @@ -4126,10 +4127,11 @@ } #endif // PRODUCT -G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) : +G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size, int worker_i) : ParGCAllocBuffer(gclab_word_size), _should_mark_objects(false), _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size), + _worker_i(worker_i), _retired(false) { //_should_mark_objects is set to true when G1ParCopyHelper needs to @@ -4151,8 +4153,8 @@ _g1_rem(g1h->g1_rem_set()), _hash_seed(17), _queue_num(queue_num), _term_attempts(0), - _surviving_alloc_buffer(g1h->desired_plab_sz(GCAllocForSurvived)), - _tenured_alloc_buffer(g1h->desired_plab_sz(GCAllocForTenured)), + _surviving_alloc_buffer(g1h->desired_plab_sz(GCAllocForSurvived), queue_num), + _tenured_alloc_buffer(g1h->desired_plab_sz(GCAllocForTenured), queue_num), _age_table(false), _strong_roots_time(0), _term_time(0), _alloc_buffer_waste(0), _undo_waste(0) @@ -4264,6 +4266,7 @@ G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()), _par_scan_state(par_scan_state), + _worker_i(par_scan_state->queue_num()), _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()), _mark_in_progress(_g1->mark_in_progress()) { } @@ -4277,7 +4280,7 @@ oop obj = oopDesc::decode_heap_oop(heap_oop); HeapWord* addr = (HeapWord*)obj; if (_g1->is_in_g1_reserved(addr)) { - _cm->grayRoot(oop(addr)); + _cm->grayRoot(oop(addr), _worker_i); } } } @@ -4303,7 +4306,7 @@ // This will either forward-to-self, or detect that someone else has // installed a forwarding pointer. OopsInHeapRegionClosure* cl = _par_scan_state->evac_failure_closure(); - return _g1->handle_evacuation_failure_par(cl, old, should_mark_root); + return _g1->handle_evacuation_failure_par(cl, old, should_mark_root, _worker_i); } // We're going to allocate linearly, so might as well prefetch ahead. @@ -4342,17 +4345,33 @@ // Mark the evacuated object or propagate "next" mark bit if (should_mark_copy) { if (!use_local_bitmaps || - !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) { + !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr, word_sz)) { // if we couldn't mark it on the local bitmap (this happens when // the object was not allocated in the GCLab), we have to bite // the bullet and do the standard parallel mark - _cm->markAndGrayObjectIfNecessary(obj); + _cm->markAndGrayObjectIfNecessary(obj, _worker_i); } if (_g1->isMarkedNext(old)) { // Unmark the object's old location so that marking // doesn't think the old object is alive. _cm->nextMarkBitMap()->parClear((HeapWord*)old); + + // We could clear the count data for the old object here but + // currently we do not. Why don't we do this? The thread/task + // that marks a newly copied object is likely _not_ the thread/task + // that originally marked the old object. So, to clear the count + // data for the old object, we would have to scan the count + // data for all of the tasks (and clear the data for the old object + // in parallel with other threads adding to the count data). Even + // then we could clear a bit incorrectly (e.g. if the old object + // does not start or end on a card boundary). It's more important + // that we don't have missed bits that should've been set than + // having extra bits set. + // + // As a result the accumulated count data could be a superset + // of the data that is/would have been calculated by walking + // the marking bitmap. } } @@ -4439,7 +4458,8 @@ } // When scanning the RS, we only care about objs in CS. if (barrier == G1BarrierRS) { - _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num()); + assert(_worker_i == _par_scan_state->queue_num(), "sanity"); + _par_scan_state->update_rs(_from, p, _worker_i); } } else { // The object is not in collection set. If we're a root scanning @@ -4451,7 +4471,8 @@ } if (barrier == G1BarrierEvac && obj != NULL) { - _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num()); + assert(_worker_i == _par_scan_state->queue_num(), "sanity"); + _par_scan_state->update_rs(_from, p, _worker_i); } if (do_gen_barrier && obj != NULL) { --- old/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp 2011-10-03 15:48:26.588863645 -0700 +++ new/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp 2011-10-03 15:48:26.381453165 -0700 @@ -848,7 +848,8 @@ // An attempt to evacuate "obj" has failed; take necessary steps. oop handle_evacuation_failure_par(OopsInHeapRegionClosure* cl, oop obj, - bool should_mark_root); + bool should_mark_root, + int worker_i); void handle_evacuation_failure_common(oop obj, markOop m); // ("Weak") Reference processing support. @@ -1664,6 +1665,18 @@ // size of a GCLab in words size_t _gclab_word_size; + // The size of the marked objects in this bitmap + size_t _marked_bytes; + + // Card bitmap associated with cards spanned by the live objects. + // It's sized for _gclab_word_size _plus_ 1 additional card + // in case the lab does not start at a card boundary + BitMap _card_bm; + + // Card number of the base of the LAB. Used to encode + // indices into the bitmap above. + intptr_t _bottom_card_num; + static int shifter() { return MinObjAlignment - 1; } @@ -1693,6 +1706,7 @@ // enough bits in the bitmap. return bits_in_bitmap + BitsPerWord - 1; } + public: GCLabBitMap(HeapWord* heap_start, size_t gclab_word_size) : BitMap(bitmap_size_in_bits(gclab_word_size)), @@ -1703,7 +1717,12 @@ _gclab_word_size(gclab_word_size), _real_start_word(NULL), _real_end_word(NULL), - _start_word(NULL) + _start_word(NULL), + _marked_bytes(0), + _card_bm((((gclab_word_size * BytesPerWord) + CardTableModRefBS::card_size - 1) >> + CardTableModRefBS::card_shift) + 1, + false /* in_resource_area*/), + _bottom_card_num(0) { guarantee( size_in_words() >= bitmap_size_in_words(), "just making sure"); @@ -1736,7 +1755,7 @@ return ret2; } - inline bool mark(HeapWord* addr) { + inline bool mark(HeapWord* addr, size_t obj_size) { guarantee(use_local_bitmaps, "invariant"); assert(fields_well_formed(), "invariant"); @@ -1746,6 +1765,25 @@ // first mark it on the bitmap at_put(heapWordToOffset(addr), true); + MemRegion mr(addr, obj_size); + + // Add to the marked_bytes total + _marked_bytes += mr.byte_size(); + + // Set the bits associated with the cards spanned by the + // object + intptr_t start_card_num = + intptr_t(uintptr_t(mr.start()) >> CardTableModRefBS::card_shift); + intptr_t last_card_num = + intptr_t(uintptr_t(mr.last()) >> CardTableModRefBS::card_shift); + + BitMap::idx_t start_idx = start_card_num - _bottom_card_num; + BitMap::idx_t last_idx = last_card_num - _bottom_card_num; + + _card_bm.set_range(start_idx, last_idx); + // set_range is exclusive so set the bit for last_idx + _card_bm.set_bit(last_idx); + return true; } else { return false; @@ -1771,6 +1809,13 @@ pointer_delta(start, _heap_start) % _bitmap_word_covers_words; _start_word = start - diff; + // Initialize _bottom_card_num for the card bitmap + _bottom_card_num = + intptr_t(uintptr_t(_real_start_word) >> CardTableModRefBS::card_shift); + + _marked_bytes = 0; + _card_bm.clear(); + assert(fields_well_formed(), "invariant"); } @@ -1782,7 +1827,7 @@ } #endif // PRODUCT - void retire() { + void retire(int worker_i) { guarantee(use_local_bitmaps, "invariant"); assert(fields_well_formed(), "invariant"); @@ -1796,7 +1841,20 @@ 0, // always start from the start of the bitmap _start_word, gclab_real_word_size()); - _cm->grayRegionIfNecessary(MemRegion(_real_start_word, _real_end_word)); + + // Note that not all objects copied into the LAB will have a bit set + // in the LAB bitmap (the LAB bitmap is used to propagate marks). + // So we can't just add the entire lab and its bitmap to the count + // data. The marking information has been recorded in _marked_bytes + // and _card_bm. + MemRegion lab_region(_real_start_word, _real_end_word); + _cm->add_to_count_data_for_region(lab_region, + &_card_bm, + _bottom_card_num, + _marked_bytes, + worker_i); + + _cm->grayRegionIfNecessary(lab_region); #ifndef PRODUCT if (use_local_bitmaps && verify_local_bitmaps) @@ -1818,14 +1876,15 @@ bool _retired; bool _should_mark_objects; GCLabBitMap _bitmap; + int _worker_i; public: - G1ParGCAllocBuffer(size_t gclab_word_size); + G1ParGCAllocBuffer(size_t gclab_word_size, int worker_i); - inline bool mark(HeapWord* addr) { + inline bool mark(HeapWord* addr, size_t obj_size) { guarantee(use_local_bitmaps, "invariant"); assert(_should_mark_objects, "invariant"); - return _bitmap.mark(addr); + return _bitmap.mark(addr, obj_size); } inline void set_buf(HeapWord* buf) { @@ -1840,7 +1899,7 @@ if (_retired) return; if (use_local_bitmaps && _should_mark_objects) { - _bitmap.retire(); + _bitmap.retire(_worker_i); } ParGCAllocBuffer::retire(end_of_gc, retain); _retired = true; --- old/src/share/vm/gc_implementation/g1/g1OopClosures.hpp 2011-10-03 15:48:27.718035556 -0700 +++ new/src/share/vm/gc_implementation/g1/g1OopClosures.hpp 2011-10-03 15:48:27.515866246 -0700 @@ -51,6 +51,7 @@ G1RemSet* _g1_rem; ConcurrentMark* _cm; G1ParScanThreadState* _par_scan_state; + int _worker_i; bool _during_initial_mark; bool _mark_in_progress; public: