src/share/vm/gc_implementation/g1/concurrentMark.cpp

Print this page
rev 2724 : 6484965: G1: piggy-back liveness accounting phase on marking
Summary: Remove the separate counting phase of concurrent marking by tracking the amount of marked bytes and the cards spanned by marked objects in marking task/worker thread local data structures, which are updated as individual objects are marked.
Reviewed-by:

@@ -471,10 +471,11 @@
   _cleanup_list("Cleanup List"),
   _region_bm(max_regions, false /* in_resource_area*/),
   _card_bm((rs.size() + CardTableModRefBS::card_size - 1) >>
            CardTableModRefBS::card_shift,
            false /* in_resource_area*/),
+
   _prevMarkBitMap(&_markBitMap1),
   _nextMarkBitMap(&_markBitMap2),
   _at_least_one_mark_complete(false),
 
   _markStack(this),

@@ -500,11 +501,15 @@
   _remark_times(), _remark_mark_times(), _remark_weak_ref_times(),
   _cleanup_times(),
   _total_counting_time(0.0),
   _total_rs_scrub_time(0.0),
 
-  _parallel_workers(NULL) {
+  _parallel_workers(NULL),
+
+  _count_card_bitmaps(NULL),
+  _count_marked_bytes(NULL)
+{
   CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel;
   if (verbose_level < no_verbose) {
     verbose_level = no_verbose;
   }
   if (verbose_level > high_verbose) {

@@ -534,19 +539,27 @@
   satb_qs.set_buffer_size(G1SATBBufferSize);
 
   _tasks = NEW_C_HEAP_ARRAY(CMTask*, _max_task_num);
   _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_task_num);
 
+  _count_card_bitmaps = NEW_C_HEAP_ARRAY(BitMap, _max_task_num);
+  _count_marked_bytes = NEW_C_HEAP_ARRAY(size_t*, _max_task_num);
+
+  BitMap::idx_t card_bm_size = _card_bm.size();
+
   // so that the assertion in MarkingTaskQueue::task_queue doesn't fail
   _active_tasks = _max_task_num;
   for (int i = 0; i < (int) _max_task_num; ++i) {
     CMTaskQueue* task_queue = new CMTaskQueue();
     task_queue->initialize();
     _task_queues->register_queue(i, task_queue);
 
     _tasks[i] = new CMTask(i, this, task_queue, _task_queues);
     _accum_task_vtime[i] = 0.0;
+
+    _count_card_bitmaps[i] = BitMap(card_bm_size, false);
+    _count_marked_bytes[i] = NEW_C_HEAP_ARRAY(size_t, max_regions);
   }
 
   if (ConcGCThreads > ParallelGCThreads) {
     vm_exit_during_initialization("Can't have more ConcGCThreads "
                                   "than ParallelGCThreads.");

@@ -664,10 +677,12 @@
   assert(_heap_start < _heap_end, "heap bounds should look ok");
 
   // reset all the marking data structures and any necessary flags
   clear_marking_state();
 
+  clear_all_count_data();
+
   if (verbose_low()) {
     gclog_or_tty->print_cr("[global] resetting");
   }
 
   // We do reset all of them, since different phases will use

@@ -719,13 +734,21 @@
 
 ConcurrentMark::~ConcurrentMark() {
   for (int i = 0; i < (int) _max_task_num; ++i) {
     delete _task_queues->queue(i);
     delete _tasks[i];
+
+    _count_card_bitmaps[i].resize(0, false);
+    FREE_C_HEAP_ARRAY(size_t, _count_marked_bytes[i]);
   }
+
   delete _task_queues;
-  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
+  FREE_C_HEAP_ARRAY(CMTask*, _tasks);
+  FREE_C_HEAP_ARRAY(double, _accum_task_vtime);
+
+  FREE_C_HEAP_ARRAY(BitMap*, _count_card_bitmaps);
+  FREE_C_HEAP_ARRAY(size_t*, _count_marked_bytes);
 }
 
 // This closure is used to mark refs into the g1 generation
 // from external roots in the CMS bit map.
 // Called at the first checkpoint.

@@ -940,22 +963,25 @@
     return false;
   }
 }
 #endif // !PRODUCT
 
-void ConcurrentMark::grayRoot(oop p) {
+void ConcurrentMark::grayRoot(oop p, int worker_i) {
   HeapWord* addr = (HeapWord*) p;
   // We can't really check against _heap_start and _heap_end, since it
   // is possible during an evacuation pause with piggy-backed
   // initial-mark that the committed space is expanded during the
   // pause without CM observing this change. So the assertions below
   // is a bit conservative; but better than nothing.
   assert(_g1h->g1_committed().contains(addr),
          "address should be within the heap bounds");
 
   if (!_nextMarkBitMap->isMarked(addr)) {
-    _nextMarkBitMap->parMark(addr);
+    if (_nextMarkBitMap->parMark(addr)) {
+      // Update the task specific count data for object p.
+      add_to_count_data_for(p, worker_i);
+    }
   }
 }
 
 void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
   // The objects on the region have already been marked "in bulk" by

@@ -1000,19 +1026,22 @@
       }
     }
   }
 }
 
-void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) {
+void ConcurrentMark::markAndGrayObjectIfNecessary(oop p, int worker_i) {
   // The object is not marked by the caller. We need to at least mark
   // it and maybe push in on the stack.
 
   HeapWord* addr = (HeapWord*)p;
   if (!_nextMarkBitMap->isMarked(addr)) {
     // We definitely need to mark it, irrespective whether we bail out
     // because we're done with marking.
     if (_nextMarkBitMap->parMark(addr)) {
+      // Update the task specific count data for object p
+      add_to_count_data_for(p, worker_i);
+      
       if (!concurrent_marking_in_progress() || !_should_gray_objects) {
         // If we're done with concurrent marking and we're waiting for
         // remark, then we're not pushing anything on the stack.
         return;
       }

@@ -1171,10 +1200,14 @@
     clear_has_overflown();
     if (G1TraceMarkStackOverflow) {
       gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
     }
   } else {
+    // Aggregate the per-task counting data that we have accumulated
+    // while marking.
+    aggregate_all_count_data();
+
     SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
     // We're done with marking.
     // This is the end of  the marking cycle, we're expected all
     // threads to have SATB queues with active set to true.
     satb_mq_set.set_active_all_threads(false, /* new active value */

@@ -1210,45 +1243,46 @@
   g1p->record_concurrent_mark_remark_end();
 }
 
 #define CARD_BM_TEST_MODE 0
 
+// Used to calculate the # live objects per region
+// for verification purposes
 class CalcLiveObjectsClosure: public HeapRegionClosure {
 
   CMBitMapRO* _bm;
   ConcurrentMark* _cm;
-  bool _changed;
-  bool _yield;
-  size_t _words_done;
+  BitMap* _region_bm;
+  BitMap* _card_bm;
+
+  size_t _tot_words_done;
   size_t _tot_live;
   size_t _tot_used;
-  size_t _regions_done;
-  double _start_vtime_sec;
 
-  BitMap* _region_bm;
-  BitMap* _card_bm;
+  size_t _region_marked_bytes;
+
   intptr_t _bottom_card_num;
-  bool _final;
 
   void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
-    for (intptr_t i = start_card_num; i <= last_card_num; i++) {
+    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
+    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
+    
+    for (BitMap::idx_t i = start_idx; i <= last_idx; i += 1) {
 #if CARD_BM_TEST_MODE
-      guarantee(_card_bm->at(i - _bottom_card_num), "Should already be set.");
+      guarantee(_card_bm->at(i), "Should already be set.");
 #else
-      _card_bm->par_at_put(i - _bottom_card_num, 1);
+      _card_bm->par_at_put(i, 1);
 #endif
     }
   }
 
 public:
-  CalcLiveObjectsClosure(bool final,
-                         CMBitMapRO *bm, ConcurrentMark *cm,
+  CalcLiveObjectsClosure(CMBitMapRO *bm, ConcurrentMark *cm,
                          BitMap* region_bm, BitMap* card_bm) :
-    _bm(bm), _cm(cm), _changed(false), _yield(true),
-    _words_done(0), _tot_live(0), _tot_used(0),
-    _region_bm(region_bm), _card_bm(card_bm),_final(final),
-    _regions_done(0), _start_vtime_sec(0.0)
+    _bm(bm), _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
+    _region_marked_bytes(0), _tot_words_done(0),
+    _tot_live(0), _tot_used(0)
   {
     _bottom_card_num =
       intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
                CardTableModRefBS::card_shift);
   }

@@ -1285,13 +1319,10 @@
                                    (BitMap::idx_t) end_index, true);
     }
   }
 
   bool doHeapRegion(HeapRegion* hr) {
-    if (!_final && _regions_done == 0) {
-      _start_vtime_sec = os::elapsedVTime();
-    }
 
     if (hr->continuesHumongous()) {
       // We will ignore these here and process them when their
       // associated "starts humongous" region is processed (see
       // set_bit_for_heap_region()). Note that we cannot rely on their

@@ -1301,52 +1332,44 @@
       // before its associated "starts humongous".
       return false;
     }
 
     HeapWord* nextTop = hr->next_top_at_mark_start();
-    HeapWord* start   = hr->top_at_conc_mark_count();
-    assert(hr->bottom() <= start && start <= hr->end() &&
-           hr->bottom() <= nextTop && nextTop <= hr->end() &&
-           start <= nextTop,
+    HeapWord* start   = hr->bottom();
+
+    assert(start <= hr->end() && start <= nextTop && nextTop <= hr->end(),
            "Preconditions.");
-    // Otherwise, record the number of word's we'll examine.
+
+    // Record the number of word's we'll examine.
     size_t words_done = (nextTop - start);
+
     // Find the first marked object at or after "start".
     start = _bm->getNextMarkedWordAddress(start, nextTop);
+
     size_t marked_bytes = 0;
+    _region_marked_bytes = 0;
 
     // Below, the term "card num" means the result of shifting an address
     // by the card shift -- address 0 corresponds to card number 0.  One
     // must subtract the card num of the bottom of the heap to obtain a
     // card table index.
+
     // The first card num of the sequence of live cards currently being
     // constructed.  -1 ==> no sequence.
     intptr_t start_card_num = -1;
+
     // The last card num of the sequence of live cards currently being
     // constructed.  -1 ==> no sequence.
     intptr_t last_card_num = -1;
 
     while (start < nextTop) {
-      if (_yield && _cm->do_yield_check()) {
-        // We yielded.  It might be for a full collection, in which case
-        // all bets are off; terminate the traversal.
-        if (_cm->has_aborted()) {
-          _changed = false;
-          return true;
-        } else {
-          // Otherwise, it might be a collection pause, and the region
-          // we're looking at might be in the collection set.  We'll
-          // abandon this region.
-          return false;
-        }
-      }
       oop obj = oop(start);
       int obj_sz = obj->size();
+
       // The card num of the start of the current object.
       intptr_t obj_card_num =
         intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
-
       HeapWord* obj_last = start + obj_sz - 1;
       intptr_t obj_last_card_num =
         intptr_t(uintptr_t(obj_last) >> CardTableModRefBS::card_shift);
 
       if (obj_card_num != last_card_num) {

@@ -1375,140 +1398,543 @@
       }
       // In any case, we set the last card num.
       last_card_num = obj_last_card_num;
 
       marked_bytes += (size_t)obj_sz * HeapWordSize;
+
       // Find the next marked object after this one.
       start = _bm->getNextMarkedWordAddress(start + 1, nextTop);
-      _changed = true;
     }
+
     // Handle the last range, if any.
     if (start_card_num != -1) {
       mark_card_num_range(start_card_num, last_card_num);
     }
-    if (_final) {
+
       // Mark the allocated-since-marking portion...
-      HeapWord* tp = hr->top();
-      if (nextTop < tp) {
-        start_card_num =
-          intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
-        last_card_num =
-          intptr_t(uintptr_t(tp) >> CardTableModRefBS::card_shift);
+    HeapWord* top = hr->top();
+    if (nextTop < top) {
+      start_card_num = intptr_t(uintptr_t(nextTop) >> CardTableModRefBS::card_shift);
+      last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
+
         mark_card_num_range(start_card_num, last_card_num);
+      
         // This definitely means the region has live objects.
         set_bit_for_region(hr);
       }
-    }
 
-    hr->add_to_marked_bytes(marked_bytes);
     // Update the live region bitmap.
     if (marked_bytes > 0) {
       set_bit_for_region(hr);
     }
-    hr->set_top_at_conc_mark_count(nextTop);
+
+    // Set the marked bytes for the current region so that
+    // it can be queried by a calling verificiation routine
+    _region_marked_bytes = marked_bytes;
+
     _tot_live += hr->next_live_bytes();
     _tot_used += hr->used();
-    _words_done = words_done;
+    _tot_words_done = words_done;
 
-    if (!_final) {
-      ++_regions_done;
-      if (_regions_done % 10 == 0) {
-        double end_vtime_sec = os::elapsedVTime();
-        double elapsed_vtime_sec = end_vtime_sec - _start_vtime_sec;
-        if (elapsed_vtime_sec > (10.0 / 1000.0)) {
-          jlong sleep_time_ms =
-            (jlong) (elapsed_vtime_sec * _cm->cleanup_sleep_factor() * 1000.0);
-          os::sleep(Thread::current(), sleep_time_ms, false);
-          _start_vtime_sec = end_vtime_sec;
+    return false;
         }
+
+  size_t region_marked_bytes() const { return _region_marked_bytes; }
+  size_t tot_words_done() const      { return _tot_words_done; }
+  size_t tot_live() const            { return _tot_live; }
+  size_t tot_used() const            { return _tot_used; }
+};
+
+// Aggregate the counting data that was constructed concurrently
+// with marking.
+class AddToMarkedBytesClosure: public HeapRegionClosure {
+  ConcurrentMark* _cm;
+  size_t _task_num;
+  size_t _max_task_num;
+
+  bool _final;
+
+public:
+  AddToMarkedBytesClosure(ConcurrentMark *cm,
+                          size_t task_num,
+                          size_t max_task_num) :
+    _cm(cm),
+    _task_num(task_num),
+    _max_task_num(max_task_num),
+    _final(false)
+  {
+    assert(0 <= _task_num && _task_num < _max_task_num, "sanity");
+    if ((_max_task_num - _task_num) == 1) {
+      // Last task
+      _final = true;
       }
     }
 
+  bool doHeapRegion(HeapRegion* hr) {
+    // Adds the value in the counted marked bytes array for
+    // _task_num for region hr to the value cached in heap
+    // region itself.
+    // For the final task we also set the top at conc count
+    // for the region.
+    // The bits in the live region bitmap are set for regions
+    // that contain live data during the cleanup pause.
+
+    if (hr->continuesHumongous()) {
+      // We will ignore these here and process them when their
+      // associated "starts humongous" region is processed.
+      // Note that we cannot rely on their associated
+      // "starts humongous" region to have their bit set to 1
+      // since, due to the region chunking in the parallel region
+      // iteration, a "continues humongous" region might be visited
+      // before its associated "starts humongous".
     return false;
   }
 
-  bool changed() { return _changed;  }
-  void reset()   { _changed = false; _words_done = 0; }
-  void no_yield() { _yield = false; }
-  size_t words_done() { return _words_done; }
-  size_t tot_live() { return _tot_live; }
-  size_t tot_used() { return _tot_used; }
-};
+    int hrs_index = hr->hrs_index();
+    size_t* marked_bytes_array = _cm->count_marked_bytes_for(_task_num);
+    size_t marked_bytes = marked_bytes_array[hrs_index];
+    hr->add_to_marked_bytes(marked_bytes);
 
+    if (_final) {
+      HeapWord* ntams = hr->next_top_at_mark_start();
+      HeapWord* start = hr->bottom();
 
-void ConcurrentMark::calcDesiredRegions() {
-  _region_bm.clear();
+      assert(start <= ntams && ntams <= hr->top() && hr->top() <= hr->end(),
+             "Preconditions.");
+
+      hr->set_top_at_conc_mark_count(ntams);
+    }
+
+    return false;
+  }
+};
+
+void ConcurrentMark::aggregate_all_count_data() {
   _card_bm.clear();
-  CalcLiveObjectsClosure calccl(false /*final*/,
-                                nextMarkBitMap(), this,
-                                &_region_bm, &_card_bm);
-  G1CollectedHeap *g1h = G1CollectedHeap::heap();
-  g1h->heap_region_iterate(&calccl);
 
-  do {
-    calccl.reset();
-    g1h->heap_region_iterate(&calccl);
-  } while (calccl.changed());
+  // Unions the per task card bitmaps into the global card bitmap,
+  // and aggregates the per task marked bytes for each region into
+  // the heap region itself.
+
+  for (int i = 0; i < _max_task_num; i += 1) {
+    BitMap& task_card_bm = count_card_bitmap_for(i);
+    _card_bm.set_union(task_card_bm);
+
+    // Update the marked bytes for each region
+    AddToMarkedBytesClosure cl(this, i, _max_task_num);
+    _g1h->heap_region_iterate(&cl);
+  }
+
+  // We're done with the accumulated per-task concurrent
+  // counting data so let's clear it for the next marking.
+  clear_all_count_data();
 }
 
+// Final update of count data (during cleanup).
+// Adds [top_at_count, NTAMS) to the marked bytes for each
+// region. Sets the bits in the card bitmap corresponding
+// to the interval [top_at_count, top], and sets the
+// liveness bit for each region containing live data
+// in the region bitmap.
+
+class FinalCountDataUpdateClosure: public HeapRegionClosure {
+  ConcurrentMark* _cm;
+  BitMap* _region_bm;
+  BitMap* _card_bm;
+  intptr_t _bottom_card_num;
+
+  size_t _total_live_bytes;
+  size_t _total_used_bytes;
+  size_t _total_words_done;
+
+  void mark_card_num_range(intptr_t start_card_num, intptr_t last_card_num) {
+    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
+    BitMap::idx_t last_idx = last_card_num - _bottom_card_num;
+    
+    // Inclusive bit range [start_idx, last_idx]. par_at_put_range
+    // is exclusive so we have to also set the bit for last_idx.
+    // Passing last_idx+1 to the clear_range would work in
+    // most cases but could trip an OOB assertion.
+
+    if ((last_idx - start_idx) > 0) {
+      _card_bm->par_at_put_range(start_idx, last_idx, true);
+    }
+    _card_bm->par_set_bit(last_idx);
+  }
+
+  // It takes a region that's not empty (i.e., it has at least one
+  // live object in it and sets its corresponding bit on the region
+  // bitmap to 1. If the region is "starts humongous" it will also set
+  // to 1 the bits on the region bitmap that correspond to its
+  // associated "continues humongous" regions.
+  void set_bit_for_region(HeapRegion* hr) {
+    assert(!hr->continuesHumongous(), "should have filtered those out");
+
+    size_t index = hr->hrs_index();
+    if (!hr->startsHumongous()) {
+      // Normal (non-humongous) case: just set the bit.
+      _region_bm->par_set_bit((BitMap::idx_t) index);
+    } else {
+      // Starts humongous case: calculate how many regions are part of
+      // this humongous region and then set the bit range. It might
+      // have been a bit more efficient to look at the object that
+      // spans these humongous regions to calculate their number from
+      // the object's size. However, it's a good idea to calculate
+      // this based on the metadata itself, and not the region
+      // contents, so that this code is not aware of what goes into
+      // the humongous regions (in case this changes in the future).
+      G1CollectedHeap* g1h = G1CollectedHeap::heap();
+      size_t end_index = index + 1;
+      while (end_index < g1h->n_regions()) {
+        HeapRegion* chr = g1h->region_at(end_index);
+        if (!chr->continuesHumongous()) break;
+        end_index += 1;
+      }
+      _region_bm->par_at_put_range((BitMap::idx_t) index,
+                                   (BitMap::idx_t) end_index, true);
+    }
+  }
+
+ public:
+  FinalCountDataUpdateClosure(ConcurrentMark* cm,
+                              BitMap* region_bm,
+                              BitMap* card_bm) :
+    _cm(cm), _region_bm(region_bm), _card_bm(card_bm),
+    _total_words_done(0), _total_live_bytes(0), _total_used_bytes(0)
+  {
+    _bottom_card_num =
+      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
+               CardTableModRefBS::card_shift);
+  }
+
+  bool doHeapRegion(HeapRegion* hr) {
+
+    if (hr->continuesHumongous()) {
+      // We will ignore these here and process them when their
+      // associated "starts humongous" region is processed (see
+      // set_bit_for_heap_region()). Note that we cannot rely on their
+      // associated "starts humongous" region to have their bit set to
+      // 1 since, due to the region chunking in the parallel region
+      // iteration, a "continues humongous" region might be visited
+      // before its associated "starts humongous".
+      return false;
+    }
+
+    HeapWord* start = hr->top_at_conc_mark_count();
+    HeapWord* ntams = hr->next_top_at_mark_start();
+    HeapWord* top   = hr->top();
+    
+    assert(hr->bottom() <= start && start <= hr->end() &&
+           hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions.");
+    
+    size_t words_done = ntams - hr->bottom();
+
+    intptr_t start_card_num = intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
+    intptr_t last_card_num = intptr_t(uintptr_t(top) >> CardTableModRefBS::card_shift);
+
+
+    if (start < ntams) {
+      // Region was changed between remark and cleanup pauses
+      // We need to add (ntams - start) to the marked bytes
+      // for this region, and set bits for the range
+      // [ card_num(start), card_num(ntams) ) in the
+      // card bitmap.
+      size_t live_bytes = (ntams - start) * HeapWordSize;
+      hr->add_to_marked_bytes(live_bytes);
+      
+      // Record the new top at conc count
+      hr->set_top_at_conc_mark_count(ntams);
+
+      // The setting of the bits card bitmap takes place below
+    }
+
+    // Mark the allocated-since-marking portion...
+    if (ntams < top) {
+      // This definitely means the region has live objects.
+      set_bit_for_region(hr);
+    }
+
+    // Now set the bits for [start, top]
+    mark_card_num_range(start_card_num, last_card_num);
+
+    // Set the bit for the region if it contains live data
+    if (hr->next_marked_bytes() > 0) {
+      set_bit_for_region(hr);
+    }
+
+    _total_words_done += words_done;
+    _total_used_bytes += hr->used();
+    _total_live_bytes += hr->next_marked_bytes();
+
+    return false;
+  }
+
+  size_t total_words_done() const { return _total_words_done; }
+  size_t total_live_bytes() const { return _total_live_bytes; }
+  size_t total_used_bytes() const { return _total_used_bytes; }
+};
+
+// Heap region closure used for verifying the counting data
+// that was accumulated concurrently and aggregated during
+// the remark pause. This closure is applied to the heap
+// regions during the STW cleanup pause.
+
+class VerifyLiveObjectDataHRClosure: public HeapRegionClosure {
+  ConcurrentMark* _cm;
+  CalcLiveObjectsClosure _calc_cl;
+  BitMap* _region_bm;   // Region BM to be verified
+  BitMap* _card_bm;     // Card BM to be verified
+  bool _verbose;        // verbose output?
+
+  BitMap* _exp_region_bm; // Expected Region BM values
+  BitMap* _exp_card_bm;   // Expected card BM values
+
+  intptr_t _bottom_card_num; // Used for calculatint bitmap indices
+
+  int _failures;
+
+public:
+  VerifyLiveObjectDataHRClosure(ConcurrentMark* cm,
+                                BitMap* region_bm,
+                                BitMap* card_bm,
+                                BitMap* exp_region_bm,
+                                BitMap* exp_card_bm,
+                                bool verbose) :
+    _cm(cm),
+    _calc_cl(_cm->nextMarkBitMap(), _cm, exp_region_bm, exp_card_bm),
+    _region_bm(region_bm), _card_bm(card_bm), _verbose(verbose),
+    _exp_region_bm(exp_region_bm), _exp_card_bm(exp_card_bm),
+    _failures(0)
+  { 
+    _bottom_card_num =
+      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
+               CardTableModRefBS::card_shift);
+  }
+
+  int failures() const { return _failures; }
+
+  bool doHeapRegion(HeapRegion* hr) {
+    if (hr->continuesHumongous()) {
+      // We will ignore these here and process them when their
+      // associated "starts humongous" region is processed (see
+      // set_bit_for_heap_region()). Note that we cannot rely on their
+      // associated "starts humongous" region to have their bit set to
+      // 1 since, due to the region chunking in the parallel region
+      // iteration, a "continues humongous" region might be visited
+      // before its associated "starts humongous".
+      return false;
+    }
+
+    // Call the CalcLiveObjectsClosure to walk the marking bitmap for
+    // this region and set the corresponding bits in the expected region
+    // and card bitmaps.
+    bool res = _calc_cl.doHeapRegion(hr);
+    assert(res == false, "should be continuing");
+
+    // Note that the calculated count data could be a subset of the
+    // count data that was accumlated during marking. See the comment
+    // in G1ParCopyHelper::copy_to_survivor space for an explanation
+    // why.
+
+    if (_verbose) {
+      gclog_or_tty->print("Region %d: bottom: "PTR_FORMAT", ntams: "
+                          PTR_FORMAT", top: "PTR_FORMAT", end: "PTR_FORMAT,
+                          hr->hrs_index(), hr->bottom(), hr->next_top_at_mark_start(),
+                          hr->top(), hr->end());
+      gclog_or_tty->print_cr(", marked_bytes: calc/actual "SIZE_FORMAT"/"SIZE_FORMAT,
+                             _calc_cl.region_marked_bytes(),
+                             hr->next_marked_bytes());
+    }
+
+    // Verify that _top_at_conc_count == ntams
+    if (hr->top_at_conc_mark_count() != hr->next_top_at_mark_start()) {
+      if (_verbose) {
+        gclog_or_tty->print_cr("Region %d: top at conc count incorrect: expected "
+                               PTR_FORMAT", actual: "PTR_FORMAT,
+                               hr->hrs_index(), hr->next_top_at_mark_start(),
+                               hr->top_at_conc_mark_count());
+      }
+      _failures += 1;
+    }
+
+    // Verify the marked bytes for this region. 
+    size_t exp_marked_bytes = _calc_cl.region_marked_bytes();
+    size_t act_marked_bytes = hr->next_marked_bytes();
+
+    // We're OK if actual marked bytes >= expected.
+    if (exp_marked_bytes > act_marked_bytes) {
+      if (_verbose) {
+        gclog_or_tty->print_cr("Region %d: marked bytes mismatch: expected: "
+                               SIZE_FORMAT", actual: "SIZE_FORMAT,
+                               hr->hrs_index(), exp_marked_bytes, act_marked_bytes);
+      }
+      _failures += 1;
+    }
+
+    // Verify the bit, for this region, in the actual and expected
+    // (which was just calculated) region bit maps.
+    // We're not OK if the expected bit is set and the actual is not set.
+    BitMap::idx_t index = (BitMap::idx_t)hr->hrs_index();
+    
+    bool expected = _exp_region_bm->at(index);
+    bool actual = _region_bm->at(index);
+    if (expected && !actual) {
+      if (_verbose) {
+        gclog_or_tty->print_cr("Region %d: region bitmap mismatch: expected: %d, actual: %d",
+                               hr->hrs_index(), expected, actual);
+      }
+      _failures += 1;
+    }
+
+    // Verify that the card bit maps for the cards spanned by the current
+    // region match. The set of offsets that have set bits in the expected
+    // bitmap should be a subset of the offsets with set bits from the actual
+    // calculated card bitmap.
+    // Again it's more important that if the expected bit is set then the
+    // actual bit be set.
+    intptr_t start_card_num =
+        intptr_t(uintptr_t(hr->bottom()) >> CardTableModRefBS::card_shift);
+    intptr_t top_card_num =
+        intptr_t(uintptr_t(hr->top()) >> CardTableModRefBS::card_shift);
+
+    BitMap::idx_t start_idx = start_card_num - _bottom_card_num;
+    BitMap::idx_t end_idx = top_card_num - _bottom_card_num;
+
+    for (BitMap::idx_t i = start_idx; i < end_idx; i+=1) {
+      expected = _exp_card_bm->at(i);
+      actual = _card_bm->at(i);
+      
+      if (expected && !actual) {
+        if (_verbose) {
+          gclog_or_tty->print_cr("Region %d: card bitmap mismatch at idx %d: expected: %d, actual: %d",
+                                 hr->hrs_index(), i, expected, actual);
+        }
+        _failures += 1;
+      }
+    }
+    if (_failures) {
+      // Stop iteration?
+      return true;
+    }
+
+    return false;
+  }
+};
+
+class Mux2HRClosure: public HeapRegionClosure {
+  HeapRegionClosure* _cl1;
+  HeapRegionClosure* _cl2;
+
+public:
+  Mux2HRClosure(HeapRegionClosure *c1, HeapRegionClosure *c2) : _cl1(c1), _cl2(c2) { }
+  bool doHeapRegion(HeapRegion* hr) {
+    bool res1 = _cl1->doHeapRegion(hr);
+    bool res2 = _cl2->doHeapRegion(hr);
+
+    // Only continue if both return false;
+    return res1 || res2;
+  }
+};
+
 class G1ParFinalCountTask: public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
   CMBitMap* _bm;
   size_t _n_workers;
   size_t *_live_bytes;
   size_t *_used_bytes;
-  BitMap* _region_bm;
-  BitMap* _card_bm;
+
+  BitMap* _actual_region_bm;
+  BitMap* _actual_card_bm;
+
+  BitMap _expected_region_bm;
+  BitMap _expected_card_bm;
+
+  int _failures;
+
 public:
   G1ParFinalCountTask(G1CollectedHeap* g1h, CMBitMap* bm,
                       BitMap* region_bm, BitMap* card_bm)
-    : AbstractGangTask("G1 final counting"), _g1h(g1h),
-      _bm(bm), _region_bm(region_bm), _card_bm(card_bm) {
+    : AbstractGangTask("G1 final counting"),
+      _g1h(g1h), _bm(bm),
+      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
+      _expected_region_bm(0, false), _expected_card_bm(0, false),
+      _failures(0)
+  {
     if (ParallelGCThreads > 0) {
       _n_workers = _g1h->workers()->total_workers();
     } else {
       _n_workers = 1;
     }
+
     _live_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
     _used_bytes = NEW_C_HEAP_ARRAY(size_t, _n_workers);
+
+    if (VerifyDuringGC) {
+      _expected_card_bm.resize(_actual_card_bm->size(), false);
+      _expected_region_bm.resize(_actual_region_bm->size(), false);
+    }
   }
 
   ~G1ParFinalCountTask() {
+    if (VerifyDuringGC) {
+      _expected_region_bm.resize(0);
+      _expected_card_bm.resize(0);
+    }
     FREE_C_HEAP_ARRAY(size_t, _live_bytes);
     FREE_C_HEAP_ARRAY(size_t, _used_bytes);
   }
 
   void work(int i) {
-    CalcLiveObjectsClosure calccl(true /*final*/,
-                                  _bm, _g1h->concurrent_mark(),
-                                  _region_bm, _card_bm);
-    calccl.no_yield();
+
+    FinalCountDataUpdateClosure final_update_cl(_g1h->concurrent_mark(),
+                                                _actual_region_bm, _actual_card_bm);
+
+    VerifyLiveObjectDataHRClosure verify_cl(_g1h->concurrent_mark(),
+                                            _actual_region_bm, _actual_card_bm,
+                                            &_expected_region_bm,
+                                            &_expected_card_bm,
+                                            true /* verbose */);
+
+    Mux2HRClosure update_and_verify_cl(&final_update_cl, &verify_cl);
+
+    HeapRegionClosure* hr_cl = &final_update_cl;
+    if (VerifyDuringGC) {
+      hr_cl = &update_and_verify_cl;
+    }
+
     if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1h->heap_region_par_iterate_chunked(&calccl, i,
+      _g1h->heap_region_par_iterate_chunked(hr_cl, i,
                                             HeapRegion::FinalCountClaimValue);
     } else {
-      _g1h->heap_region_iterate(&calccl);
+      _g1h->heap_region_iterate(hr_cl);
     }
-    assert(calccl.complete(), "Shouldn't have yielded!");
 
     assert((size_t) i < _n_workers, "invariant");
-    _live_bytes[i] = calccl.tot_live();
-    _used_bytes[i] = calccl.tot_used();
+    _live_bytes[i] = final_update_cl.total_live_bytes();
+    _used_bytes[i] = final_update_cl.total_used_bytes();
+
+    if (VerifyDuringGC) {
+      _failures += verify_cl.failures();
+    }
   }
+
   size_t live_bytes()  {
     size_t live_bytes = 0;
     for (size_t i = 0; i < _n_workers; ++i)
       live_bytes += _live_bytes[i];
     return live_bytes;
   }
+
   size_t used_bytes()  {
     size_t used_bytes = 0;
     for (size_t i = 0; i < _n_workers; ++i)
       used_bytes += _used_bytes[i];
     return used_bytes;
   }
+
+  int failures() const { return _failures; }
 };
 
 class G1ParNoteEndTask;
 
 class G1NoteEndOfConcMarkClosure : public HeapRegionClosure {

@@ -1705,30 +2131,39 @@
 
   double start = os::elapsedTime();
 
   HeapRegionRemSet::reset_for_cleanup_tasks();
 
+  // Clear the global region bitmap - it will be filled as part
+  // of the final counting task.
+  _region_bm.clear();
+
   // Do counting once more with the world stopped for good measure.
   G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
                                         &_region_bm, &_card_bm);
+
   if (G1CollectedHeap::use_parallel_gc_threads()) {
-    assert(g1h->check_heap_region_claim_values(
-                                               HeapRegion::InitialClaimValue),
+    assert(g1h->check_heap_region_claim_values(HeapRegion::InitialClaimValue),
            "sanity check");
 
     int n_workers = g1h->workers()->total_workers();
     g1h->set_par_threads(n_workers);
     g1h->workers()->run_task(&g1_par_count_task);
     g1h->set_par_threads(0);
 
-    assert(g1h->check_heap_region_claim_values(
-                                             HeapRegion::FinalCountClaimValue),
+    assert(g1h->check_heap_region_claim_values(HeapRegion::FinalCountClaimValue),
            "sanity check");
   } else {
     g1_par_count_task.work(0);
   }
 
+  // Verify that there were no verification failures of
+  // the live counting data.
+  if (VerifyDuringGC) {
+    assert(g1_par_count_task.failures() == 0, "Unexpected failures");
+  }
+
   size_t known_garbage_bytes =
     g1_par_count_task.used_bytes() - g1_par_count_task.live_bytes();
   g1p->set_known_garbage_bytes(known_garbage_bytes);
 
   size_t start_used_bytes = g1h->used();

@@ -1927,11 +2362,14 @@
   CMBitMap*        _bitMap;
  public:
   G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm,
                        CMBitMap* bitMap) :
     _g1(g1), _cm(cm),
-    _bitMap(bitMap) {}
+    _bitMap(bitMap)
+  {
+    assert(Thread::current()->is_VM_thread(), "otherwise fix worker id");
+  }
 
   virtual void do_oop(narrowOop* p) { do_oop_work(p); }
   virtual void do_oop(      oop* p) { do_oop_work(p); }
 
   template <class T> void do_oop_work(T* p) {

@@ -1944,10 +2382,13 @@
                              p, (void*) obj);
     }
 
     if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
       _bitMap->mark(addr);
+      // Update the task specific count data for obj
+      _cm->add_to_count_data_for(obj, 0 /* worker_i */);
+
       _cm->mark_stack_push(obj);
     }
   }
 };
 

@@ -2594,10 +3035,13 @@
                                  "marked", (void*) obj);
         }
 
         // we need to mark it first
         if (_nextMarkBitMap->parMark(objAddr)) {
+          // Update the task specific count data for obj
+          add_to_count_data_for(obj, hr, 0 /* worker_i */);
+
           // No OrderAccess:store_load() is needed. It is implicit in the
           // CAS done in parMark(objAddr) above
           HeapWord* finger = _finger;
           if (objAddr < finger) {
             if (verbose_high()) {

@@ -2840,10 +3284,192 @@
     // Clear any partial regions from the CMTasks
     _tasks[i]->clear_aborted_region();
   }
 }
 
+// Clear the per-worker arrays used to store the per-region counting data
+void ConcurrentMark::clear_all_count_data() {
+  assert(SafepointSynchronize::is_at_safepoint() ||
+         !Universe::is_fully_initialized(), "must be");
+
+  int max_regions = _g1h->max_regions();
+  
+  assert(_max_task_num != 0, "unitialized");
+  assert(_count_card_bitmaps != NULL, "uninitialized");
+  assert(_count_marked_bytes != NULL, "uninitialized");
+
+  for (int i = 0; i < _max_task_num; i += 1) {
+    BitMap& task_card_bm = count_card_bitmap_for(i);
+    size_t* marked_bytes_array = count_marked_bytes_for(i);
+
+    assert(task_card_bm.size() == _card_bm.size(), "size mismatch");
+    assert(marked_bytes_array != NULL, "uninitialized");
+
+    for (int j = 0; j < max_regions; j++) {
+      marked_bytes_array[j] = 0;
+    }
+    task_card_bm.clear();
+  }
+}
+
+// Adds the given region to the counting data structures
+// for the given task id.
+void ConcurrentMark::add_to_count_data_for(MemRegion mr,
+                                           HeapRegion* hr,
+                                           int worker_i) {
+  G1CollectedHeap* g1h = _g1h;
+  HeapWord* start = mr.start();
+  HeapWord* last = mr.last();
+  size_t index = hr->hrs_index();
+
+  assert(!hr->continuesHumongous(), "should not be HC region");
+  assert(hr == g1h->heap_region_containing(start), "sanity");
+  assert(hr == g1h->heap_region_containing(mr.last()), "sanity");
+  assert(0 <= worker_i && worker_i < _max_task_num, "oob");
+
+  BitMap& task_card_bm = count_card_bitmap_for(worker_i);
+  size_t* marked_bytes_array = count_marked_bytes_for(worker_i);
+
+  // Below, the term "card num" means the result of shifting an address
+  // by the card shift -- address 0 corresponds to card number 0.  One
+  // must subtract the card num of the bottom of the heap to obtain a
+  // card table index.
+
+  intptr_t start_card_num = 
+    intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
+  intptr_t last_card_num =
+    intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift);
+
+  intptr_t bottom_card_num = 
+    intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> 
+        CardTableModRefBS::card_shift);
+
+  BitMap::idx_t start_idx = start_card_num - bottom_card_num;
+  BitMap::idx_t last_idx = last_card_num - bottom_card_num;
+  
+  // The card bitmap is task/worker specific => no need to use 'par' routines.
+  // Inclusive bit range [start_idx, last_idx]. set_range is exclusive
+  // so we have to also explicitly set the bit for last_idx.
+  // Passing last_idx+1 to the clear_range would work in most cases
+  // but could trip an OOB assertion.
+
+  if ((last_idx - start_idx) > 0) {
+    task_card_bm.set_range(start_idx, last_idx);
+  }
+  task_card_bm.set_bit(last_idx);
+  
+  // Add to the task local marked bytes for this region.
+  marked_bytes_array[index] += mr.byte_size();
+}
+
+void ConcurrentMark::add_to_count_data_for(oop obj, HeapRegion* hr, int worker_i) {
+  MemRegion mr((HeapWord*)obj, obj->size());
+  add_to_count_data_for(mr, hr, worker_i);
+}
+
+void ConcurrentMark::add_to_count_data_for(MemRegion mr, int worker_i) {
+  HeapRegion* hr = _g1h->heap_region_containing(mr.start());
+  add_to_count_data_for(mr, hr, worker_i);
+}
+
+void ConcurrentMark::add_to_count_data_for(oop obj, int worker_i) {
+  MemRegion mr((HeapWord*)obj, obj->size());
+  add_to_count_data_for(mr, worker_i);
+}
+
+// Updates the counting data with liveness info recorded for a
+// region (typically a GCLab).
+void ConcurrentMark::add_to_count_data_for_region(MemRegion lab_mr,
+                                                  BitMap* lab_card_bm,
+                                                  intptr_t lab_bottom_card_num,
+                                                  size_t lab_marked_bytes,
+                                                  int worker_i) {
+  HeapRegion* hr = _g1h->heap_region_containing(lab_mr.start());
+
+  BitMap& task_card_bm = count_card_bitmap_for(worker_i);
+  size_t* marked_bytes_array = count_marked_bytes_for(worker_i);
+
+  // Below, the term "card num" means the result of shifting an address
+  // by the card shift -- address 0 corresponds to card number 0.  One
+  // must subtract the card num of the bottom of the heap to obtain a
+  // card table index.
+  
+  intptr_t heap_bottom_card_num = 
+    intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> 
+        CardTableModRefBS::card_shift);
+
+  assert(intptr_t(uintptr_t(lab_mr.start()) >> CardTableModRefBS::card_shift) == lab_bottom_card_num,
+         "sanity");
+
+  // We have to map the indices of set bits in lab_card_bm, using
+  // lab_bottom_card_num, to indices the card bitmap for the given task.
+
+  BitMap::idx_t end_idx = lab_card_bm->size();
+  BitMap::idx_t start_idx = lab_card_bm->get_next_one_offset(0, end_idx);
+  while (start_idx < end_idx) {
+    assert(lab_card_bm->at(start_idx), "should be set");
+
+    intptr_t lab_card_num = lab_bottom_card_num + start_idx;
+    BitMap::idx_t card_bm_idx = lab_card_num - heap_bottom_card_num;
+
+    task_card_bm.set_bit(card_bm_idx);
+
+    // Get the offset of the next set bit
+    start_idx = lab_card_bm->get_next_one_offset(start_idx+1, end_idx);
+  }
+
+  // Now add to the marked bytes
+  marked_bytes_array[hr->hrs_index()] += lab_marked_bytes;
+}
+
+void ConcurrentMark::clear_count_data_for_heap_region(HeapRegion* hr) {
+  // Clears the count data for the given region from _all_ of
+  // the per-task counting data structures.
+
+  MemRegion used_region = hr->used_region();
+  HeapWord* start = used_region.start();
+  HeapWord* last = used_region.last();
+  size_t hr_index = hr->hrs_index();
+
+  intptr_t bottom_card_num =
+      intptr_t(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >>
+               CardTableModRefBS::card_shift);
+  
+  intptr_t start_card_num =
+    intptr_t(uintptr_t(start) >> CardTableModRefBS::card_shift);
+  intptr_t last_card_num =
+    intptr_t(uintptr_t(last) >> CardTableModRefBS::card_shift);
+  
+  BitMap::idx_t start_idx = start_card_num - bottom_card_num;
+  BitMap::idx_t last_idx = last_card_num - bottom_card_num;
+
+  size_t used_region_bytes = used_region.byte_size();
+  size_t marked_bytes = 0;
+
+  for (int i=0; i < _max_task_num; i += 1) {
+    BitMap& task_card_bm = count_card_bitmap_for(i);
+    size_t* marked_bytes_array = count_marked_bytes_for(i);
+
+    marked_bytes += marked_bytes_array[hr_index];
+    // clear the amount of marked bytes in the task array for this
+    // region
+    marked_bytes_array[hr_index] = 0;
+    
+    // Clear the inclusive range [start_idx, last_idx] from the
+    // card bitmap. The clear_range routine is exclusive so we
+    // need to also explicitly clear the bit at last_idx.
+    // Passing last_idx+1 to the clear_range would work in
+    // most cases but could trip an OOB assertion.
+
+    if ((last_idx - start_idx) > 0) {
+      task_card_bm.clear_range(start_idx, last_idx);
+    }
+    task_card_bm.clear_bit(last_idx);
+  }
+  // We could assert here that marked_bytes == used_region_bytes
+}
+
 void ConcurrentMark::print_stats() {
   if (verbose_stats()) {
     gclog_or_tty->print_cr("---------------------------------------------------------------------");
     for (size_t i = 0; i < _active_tasks; ++i) {
       _tasks[i]->print_stats();

@@ -2945,10 +3571,13 @@
     HeapRegion* hr = _g1h->heap_region_containing(obj);
     if (hr != NULL) {
       if (hr->in_collection_set()) {
         if (_g1h->is_obj_ill(obj)) {
           _bm->mark((HeapWord*)obj);
+          // Update the task specific count data for object
+          _cm->add_to_count_data_for(obj, hr, 0 /* worker_i */);
+
           if (!push(obj)) {
             gclog_or_tty->print_cr("Setting abort in CSMarkOopClosure because push failed.");
             set_abort();
           }
         }

@@ -3026,18 +3655,22 @@
 
   bool completed() { return _completed; }
 };
 
 class ClearMarksInHRClosure: public HeapRegionClosure {
+  ConcurrentMark* _cm;
   CMBitMap* _bm;
 public:
-  ClearMarksInHRClosure(CMBitMap* bm): _bm(bm) { }
+  ClearMarksInHRClosure(ConcurrentMark* cm, CMBitMap* bm):
+    _cm(cm), _bm(bm)
+  { }
 
   bool doHeapRegion(HeapRegion* r) {
     if (!r->used_region().is_empty() && !r->evacuation_failed()) {
-      MemRegion usedMR = r->used_region();
       _bm->clearRange(r->used_region());
+      // Need to remove values from the count info
+      _cm->clear_count_data_for_heap_region(r);
     }
     return false;
   }
 };
 

@@ -3059,11 +3692,11 @@
   }
   double end_time = os::elapsedTime();
   double elapsed_time_ms = (end_time - start) * 1000.0;
   g1h->g1_policy()->record_mark_closure_time(elapsed_time_ms);
 
-  ClearMarksInHRClosure clr(nextMarkBitMap());
+  ClearMarksInHRClosure clr(this, nextMarkBitMap());
   g1h->collection_set_iterate(&clr);
 }
 
 // The next two methods deal with the following optimisation. Some
 // objects are gray by being marked and located above the finger. If

@@ -3201,14 +3834,13 @@
   }
   gclog_or_tty->print_cr("  Total stop_world time = %8.2f s.",
                          (_init_times.sum() + _remark_times.sum() +
                           _cleanup_times.sum())/1000.0);
   gclog_or_tty->print_cr("  Total concurrent time = %8.2f s "
-                "(%8.2f s marking, %8.2f s counting).",
+                "(%8.2f s marking).",
                 cmThread()->vtime_accum(),
-                cmThread()->vtime_mark_accum(),
-                cmThread()->vtime_count_accum());
+                cmThread()->vtime_mark_accum());
 }
 
 void ConcurrentMark::print_worker_threads_on(outputStream* st) const {
   _parallel_workers->print_worker_threads_on(st);
 }