
src/share/vm/gc/g1/g1CollectedHeap.cpp

rev 8802 : G1 performance improvements: card batching, joining, sorting and prefetching, plus write barrier fence elision and simplification based on global synchronization using handshakes that piggyback on thread-local safepoints.
rev 8803 : Implementation improvements to pass JPRT
rev 8805 : Another JPRT attempt

@@ -63,10 +63,11 @@
 #include "gc/shared/taskqueue.inline.hpp"
 #include "memory/allocation.hpp"
 #include "memory/iterator.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/atomic.inline.hpp"
+#include "runtime/globalSynchronizer.hpp"
 #include "runtime/orderAccess.inline.hpp"
 #include "runtime/vmThread.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/stack.inline.hpp"
 

@@ -87,33 +88,293 @@
 // allocation code from the rest of the JVM.  (Note that this does not
 // apply to TLAB allocation, which is not part of this interface: it
 // is done by clients of this interface.)
 
 // Local to this file.
-
-class RefineCardTableEntryClosure: public CardTableEntryClosure {
-  bool _concurrent;
-public:
-  RefineCardTableEntryClosure() : _concurrent(true) { }
-
-  bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
+bool RefineCardTableEntryClosure::do_card_ptr(jbyte* card_ptr, uint worker_i) {
     bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false);
     // This path is executed by the concurrent refine or mutator threads,
     // concurrently, and so we do not care if card_ptr contains references
     // that point into the collection set.
     assert(!oops_into_cset, "should be");
 
-    if (_concurrent && SuspendibleThreadSet::should_yield()) {
-      // Caller will actually yield.
+  // return false if caller should yield
+  return !(G1CollectedHeap::heap()->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield());
+}
+
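+// A CardBuffer holds one batch of card pointers and the memory regions they
+// cover, together with a synchronizer object used for the global
+// synchronization that replaces write barrier fences when G1ElideMembar is
+// enabled, and a miss counter used to escalate the urgency of that
+// synchronization.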
+CardBuffer::CardBuffer()
+  : _next(NULL) {
+  int size = BufferedRefineCardTableEntryClosure::buffer_size();
+  _card_buffer = NEW_C_HEAP_ARRAY(jbyte*, size, mtGC);
+  _mr_buffer = NEW_C_HEAP_ARRAY(MemRegion, size, mtGC);
+  _gs = new SynchronizerObj<mtGC>();
+  _misses = 0;
+}
+
+CardBuffer::~CardBuffer() {
+  FREE_C_HEAP_ARRAY(jbyte*, _card_buffer);
+  FREE_C_HEAP_ARRAY(MemRegion, _mr_buffer);
+  delete _gs;
+}
+
+BufferedRefineCardTableEntryClosure::BufferedRefineCardTableEntryClosure()
+  : _index(0), _g1h(G1CollectedHeap::heap()), _head_buffer(NULL), _tail_buffer(NULL),
+    _current_buffer(NULL), _async_buffers(0) {
+}
+
+BufferedRefineCardTableEntryClosure::~BufferedRefineCardTableEntryClosure() {
+  assert(_index == 0, "must flush refine card buffer");
+  assert(_head_buffer == NULL && _tail_buffer == NULL, "must flush all async cards first");
+  assert(_async_buffers == 0, "must flush all async cards first");
+  if (_current_buffer) delete _current_buffer;
+}
+
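+// Instead of refining each card as it is seen, accumulate the card pointers
+// into the current CardBuffer and only process them in bulk when the buffer
+// fills up (soft_flush) or when the caller has to yield (flush_buffer).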
+bool BufferedRefineCardTableEntryClosure::do_card_ptr(jbyte *card_ptr, uint worker_i) {
+  _worker_i = worker_i;
+  if (_index == buffer_size()) soft_flush();
+  if (_current_buffer == NULL) _current_buffer = new CardBuffer();
+  _current_buffer->_card_buffer[_index++] = card_ptr;
+
+  bool should_yield = _g1h->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield();
+  if (should_yield) flush_buffer();
+
+  // return false if caller should yield
+  return !should_yield;
+}
+
+void BufferedRefineCardTableEntryClosure::soft_flush() {
+  general_flush(false);
+}
+
+// Procedures used to sort and join G1 cards during refinement
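+// Sorting the batch by card address lets join_cards merge runs of adjacent
+// cards in the same heap region into a single MemRegion, so the joined range
+// can be scanned in one pass and prefetched more effectively.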
+static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right);
+static int partition(jbyte **card_array, MemRegion *region_array, int left, int right);
+static int join_cards(jbyte **card_array, MemRegion *region_array, int length);
+
+static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right) {
+  if (left < right) {
+    int middle = partition(card_array, region_array, left, right);
+    quick_sort(card_array, region_array, left, middle);
+    quick_sort(card_array, region_array, middle + 1, right);
+  }
+}
+
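+// Lomuto-style partition around the leftmost card; the region array is
+// permuted in lockstep so each card stays paired with its memory region.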
+static int partition(jbyte **card_array, MemRegion *region_array, int left, int right) {
+  jbyte *card = card_array[left];
+  int i = left;
+
+  for (int j = left + 1; j < right; j++) {
+    if (card_array[j] <= card) {
+      i = i + 1;
+      swap(card_array[i], card_array[j]);
+      swap(region_array[i], region_array[j]);
+    }
+  }
+
+  swap(card_array[i], card_array[left]);
+  swap(region_array[i], region_array[left]);
+  return i;
+}
+
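+// Compacts the batch in place: only cards that still have the clean value are
+// kept, and a kept card that directly follows the previous kept card within
+// the same heap region is folded into the previous entry by extending its
+// MemRegion. Returns the number of surviving entries.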
+static int join_cards(jbyte **card_array, MemRegion *region_array, int length) {
+  G1CollectedHeap *g1h = G1CollectedHeap::heap();
+  jbyte *prev_card = NULL;
+  HeapRegion *prev_hr = NULL;
+  int insert_head = 0;
+  for (int i = 0; i < length; i++) {
+    jbyte *card = card_array[i];
+
+    if (*card == CardTableModRefBS::clean_card_val()) {
+      HeapRegion *hr = g1h->heap_region_containing_raw(region_array[i].start());
+      if (card == prev_card + 1 && hr == prev_hr) {
+        MemRegion insert_region = region_array[insert_head - 1];
+        region_array[insert_head - 1] = MemRegion(insert_region.start(), region_array[i].end());
+      } else {
+        card_array[insert_head] = card;
+        region_array[insert_head] = region_array[i];
+        insert_head++;
+      }
+      prev_hr = hr;
+    }
+
+    prev_card = card;
+  }
+
+  return insert_head;
+}
+
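+// The batch size follows G1UpdateBufferSize, the size of a dirty card queue
+// buffer, so one completed update buffer fits in a single batch.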
+int BufferedRefineCardTableEntryClosure::buffer_size() {
+  return (int)G1UpdateBufferSize;
+}
+
+void BufferedRefineCardTableEntryClosure::flush_buffer() {
+  general_flush(true);
+}
+
+// Cleans the cards in the buffer and starts the synchronization with the
+// mutator threads. Returns true if any cards survived cleaning and the
+// buffer therefore needs a post_sync pass.
+bool BufferedRefineCardTableEntryClosure::pre_sync(CardBuffer *buffer, bool hard) {
+  // 1. Clean all cards in the batch.
+  G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set();
+  int needs_processing = 0;
+
+  jbyte    **const card_buffer = buffer->_card_buffer;
+  MemRegion *const mr_buffer   = buffer->_mr_buffer;
+  const int length = buffer->_length;
+
+  for (int i = 0; i < length; i++) {
+    if (g1rs->clean_card(card_buffer[i], _worker_i, mr_buffer[i])) {
+      card_buffer[needs_processing] = card_buffer[i];
+      mr_buffer[needs_processing] = mr_buffer[i];
+      needs_processing++;
+    }
+  }
+  buffer->_length = needs_processing;
+
+  if (needs_processing == 0) {
+    if (hard) {
+      // If we are forced to finish scanning, we must serialize stores anyway.
+      OrderAccess::storeload();
+      if (G1ElideMembar) {
+        buffer->_gs->start_synchronizing();
+      }
+    }
       return false;
     }
-    // Otherwise, we finished successfully; return true.
+
+  OrderAccess::storeload();
+  if (G1ElideMembar) {
+    buffer->_gs->start_synchronizing();
+  }
+
+  // 2. Sort the cards
+  quick_sort(buffer->_card_buffer, buffer->_mr_buffer, 0, buffer->_length);
+
     return true;
+}
+
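+// Tries to complete the global synchronization started in pre_sync. A soft
+// sync only polls and reports whether it succeeded; a hard sync escalates the
+// urgency and blocks until the synchronization has completed. Without
+// G1ElideMembar the storeload fence issued in pre_sync already suffices, so
+// there is nothing to wait for.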
+bool BufferedRefineCardTableEntryClosure::sync(CardBuffer *buffer, bool hard) {
+  if (!G1ElideMembar) return true;
+
+  bool success = buffer->_gs->try_synchronize();
+  if (hard) {
+    if (!success) {
+      buffer->_gs->maximize_urgency();
+      buffer->_gs->synchronize();
+    }
+    return true;
+  } else {
+    return success;
   }
+}
 
-  void set_concurrent(bool b) { _concurrent = b; }
-};
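+// Scans the cards that survived cleaning: the batch is processed 16 cards at
+// a time, adjacent cards are joined, upcoming regions are prefetched ahead of
+// the scan, and each resulting region is refined.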
+void BufferedRefineCardTableEntryClosure::post_sync(CardBuffer *buffer) {
+  const int length = buffer->_length;
+
+  const int card_batch_size = 16;
+  jbyte **current_card = buffer->_card_buffer;
+  MemRegion *current_region = buffer->_mr_buffer;
+
+  const uintx interval = PrefetchScanIntervalInBytes * 2;
+
+  G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set();
+
+  // 3. Batch 16 cards at a time
+
+  for (int j = 0; j < length; j += card_batch_size) {
+    // 4. Join consecutive cards together and prefetch next card
+    int batch = MIN2((length - j), card_batch_size);
+    batch = join_cards(current_card, current_region, batch);
+
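+    // Peek at the first card of the next batch so prefetching can continue
+    // across the batch boundary; for the last batch end_card points at a
+    // dummy dirty value, which disables the cross-batch prefetch.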
+    jbyte dirty_card_val = CardTableModRefBS::dirty_card_val();
+    jbyte *end_card;
+    HeapWord *end_prefetch;
+
+    if (j + card_batch_size < length) {
+      end_prefetch = current_region[card_batch_size].start();
+      end_card = current_card[card_batch_size];
+    } else {
+      end_card = &dirty_card_val;
+    }
+
+    MemRegion *region_end = current_region + batch;
+    jbyte** batch_card;
+    MemRegion* batch_region;
+
+    for (batch_card = current_card, batch_region = current_region; batch_region != region_end; batch_card++) {
+      jbyte *card = *batch_card;
+      MemRegion mr = *batch_region;
+      MemRegion *next_region = batch_region + 1;
+
+      if (next_region != region_end) {
+        MemRegion next_region_val = *next_region;
+        // Prefetch interval in batch
+        Prefetch::read(next_region_val.start(), next_region_val.byte_size());
+      } else if (*end_card == CardTableModRefBS::clean_card_val()) {
+        // Prefetch broken interval to next batch
+        Prefetch::read(end_prefetch, interval);
+      }
+
+      g1rs->refine_card_buffered(card, _worker_i, /*check_for_cset_refs*/ false, mr);
+
+      batch_region = next_region;
+    }
+
+    current_region += card_batch_size;
+    current_card += card_batch_size;
+  }
+}
+
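+// Common flush path. The current buffer is cleaned and queued for
+// asynchronous completion. Pending buffers are then drained from the head of
+// the queue: a soft flush stops at the first buffer whose synchronization has
+// not yet completed (raising its urgency if it keeps missing), while a hard
+// flush performs a blocking synchronization and drains everything.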
+void BufferedRefineCardTableEntryClosure::general_flush(bool hard) {
+  if (_index == 0) {
+    assert(hard, "invariant");
+    if (_async_buffers == 0) return;
+  }
+
+  // 1. Start asynchronous synchronization for the current buffer
+  if (_current_buffer == NULL) _current_buffer = new CardBuffer();
+  _current_buffer->_length = _index;
+  if (pre_sync(_current_buffer, hard) || hard) {
+    // append async buffer
+    CardBuffer *tail = _tail_buffer;
+    if (tail != NULL) tail->_next = _current_buffer;
+    _tail_buffer = _current_buffer;
+    if (_head_buffer == NULL) _head_buffer = _current_buffer;
+    if (hard) sync(_current_buffer, hard);
+    _current_buffer = NULL;
+    _async_buffers++;
+  }
+
+  _index = 0;
+
+  // 2. Process older batches that have been cleaned but could not yet
+  //    synchronize (asynchronous completion).
+  CardBuffer *current = _head_buffer;
+  while (current != NULL) {
+    if (hard || sync(current, hard)) {
+      post_sync(current);
+      CardBuffer *next = current->_next;
+      _head_buffer = next;
+      if (next == NULL) _tail_buffer = NULL;
+      delete current;
+      current = next;
+      _async_buffers--;
+    } else {
+      current->_misses++;
+      if ((_async_buffers > 4 && current->_misses > 2) ||
+          (_async_buffers > 8 && current->_misses > 4) ||
+          (_async_buffers > 16 && current->_misses > 6)) {
+        current->_gs->increase_urgency();
+      }
+      break;
+    }
+  }
+}
 
 
 class RedirtyLoggedCardTableEntryClosure : public CardTableEntryClosure {
  private:
   size_t _num_processed;

@@ -1917,11 +2178,11 @@
   _ref_processor_cm(NULL),
   _ref_processor_stw(NULL),
   _bot_shared(NULL),
   _cg1r(NULL),
   _g1mm(NULL),
-  _refine_cte_cl(NULL),
+  _refine_cte_cl_concurrency(true),
   _secondary_free_list("Secondary Free List", new SecondaryFreeRegionListMtSafeChecker()),
   _old_set("Old Set", false /* humongous */, new OldRegionSetMtSafeChecker()),
   _humongous_set("Master Humongous Set", true /* humongous */, new HumongousRegionSetMtSafeChecker()),
   _humongous_reclaim_candidates(),
   _has_humongous_reclaim_candidates(false),

@@ -2030,13 +2291,11 @@
   // Ensure that the sizes are properly aligned.
   Universe::check_alignment(init_byte_size, HeapRegion::GrainBytes, "g1 heap");
   Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap");
   Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap");
 
-  _refine_cte_cl = new RefineCardTableEntryClosure();
-
-  _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl);
+  _cg1r = new ConcurrentG1Refine(this);
 
   // Reserve the maximum.
 
   // When compressed oops are enabled, the preferred heap base
   // is calculated by subtracting the requested size from the

@@ -2156,28 +2415,28 @@
   JavaThread::satb_mark_queue_set().initialize(SATB_Q_CBL_mon,
                                                SATB_Q_FL_lock,
                                                G1SATBProcessCompletedThreshold,
                                                Shared_SATB_Q_lock);
 
-  JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl,
+  JavaThread::dirty_card_queue_set().initialize(true,
                                                 DirtyCardQ_CBL_mon,
                                                 DirtyCardQ_FL_lock,
                                                 concurrent_g1_refine()->yellow_zone(),
                                                 concurrent_g1_refine()->red_zone(),
                                                 Shared_DirtyCardQ_lock);
 
-  dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code
+  dirty_card_queue_set().initialize(false, // Should never be called by the Java code
                                     DirtyCardQ_CBL_mon,
                                     DirtyCardQ_FL_lock,
                                     -1, // never trigger processing
                                     -1, // no limit on length
                                     Shared_DirtyCardQ_lock,
                                     &JavaThread::dirty_card_queue_set());
 
   // Initialize the card queue set used to hold cards containing
   // references into the collection set.
-  _into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code
+  _into_cset_dirty_card_queue_set.initialize(false, // Should never be called by the Java code
                                              DirtyCardQ_CBL_mon,
                                              DirtyCardQ_FL_lock,
                                              -1, // never trigger processing
                                              -1, // no limit on length
                                              Shared_DirtyCardQ_lock,

@@ -6379,11 +6638,15 @@
                  "value: " SIZE_FORMAT " recalculated: " SIZE_FORMAT,
                  used_unlocked(), recalculate_used()));
 }
 
 void G1CollectedHeap::set_refine_cte_cl_concurrency(bool concurrent) {
-  _refine_cte_cl->set_concurrent(concurrent);
+  _refine_cte_cl_concurrency = concurrent;
+}
+
+bool G1CollectedHeap::refine_cte_cl_concurrency() {
+  return _refine_cte_cl_concurrency;
 }
 
 bool G1CollectedHeap::is_in_closed_subset(const void* p) const {
   HeapRegion* hr = heap_region_containing(p);
   return hr->is_in(p);