< prev index next >
src/share/vm/gc/g1/g1CollectedHeap.cpp
Print this page
rev 8802 : G1 performance improvements: card batching, joining, sorting, prefetching and write barrier fence elision and simplification based on a global synchronization using handshakes piggybacking on thread-local safepoints.
rev 8803 : Implementation improvements to pass JPRT
rev 8805 : Another JPRT attempt
@@ -63,10 +63,11 @@
#include "gc/shared/taskqueue.inline.hpp"
#include "memory/allocation.hpp"
#include "memory/iterator.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/atomic.inline.hpp"
+#include "runtime/globalSynchronizer.hpp"
#include "runtime/orderAccess.inline.hpp"
#include "runtime/vmThread.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/stack.inline.hpp"
@@ -87,33 +88,293 @@
// allocation code from the rest of the JVM. (Note that this does not
// apply to TLAB allocation, which is not part of this interface: it
// is done by clients of this interface.)
// Local to this file.
-
-class RefineCardTableEntryClosure: public CardTableEntryClosure {
- bool _concurrent;
-public:
- RefineCardTableEntryClosure() : _concurrent(true) { }
-
- bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
+// Refine one dirty card on the concurrent/mutator refinement path.  The
+// per-closure _concurrent flag is gone; concurrency is now queried from
+// the heap via G1CollectedHeap::refine_cte_cl_concurrency().
+bool RefineCardTableEntryClosure::do_card_ptr(jbyte* card_ptr, uint worker_i) {
bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false);
// This path is executed by the concurrent refine or mutator threads,
// concurrently, and so we do not care if card_ptr contains references
// that point into the collection set.
assert(!oops_into_cset, "should be");
- if (_concurrent && SuspendibleThreadSet::should_yield()) {
- // Caller will actually yield.
+ // return false if caller should yield
+ return !(G1CollectedHeap::heap()->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield());
+}
+
+// A CardBuffer is one batch of dirty cards awaiting refinement: parallel
+// C-heap arrays of card pointers and the MemRegions they cover, a
+// per-buffer synchronizer object (used for the global handshake when
+// G1ElideMembar is set), and a miss counter used by general_flush() to
+// escalate synchronization urgency.  Buffers link through _next into the
+// closure's async queue.
+CardBuffer::CardBuffer()
+ : _next(NULL) {
+ int size = BufferedRefineCardTableEntryClosure::buffer_size();
+ _card_buffer = NEW_C_HEAP_ARRAY(jbyte*, size, mtGC);
+ _mr_buffer = NEW_C_HEAP_ARRAY(MemRegion, size, mtGC);
+ _gs = new SynchronizerObj<mtGC>();
+ _misses = 0;
+}
+
+// Release the C-heap card/region arrays and the synchronizer object.
+CardBuffer::~CardBuffer() {
+ FREE_C_HEAP_ARRAY(jbyte*, _card_buffer);
+ FREE_C_HEAP_ARRAY(MemRegion, _mr_buffer);
+ delete _gs;
+}
+
+// Starts with no current buffer and an empty async queue; the first
+// buffered card allocates a CardBuffer lazily (see do_card_ptr()).
+BufferedRefineCardTableEntryClosure::BufferedRefineCardTableEntryClosure()
+ : _index(0), _g1h(G1CollectedHeap::heap()), _head_buffer(NULL), _tail_buffer(NULL),
+ _current_buffer(NULL), _async_buffers(0) {
+}
+
+// All buffered and asynchronously queued cards must have been flushed
+// (flush_buffer() / general_flush(true)) before the closure is destroyed.
+BufferedRefineCardTableEntryClosure::~BufferedRefineCardTableEntryClosure() {
+ assert(_index == 0, "must flush refine card buffer");
+ assert(_head_buffer == NULL && _tail_buffer == NULL, "must flush all async cards first");
+ assert(_async_buffers == 0, "must flush all async cards first");
+ if (_current_buffer) delete _current_buffer;
+}
+
+// Buffer a card instead of refining it immediately.  A full buffer is
+// soft-flushed (batch cleaning + asynchronous synchronization).  When
+// the suspendible-thread-set asks us to yield, everything is
+// hard-flushed first so no buffered cards are left behind.
+bool BufferedRefineCardTableEntryClosure::do_card_ptr(jbyte *card_ptr, uint worker_i) {
+ _worker_i = worker_i;
+ if (_index == buffer_size()) soft_flush();
+ if (_current_buffer == NULL) _current_buffer = new CardBuffer();
+ _current_buffer->_card_buffer[_index++] = card_ptr;
+
+ bool should_yield = _g1h->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield();
+ if (should_yield) flush_buffer();
+
+ // return false if caller should yield
+ return !should_yield;
+}
+
+// A soft flush queues the current batch but tolerates the global
+// synchronization not having completed yet; completion is retried on
+// subsequent flushes (see general_flush()).
+void BufferedRefineCardTableEntryClosure::soft_flush() {
+ general_flush(false);
+}
+
+// Procedures used to sort and join G1 cards during refinement
+static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right);
+static int partition(jbyte **card_array, MemRegion *region_array, int left, int right);
+static int join_cards(jbyte **card_array, MemRegion *region_array, int length);
+
+// Recursive quicksort over a card batch, keeping the card-pointer array
+// and the parallel MemRegion array in lockstep.  Sorting by card address
+// makes physically adjacent cards neighbors so join_cards() can merge
+// them.  `right` is exclusive (called with length as the upper bound).
+static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right) {
+ int middle;
+ if (left < right)
+ {
+ middle = partition(card_array, region_array, left, right);
+ quick_sort(card_array, region_array, left, middle);
+ quick_sort(card_array, region_array, middle + 1, right);
+ }
+}
+
+// Lomuto-style partition using card_array[left] as the pivot, swapping
+// the MemRegion array in lockstep.  `right` is exclusive, matching
+// quick_sort().  Returns the pivot's final index.
+static int partition(jbyte **card_array, MemRegion *region_array, int left, int right) {
+ jbyte *card = card_array[left];
+ int i = left;
+ int j;
+
+ for (j = left + 1; j < right; j++)
+ {
+ if (card_array[j] <= card)
+ {
+ i = i + 1;
+ swap(card_array[i], card_array[j]);
+ swap(region_array[i], region_array[j]);
+ }
+ }
+
+ // Move the pivot into its sorted position.
+ swap(card_array[i], card_array[left]);
+ swap(region_array[i], region_array[left]);
+ return i;
+}
+
+// Merge physically consecutive clean cards within the same HeapRegion
+// into a single entry, so one refinement pass can scan the joined
+// MemRegion.  Cards that are no longer clean (re-dirtied after cleaning)
+// are dropped.  The arrays are compacted in place; returns the number of
+// surviving entries (<= length).  Expects the input to be sorted.
+static int join_cards(jbyte **card_array, MemRegion *region_array, int length) {
+ G1CollectedHeap *g1h = G1CollectedHeap::heap();
+ jbyte *prev_card = NULL;
+ HeapRegion *prev_hr = NULL;
+ int insert_head = 0;
+ for (int i = 0; i < length; i++) {
+ jbyte *card = card_array[i];
+
+ if (*card == CardTableModRefBS::clean_card_val()) {
+ HeapRegion *hr = g1h->heap_region_containing_raw(region_array[i].start());
+ if (card == prev_card + 1 && hr == prev_hr) {
+ // Adjacent to the previously inserted entry: widen that entry's
+ // region instead of inserting a new one.  insert_head >= 1 here,
+ // since a non-NULL prev_hr implies a prior insertion.
+ MemRegion insert_region = region_array[insert_head - 1];
+ region_array[insert_head - 1] = MemRegion(insert_region.start(), region_array[i].end());
+ } else {
+ card_array[insert_head] = card;
+ region_array[insert_head] = region_array[i];
+ insert_head++;
+ }
+ prev_hr = hr;
+ }
+
+ // NOTE(review): prev_card advances even over skipped (non-clean)
+ // cards, so a later clean card can join across a re-dirtied gap;
+ // presumably benign (the gap gets rescanned) -- confirm.
+ prev_card = card;
+ }
+
+ return insert_head;
+}
+
+// Number of cards buffered per batch; mirrors -XX:G1UpdateBufferSize.
+int BufferedRefineCardTableEntryClosure::buffer_size() {
+ return (int)G1UpdateBufferSize;
+}
+
+// A hard flush: queue the current batch and force completion of the
+// global synchronization so every buffered card is fully refined.
+void BufferedRefineCardTableEntryClosure::flush_buffer() {
+ general_flush(true);
+}
+
+// Returns true if it needs post sync
+// Pass 1 of a batch flush: clean every card in the buffer, compacting
+// the arrays down to the cards that still need processing.  If any
+// survive, publish the cleaning with a storeload fence and -- when
+// G1ElideMembar is set -- start an asynchronous global handshake
+// instead of waiting, then sort the cards so post_sync() can join
+// adjacent ones.
+bool BufferedRefineCardTableEntryClosure::pre_sync(CardBuffer *buffer, bool hard) {
+ // 1. Clean all cards in the batch.
+ G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set();
+ int needs_processing = 0;
+
+ jbyte **const card_buffer = buffer->_card_buffer;
+ MemRegion *const mr_buffer = buffer->_mr_buffer;
+ const int length = buffer->_length;
+
+ for (int i = 0; i < length; i++) {
+ if (g1rs->clean_card(card_buffer[i], _worker_i, mr_buffer[i])) {
+ // Card still needs scanning: keep it, compacting toward the front.
+ card_buffer[needs_processing] = card_buffer[i];
+ mr_buffer[needs_processing] = mr_buffer[i];
+ needs_processing++;
+ }
+ }
+ buffer->_length = needs_processing;
+
+ if (needs_processing == 0) {
+ if (hard) {
+ // If we are forced to finish scanning, we must serialize stores anyway.
+ OrderAccess::storeload();
+ if (G1ElideMembar) {
+ buffer->_gs->start_synchronizing();
+ }
+ }
return false;
}
- // Otherwise, we finished successfully; return true.
+
+ OrderAccess::storeload();
+ if (G1ElideMembar) {
+ buffer->_gs->start_synchronizing();
+ }
+
+ // 2. Sort the cards
+ quick_sort(buffer->_card_buffer, buffer->_mr_buffer, 0, buffer->_length);
+
return true;
+}
+
+// Try to complete the global synchronization for `buffer`.  With
+// G1ElideMembar off there is nothing to wait for (the storeload fence in
+// pre_sync() already serialized).  A hard sync blocks until done,
+// maximizing urgency first; a soft sync merely reports whether the
+// handshake has finished.
+bool BufferedRefineCardTableEntryClosure::sync(CardBuffer *buffer, bool hard) {
+ if (!G1ElideMembar) return true;
+
+ bool success = buffer->_gs->try_synchronize();
+ if (hard) {
+ if (!success) {
+ buffer->_gs->maximize_urgency();
+ buffer->_gs->synchronize();
+ }
+ return true;
+ } else {
+ return success;
}
+}
- void set_concurrent(bool b) { _concurrent = b; }
-};
+// Pass 2 of a batch flush: once the cleaning of the batch is globally
+// visible, process the (sorted) cards in sub-batches of 16 -- joining
+// physically adjacent cards and prefetching ahead -- then refine each
+// surviving entry.
+void BufferedRefineCardTableEntryClosure::post_sync(CardBuffer *buffer) {
+ const int length = buffer->_length;
+
+ const int card_batch_size = 16;
+ jbyte **current_card = buffer->_card_buffer;
+ MemRegion *current_region = buffer->_mr_buffer;
+
+ const uintx interval = PrefetchScanIntervalInBytes * 2;
+
+ G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set();
+
+ // 3. Batch 16 cards at a time
+
+ for (int j = 0; j < length; j += card_batch_size) {
+ // 4. Join consecutive cards together and prefetch next card
+ int batch = MIN2((length - j), card_batch_size);
+ batch = join_cards(current_card, current_region, batch);
+
+ jbyte dirty_card_val = CardTableModRefBS::dirty_card_val();
+ jbyte *end_card;
+ HeapWord *end_prefetch;
+
+ if (j + card_batch_size < length) {
+ // First entry of the next sub-batch (still unjoined, so indexing by
+ // card_batch_size is valid even though this batch was compacted).
+ end_prefetch = current_region[card_batch_size].start();
+ end_card = current_card[card_batch_size];
+ } else {
+ // Last sub-batch: point end_card at a dirty sentinel so the
+ // cross-batch prefetch below can never fire; end_prefetch is
+ // deliberately left unset on this path and is never read.
+ end_card = &dirty_card_val;
+ }
+
+ MemRegion *region_end = current_region + batch;
+ jbyte** batch_card;
+ MemRegion* batch_region;
+
+ for (batch_card = current_card, batch_region = current_region; batch_region != region_end; batch_card++) {
+ jbyte *card = *batch_card;
+ MemRegion mr = *batch_region;
+ MemRegion *next_region = batch_region + 1;
+
+ if (next_region != region_end) {
+ MemRegion next_region_val = *next_region;
+ // Prefetch interval in batch
+ Prefetch::read(next_region_val.start(), next_region_val.byte_size());
+ } else if (*end_card == CardTableModRefBS::clean_card_val()) {
+ // Prefetch broken interval to next batch
+ Prefetch::read(end_prefetch, interval);
+ }
+
+ g1rs->refine_card_buffered(card, _worker_i, /*check_for_cset_refs*/ false, mr);
+
+ batch_region = next_region;
+ }
+
+ // Advance past the ORIGINAL (unjoined) sub-batch extent.
+ current_region += card_batch_size;
+ current_card += card_batch_size;
+ }
+}
+
+// Flush driver.  Queues the current buffer (starting its asynchronous
+// global synchronization in pre_sync()) and then completes as many
+// previously queued buffers as possible.  `hard` forces everything to
+// finish; a soft flush leaves unsynchronized buffers on the queue and
+// escalates their urgency as misses accumulate.
+void BufferedRefineCardTableEntryClosure::general_flush(bool hard) {
+ if (_index == 0) {
+ assert(hard, "invariant");
+ if (_async_buffers == 0) return;
+ }
+
+ // 1. Start asynchronous synchronization for the current buffer
+ if (_current_buffer == NULL) _current_buffer = new CardBuffer();
+ _current_buffer->_length = _index;
+ if (pre_sync(_current_buffer, hard) || hard) {
+ // append async buffer
+ CardBuffer *tail = _tail_buffer;
+ if (tail != NULL) tail->_next = _current_buffer;
+ _tail_buffer = _current_buffer;
+ if (_head_buffer == NULL) _head_buffer = _current_buffer;
+ if (hard) sync(_current_buffer, hard);
+ _current_buffer = NULL;
+ _async_buffers++;
+ }
+
+ _index = 0;
+
+ // 2. Process old batches that have been cleaned but couldn't synchronize (async completion)
+ CardBuffer *current = _head_buffer;
+ bool check_sync = true;  // NOTE(review): never read -- dead local
+ while (current != NULL) {
+ // NOTE(review): on a hard flush the short-circuit below skips sync()
+ // for previously queued buffers (only the current buffer was synced
+ // above) -- confirm their global handshake is guaranteed complete
+ // before post_sync() runs.
+ if (hard || sync(current, hard)) {
+ post_sync(current);
+ CardBuffer *next = current->_next;
+ _head_buffer = next;
+ if (next == NULL) _tail_buffer = NULL;
+ delete current;
+ current = next;
+ _async_buffers--;
+ } else {
+ current->_misses++;
+ // Escalation thresholds: && binds tighter than ||, so each
+ // (queue-length, miss-count) pair is tested as a unit.
+ if (_async_buffers > 4 && current->_misses > 2
+ || _async_buffers > 8 && current->_misses > 4
+ || _async_buffers > 16 && current->_misses > 6) {
+ current->_gs->increase_urgency();
+ }
+ break;
+ }
+ }
+}
class RedirtyLoggedCardTableEntryClosure : public CardTableEntryClosure {
private:
size_t _num_processed;
@@ -1917,11 +2178,11 @@
_ref_processor_cm(NULL),
_ref_processor_stw(NULL),
_bot_shared(NULL),
_cg1r(NULL),
_g1mm(NULL),
- _refine_cte_cl(NULL),
+ _refine_cte_cl_concurrency(true),
_secondary_free_list("Secondary Free List", new SecondaryFreeRegionListMtSafeChecker()),
_old_set("Old Set", false /* humongous */, new OldRegionSetMtSafeChecker()),
_humongous_set("Master Humongous Set", true /* humongous */, new HumongousRegionSetMtSafeChecker()),
_humongous_reclaim_candidates(),
_has_humongous_reclaim_candidates(false),
@@ -2030,13 +2291,11 @@
// Ensure that the sizes are properly aligned.
Universe::check_alignment(init_byte_size, HeapRegion::GrainBytes, "g1 heap");
Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap");
Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap");
- _refine_cte_cl = new RefineCardTableEntryClosure();
-
- _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl);
+ _cg1r = new ConcurrentG1Refine(this);
// Reserve the maximum.
// When compressed oops are enabled, the preferred heap base
// is calculated by subtracting the requested size from the
@@ -2156,28 +2415,28 @@
JavaThread::satb_mark_queue_set().initialize(SATB_Q_CBL_mon,
SATB_Q_FL_lock,
G1SATBProcessCompletedThreshold,
Shared_SATB_Q_lock);
- JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl,
+ JavaThread::dirty_card_queue_set().initialize(true,
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
concurrent_g1_refine()->yellow_zone(),
concurrent_g1_refine()->red_zone(),
Shared_DirtyCardQ_lock);
- dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code
+ dirty_card_queue_set().initialize(false, // Should never be called by the Java code
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
-1, // never trigger processing
-1, // no limit on length
Shared_DirtyCardQ_lock,
&JavaThread::dirty_card_queue_set());
// Initialize the card queue set used to hold cards containing
// references into the collection set.
- _into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code
+ _into_cset_dirty_card_queue_set.initialize(false, // Should never be called by the Java code
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
-1, // never trigger processing
-1, // no limit on length
Shared_DirtyCardQ_lock,
@@ -6379,11 +6638,15 @@
"value: " SIZE_FORMAT " recalculated: " SIZE_FORMAT,
used_unlocked(), recalculate_used()));
}
void G1CollectedHeap::set_refine_cte_cl_concurrency(bool concurrent) {
- _refine_cte_cl->set_concurrent(concurrent);
+ // The concurrency flag now lives on the heap instead of on a closure.
+ _refine_cte_cl_concurrency = concurrent;
+}
+
+// Queried by the refinement closures to decide whether they should
+// yield to a pending suspension request.
+bool G1CollectedHeap::refine_cte_cl_concurrency() {
+ return _refine_cte_cl_concurrency;
}
bool G1CollectedHeap::is_in_closed_subset(const void* p) const {
HeapRegion* hr = heap_region_containing(p);
return hr->is_in(p);
< prev index next >