--- old/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-16 01:05:07.811777407 -0500 +++ new/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-16 01:05:07.367753437 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,7 +36,6 @@ #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" -#include "runtime/mutexLocker.hpp" #include "runtime/orderAccess.hpp" #include "runtime/thread.inline.hpp" #include "utilities/macros.hpp" @@ -59,7 +58,7 @@ _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize), _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize), _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator), - _dirty_card_queue_set(DirtyCardQ_CBL_mon, &_dirty_card_queue_buffer_allocator), + _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator), _shared_dirty_card_queue(&_dirty_card_queue_set) {} --- old/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-16 01:05:10.055898554 -0500 +++ new/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-16 01:05:09.627875448 -0500 @@ -2778,8 +2778,6 @@ Threads::threads_do(&count_from_threads); G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.verify_num_cards(); - return dcqs.num_cards() + count_from_threads._cards; } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-16 01:05:12.436027044 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-16 01:05:12.056006529 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -89,6 +89,11 @@ } } } + + if (num_max_threads > 0) { + G1BarrierSet::dirty_card_queue_set().set_primary_refinement_thread(_threads[0]); + } + return JNI_OK; } @@ -108,7 +113,7 @@ _threads[worker_id] = create_refinement_thread(worker_id, false); thread_to_activate = _threads[worker_id]; } - if (thread_to_activate != NULL && !thread_to_activate->is_active()) { + if (thread_to_activate != NULL) { thread_to_activate->activate(); } } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-16 01:05:14.068115151 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-16 01:05:13.692094852 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -29,9 +29,8 @@ #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/shared/suspendibleThreadSet.hpp" #include "logging/log.hpp" -#include "memory/resourceArea.hpp" -#include "runtime/handles.inline.hpp" -#include "runtime/mutexLocker.hpp" +#include "runtime/atomic.hpp" +#include "runtime/thread.hpp" G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : ConcurrentGCThread(), @@ -40,56 +39,59 @@ _total_refinement_time(), _total_refined_cards(0), _worker_id(worker_id), - _active(false), - _monitor(NULL), + _notifier(new Semaphore(0)), + _should_notify(true), _cr(cr) { - // Each thread has its own monitor. The i-th thread is responsible for signaling - // to thread i+1 if the number of buffers in the queue exceeds a threshold for this - // thread. Monitors are also used to wake up the threads during termination. - // The 0th (primary) worker is notified by mutator threads and has a special monitor. - if (!is_primary()) { - _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true, - Monitor::_safepoint_check_never); - } else { - _monitor = DirtyCardQ_CBL_mon; - } - // set name set_name("G1 Refine#%d", worker_id); create_and_start(); } +// _notifier and _should_notify form a single-reader / multi-writer +// notification mechanism. The thread is the single reader. The writers +// are (other) threads that call activate() on the thread. + void G1ConcurrentRefineThread::wait_for_completed_buffers() { - MonitorLocker ml(_monitor, Mutex::_no_safepoint_check_flag); - while (!should_terminate() && !is_active()) { - ml.wait(); + assert(this == Thread::current(), "precondition"); + while (Atomic::load_acquire(&_should_notify)) { + _notifier->wait(); } } -bool G1ConcurrentRefineThread::is_active() { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - return is_primary() ? dcqs.process_completed_buffers() : _active; -} - void G1ConcurrentRefineThread::activate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(true); - } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(true); + assert(this != Thread::current(), "precondition"); + // Notify iff transitioning from needing activation to not. This helps + // keep the semaphore count bounded and minimizes the work done by + // activators when the thread is already active. + if (Atomic::load_acquire(&_should_notify) && + Atomic::cmpxchg(&_should_notify, true, false)) { + _notifier->signal(); } - _monitor->notify(); } -void G1ConcurrentRefineThread::deactivate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(false); +// Called when no refinement work found for this thread. +// Returns true if should deactivate. +bool G1ConcurrentRefineThread::maybe_deactivate(bool more_work) { + assert(this == Thread::current(), "precondition"); + + if (more_work) { + // Suppress unnecessary notifications. + Atomic::release_store(&_should_notify, false); + return false; + } else if (Atomic::load_acquire(&_should_notify)) { + // Deactivate if no notifications since enabled (see below). 
+ return true; } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(false); + // Try for more refinement work with notifications enabled, to close + // race; there could be a plethora of suppressed activation attempts + // after we found no work but before we enable notifications here + // (so there could be lots of work for this thread to do), followed + // by a long time without activation after enabling notifications. + // But first, clear any pending signals to prevent accumulation. + while (_notifier->trywait()) {} + Atomic::release_store(&_should_notify, true); + return false; } } @@ -119,14 +121,13 @@ } Ticks start_time = Ticks::now(); - if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) { - break; // No cards to process. - } + bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards); _total_refinement_time += (Ticks::now() - start_time); + + if (maybe_deactivate(more_work)) break; } } - deactivate(); log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT ", refined cards: " SIZE_FORMAT ", total refined cards: " SIZE_FORMAT, @@ -146,6 +147,5 @@ } void G1ConcurrentRefineThread::stop_service() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - _monitor->notify(); + activate(); } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-16 01:05:15.656200883 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-16 01:05:15.324182959 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -45,24 +45,25 @@ uint _worker_id; - bool _active; - Monitor* _monitor; + // Each thread has its own semaphore. The i-th thread is responsible for + // signaling thread i+1 if the number of buffers in the queue exceeds a + // threshold for that thread. Semaphores are also used to wake up the + // threads during termination. The 0th (primary) worker is notified by + // mutator threads. + Semaphore* _notifier; + volatile bool _should_notify; + bool maybe_deactivate(bool more_work); + G1ConcurrentRefine* _cr; void wait_for_completed_buffers(); - void set_active(bool x) { _active = x; } - // Deactivate this thread. - void deactivate(); - - bool is_primary() { return (_worker_id == 0); } + virtual void run_service(); + virtual void stop_service(); - void run_service(); - void stop_service(); public: G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id); - bool is_active(); // Activate this thread. void activate(); --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-16 01:05:17.368293309 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-16 01:05:16.968271714 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -26,6 +26,7 @@ #include "gc/g1/g1BufferNodeList.hpp" #include "gc/g1/g1CardTableEntryClosure.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1ConcurrentRefineThread.hpp" #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1FreeIdSet.hpp" #include "gc/g1/g1RedirtyCardsQueue.hpp" @@ -42,7 +43,10 @@ #include "runtime/safepoint.hpp" #include "runtime/thread.inline.hpp" #include "runtime/threadSMR.hpp" +#include "utilities/globalCounter.inline.hpp" +#include "utilities/macros.hpp" #include "utilities/quickSort.hpp" +#include G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) : // Dirty card queues are always active, so we create them with their @@ -54,6 +58,8 @@ flush(); } +BufferNode* const NULL_buffer = NULL; + void G1DirtyCardQueue::handle_completed_buffer() { assert(_buf != NULL, "precondition"); BufferNode* node = BufferNode::make_node_from_buffer(_buf, index()); @@ -68,15 +74,15 @@ // Assumed to be zero by concurrent threads. static uint par_ids_start() { return 0; } -G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon, - BufferNode::Allocator* allocator) : +G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) : PtrQueueSet(allocator), - _cbl_mon(cbl_mon), - _completed_buffers_head(NULL), - _completed_buffers_tail(NULL), + _primary_refinement_thread(NULL), + _completed_buffers_head(NULL_buffer), + _completed_buffers_tail(NULL_buffer), _num_cards(0), + DEBUG_ONLY(_concurrency(0) COMMA) + _paused(), _process_cards_threshold(ProcessCardsThresholdNever), - _process_completed_buffers(false), _max_cards(MaxCardsUnlimited), _max_cards_padding(0), _free_ids(par_ids_start(), num_par_ids()), @@ -108,123 +114,324 @@ G1ThreadLocalData::dirty_card_queue(t).handle_zero_index(); } -void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - cbn->set_next(NULL); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = cbn; - _completed_buffers_tail = cbn; +// _concurrency is an int that is used in debug-only context to verify +// we're not overlapping queue operations that support concurrency with +// those which don't. The value is initially zero, meaning there are no +// relevant operations in progress. A "no concurrency" context is entered +// by atomically changing the value from 0 to -1, with an assert on failure. +// It is similarly exited by reverting the value back to 0. A "concurrent" +// context is entered by atomically incrementing the value and verifying the +// result is greater than zero (so we weren't in a "no concurrency" context). +// It is similarly exited by atomically decrementing the value and verifying +// the result is at least zero (so no mismatches). +// +// ConcurrentVerifier and NonconcurrentVerifier are helper classes to +// establish and remove such contexts. 
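// (Reviewer aid, not part of the patch.) A minimal standalone sketch of the
// debug-only protocol described above, using std::atomic and <cassert> in
// place of HotSpot's Atomic:: wrappers and assert() machinery; all names in
// this sketch are illustrative only. The patch's ConcurrentVerifier and
// NonconcurrentVerifier RAII helpers, which follow, implement the same idea
// against the _concurrency member.
#include <atomic>
#include <cassert>

// 0 = no relevant operation in progress, >0 = that many concurrent
// operations, -1 = one exclusive ("no concurrency") operation.
static std::atomic<int> concurrency_sketch{0};

struct ConcurrentScopeSketch {
  ConcurrentScopeSketch() {
    int v = concurrency_sketch.fetch_add(1) + 1;
    assert(v > 0 && "overlaps an exclusive operation"); (void)v;
  }
  ~ConcurrentScopeSketch() {
    int v = concurrency_sketch.fetch_sub(1) - 1;
    assert(v >= 0 && "unbalanced concurrent scope"); (void)v;
  }
};

struct ExclusiveScopeSketch {
  ExclusiveScopeSketch() {
    int expected = 0;
    bool ok = concurrency_sketch.compare_exchange_strong(expected, -1);
    assert(ok && "entered while other operations in progress"); (void)ok;
  }
  ~ExclusiveScopeSketch() {
    int expected = -1;
    bool ok = concurrency_sketch.compare_exchange_strong(expected, 0);
    assert(ok && "unbalanced exclusive scope"); (void)ok;
  }
};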
+ +class G1DirtyCardQueueSet::ConcurrentVerifier : public StackObj { +#ifdef ASSERT + const G1DirtyCardQueueSet* _dcqs; + +public: + ~ConcurrentVerifier() { + assert(Atomic::sub(&_dcqs->_concurrency, 1) >= 0, "invariant"); + } +#endif // ASSERT + +public: + ConcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) { + assert(Atomic::add(&_dcqs->_concurrency, 1) > 0, "invariant"); + } +}; + +class G1DirtyCardQueueSet::NonconcurrentVerifier : public StackObj { +#ifdef ASSERT + const G1DirtyCardQueueSet* _dcqs; + +public: + ~NonconcurrentVerifier() { + assert(Atomic::cmpxchg(&_dcqs->_concurrency, -1, 0) == -1, "invariant"); + } +#endif // ASSERT + +public: + NonconcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) { + assert(Atomic::cmpxchg(&_dcqs->_concurrency, 0, -1) == 0, "invariant"); + } +}; + +// _completed_buffers_{head,tail} and _num_cards provide a lock-free FIFO +// of buffers, linked through their next() fields. +// +// The key idea to make this work is that pop (get_completed_buffer) never +// returns an element of the queue if it is the only accessible element, +// e.g. its "next" value is NULL. It is expected that there will be a +// later push/append that will make that element available to a future pop, +// or there will eventually be a complete transfer (take_all_completed_buffers). +// +// An append operation atomically exchanges the new tail with the queue tail. +// It then sets the "next" value of the old tail to the head of the list being +// appended. (It is an invariant that the old tail's "next" value is NULL.) +// But if the old tail is NULL then the queue was empty. In this case the +// head of the list being appended is instead stored in the queue head (which +// must be NULL). +// +// A push operation is just a degenerate append, where the buffer being pushed +// is both the head and the tail of the list being appended. +// +// This means there is a period between the exchange and the old tail update +// where the queue sequence is split into two parts, the list from the queue +// head to the old tail, and the list being appended. If there are concurrent +// push/append operations, each may introduce another such segment. But they +// all eventually get resolved by their respective updates of their old tail's +// "next" value. +// +// pop gets the queue head as the candidate result (returning NULL if the +// queue head was NULL), and then gets that result node's "next" value. If +// that "next" value is NULL and the queue head hasn't changed, then there +// is only one element in the (accessible) list. We can't return that +// element, because it may be the old tail of a concurrent push/append. So +// return NULL in this case. Otherwise, attempt to cmpxchg that "next" +// value into the queue head, retrying the whole operation if that fails. +// This is the "usual" lock-free pop from head of slist, with the additional +// restriction on taking the last element. +// +// In order to address the ABA problem for pop, a pop operation protects its +// access to the head of the list with a GlobalCounter critical section. This +// works with the buffer allocator's use of GlobalCounter synchronization to +// prevent ABA from arising in the normal buffer usage cycle. The paused +// buffer handling prevents another ABA source (see record_paused_buffer and +// enqueue_previous_paused_buffers). 
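// (Reviewer aid, not part of the patch.) A minimal standalone sketch of the
// append/pop protocol described above, using std::atomic rather than
// HotSpot's Atomic:: wrappers and a simplified node type; names are
// illustrative only. The GlobalCounter critical sections, card accounting,
// and paused-buffer handling of the real append_buffers / get_completed_buffer
// below are deliberately omitted, so this sketch by itself is still exposed
// to ABA; it only illustrates the "never take the last accessible element"
// rule.
#include <atomic>

struct NodeSketch {
  std::atomic<NodeSketch*> next{nullptr};
};

class QueueSketch {
  std::atomic<NodeSketch*> _head{nullptr};
  std::atomic<NodeSketch*> _tail{nullptr};

public:
  // Append a pre-linked list [first, last]; last->next must be null.
  void append(NodeSketch* first, NodeSketch* last) {
    NodeSketch* old_tail = _tail.exchange(last);
    if (old_tail == nullptr) {
      _head.store(first);           // queue was empty; publish the head
    } else {
      old_tail->next.store(first);  // splice; the queue is briefly split here
    }
  }

  // Pop the head, but never the last accessible element, since it may be the
  // old tail of an in-progress append. Returns null if nothing can be taken.
  NodeSketch* try_pop() {
    while (true) {
      NodeSketch* result = _head.load();
      if (result == nullptr) return nullptr;        // empty
      NodeSketch* next = result->next.load();
      if (next != nullptr) {
        if (_head.compare_exchange_strong(result, next)) {
          result->next.store(nullptr);
          return result;                            // took a non-last node
        }
        // cmpxchg failed: head changed; retry.
      } else if (result == _head.load()) {
        return nullptr;  // only one accessible element; leave it for later
      }
      // Head changed since we read it; retry.
    }
  }
};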
+ +size_t G1DirtyCardQueueSet::append_buffers(BufferNode* first, + BufferNode* last, + size_t card_count) { + assert(last->next() == NULL_buffer, "precondition"); + ConcurrentVerifier cv(this); + // Increment _num_cards before adding to queue, so queue removal doesn't + // need to deal with _num_cards possibly going negative. + size_t new_num_cards = Atomic::add(&_num_cards, card_count); + BufferNode* old_tail = Atomic::xchg(&_completed_buffers_tail, last); + if (old_tail == NULL_buffer) { // Empty list. + assert(Atomic::load(&_completed_buffers_head) == NULL_buffer, "invariant"); + Atomic::store(&_completed_buffers_head, first); } else { - _completed_buffers_tail->set_next(cbn); - _completed_buffers_tail = cbn; + assert(old_tail->next() == NULL_buffer, "invariant"); + old_tail->set_next(first); } - _num_cards += buffer_size() - cbn->index(); + return new_num_cards; +} - if (!process_completed_buffers() && - (num_cards() > process_cards_threshold())) { - set_process_completed_buffers(true); - ml.notify_all(); +void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { + assert(cbn != NULL_buffer, "precondition"); + size_t new_num_cards = append_buffers(cbn, cbn, buffer_size() - cbn->index()); + if ((_primary_refinement_thread != NULL) && + (new_num_cards > process_cards_threshold())) { + _primary_refinement_thread->activate(); } - verify_num_cards(); } BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); + enqueue_previous_paused_buffers(); - if (num_cards() <= stop_at) { - return NULL; - } + ConcurrentVerifier cv(this); - assert(num_cards() > 0, "invariant"); - assert(_completed_buffers_head != NULL, "invariant"); - assert(_completed_buffers_tail != NULL, "invariant"); - - BufferNode* bn = _completed_buffers_head; - _num_cards -= buffer_size() - bn->index(); - _completed_buffers_head = bn->next(); - if (_completed_buffers_head == NULL) { - assert(num_cards() == 0, "invariant"); - _completed_buffers_tail = NULL; - set_process_completed_buffers(false); + // Check for insufficient cards to satisfy request. We only do this once, + // up front, rather than on each iteration below, since the test is racy + // regardless of when we do it. + if (Atomic::load_acquire(&_num_cards) <= stop_at) { + return NULL_buffer; + } + + Thread* current_thread = Thread::current(); + + while (true) { + // Use a critical section per iteration, rather than over the whole + // operation. We're not guaranteed to make progress, because of possible + // contention on the queue head. Lingering in one CS the whole time could + // lead to excessive allocation of buffers, because the CS blocks return + // of released buffers to the free list for reuse. + GlobalCounter::CriticalSection cs(current_thread); + + BufferNode* result = Atomic::load_acquire(&_completed_buffers_head); + // Check for empty queue. Only needs to be done on first iteration, + // since we never take the last element, but it's messy to make use + // of that and we expect one iteration to be the common case. + if (result == NULL_buffer) return result; + + BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result)); + if (next != NULL_buffer) { + next = Atomic::cmpxchg(&_completed_buffers_head, result, next); + if (next == result) { + // Former head successfully taken; it is not the last.
+ assert(Atomic::load(&_completed_buffers_tail) != result, "invariant"); + assert(result->next() != NULL_buffer, "invariant"); + result->set_next(NULL_buffer); + Atomic::sub(&_num_cards, buffer_size() - result->index()); + return result; + } + // cmpxchg failed; try again. + } else if (result == Atomic::load_acquire(&_completed_buffers_head)) { + // If follower of head is NULL and head hasn't changed, then only + // the one element is currently accessible. We don't take the last + // accessible element, because there may be a concurrent add using it. + // The check for unchanged head isn't needed for correctness, but the + // retry on change may sometimes let us get a buffer after all. + return NULL_buffer; + } + // Head changed; try again. } - verify_num_cards(); - bn->set_next(NULL); - return bn; + // Unreachable } #ifdef ASSERT void G1DirtyCardQueueSet::verify_num_cards() const { + NonconcurrentVerifier ncv(this); size_t actual = 0; - BufferNode* cur = _completed_buffers_head; - while (cur != NULL) { + BufferNode* cur = Atomic::load(&_completed_buffers_head); + for ( ; cur != NULL_buffer; cur = cur->next()) { actual += buffer_size() - cur->index(); - cur = cur->next(); } - assert(actual == _num_cards, + assert(actual == Atomic::load(&_num_cards), "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT, - _num_cards, actual); + Atomic::load(&_num_cards), actual); } -#endif +#endif // ASSERT -void G1DirtyCardQueueSet::abandon_completed_buffers() { - BufferNode* buffers_to_delete = NULL; - { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - buffers_to_delete = _completed_buffers_head; - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; - set_process_completed_buffers(false); +// Refinement processing stops early if there is a pending safepoint, to +// avoid long delays to safepoint. We need to record the partially +// processed buffer for later continued processing. However, we can't +// simply add it back to the completed buffer queue, as that would introduce +// a new source of ABA for the queue. Instead, we have a pair of buffer +// lists (with each list represented by head and tail), one for each of the +// previous and next safepoints (*). When pausing the processing of a +// buffer for a safepoint, we add the buffer (lock free) to the list for the +// next safepoint. Before attempting to obtain a buffer from the queue we +// first transfer any buffers in the previous safepoint list to the queue. +// This is safe (doesn't introduce ABA) because threads cannot be in the +// midst of a queue pop across a safepoint. +// +// These paused buffer lists are conceptually an extension of the queue, and +// operations which need to deal with all of the queued buffers (such as +// concatenate_logs) also need to deal with any paused buffers. In general, +// if the safepoint performs a GC then the paused buffers will be processed +// as part of it and both lists will be empty afterward. +// +// An alternative would be to directly reenqueue a paused buffer, but only +// after first calling GlobalCounter::write_synchronize. However, that +// might noticeably delay the pending safepoint. +// +// A single paused list and a safepoint cleanup action to perform the transfer +// doesn't work because cleanup actions are not invoked for every safepoint. +// +// (*) If the safepoint does not perform a GC, the next list becomes the +// previous list after the safepoint. 
Since buffers are only added to the +// next list if there were threads performing refinement work, there will +// likely be refinement work done after the safepoint, which will transfer +// those buffers to the queue. However, multiple non-GC safepoints in +// succession, without intervening refinement work to perform a transfer +// (possibly through lack of time), can result in old buffers being present +// and inaccessible in the next list. This doesn't affect correctness, but +// might affect performance. The alternatives discussed above don't have +// this problem, but have problems of their own. + +static size_t next_paused_buffer_list_index() { + return SafepointSynchronize::safepoint_id() & 1; +} + +static size_t previous_paused_buffer_list_index() { + return next_paused_buffer_list_index() ^ 1; +} + +void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) { + assert_not_at_safepoint(); + assert(node->next() == NULL_buffer, "precondition"); + size_t next_index = next_paused_buffer_list_index(); + // Cards for paused buffers are included in count, to contribute to + // notification checking after the coming safepoint if it doesn't GC. + Atomic::add(&_num_cards, buffer_size() - node->index()); + BufferNode* old_head = Atomic::xchg(&_paused[next_index]._head, node); + if (old_head == NULL_buffer) { + assert(_paused[next_index]._tail == NULL, "invariant"); + _paused[next_index]._tail = node; + } else { + node->set_next(old_head); } - while (buffers_to_delete != NULL) { +} + +void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(size_t index) { + if (Atomic::load(&_paused[index]._head) != NULL_buffer) { + BufferNode* paused = Atomic::xchg(&_paused[index]._head, NULL_buffer); + if (paused != NULL_buffer) { + BufferNode* tail = _paused[index]._tail; + assert(tail != NULL, "invariant"); + _paused[index]._tail = NULL_buffer; + append_buffers(paused, tail, 0); // Cards already counted when recorded. + } + } +} + +void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() { + size_t previous_index = previous_paused_buffer_list_index(); + enqueue_paused_buffers_aux(previous_index); +} + +void G1DirtyCardQueueSet::enqueue_all_paused_buffers() { + assert_at_safepoint(); + for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) { + enqueue_paused_buffers_aux(i); + } +} + +void G1DirtyCardQueueSet::clear_completed_buffers() { + Atomic::store(&_completed_buffers_head, NULL_buffer); + Atomic::store(&_completed_buffers_tail, NULL_buffer); + Atomic::store(&_num_cards, size_t(0)); +} + +void G1DirtyCardQueueSet::abandon_completed_buffers() { + enqueue_all_paused_buffers(); + verify_num_cards(); + NonconcurrentVerifier ncv(this); + BufferNode* buffers_to_delete = Atomic::load(&_completed_buffers_head); + clear_completed_buffers(); + while (buffers_to_delete != NULL_buffer) { BufferNode* bn = buffers_to_delete; buffers_to_delete = bn->next(); - bn->set_next(NULL); + bn->set_next(NULL_buffer); deallocate_buffer(bn); } } void G1DirtyCardQueueSet::notify_if_necessary() { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (num_cards() > process_cards_threshold()) { - set_process_completed_buffers(true); - ml.notify_all(); + if ((_primary_refinement_thread != NULL) && + (num_cards() > process_cards_threshold())) { + _primary_refinement_thread->activate(); } } -// Merge lists of buffers. Notify the processing threads. -// The source queue is emptied as a result. The queues -// must share the monitor. +// Merge lists of buffers. The source queue set is emptied as a +// result. 
The queue sets must share the same allocator. void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { assert(allocator() == src->allocator(), "precondition"); const G1BufferNodeList from = src->take_all_completed_buffers(); - if (from._head == NULL) return; - - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = from._head; - _completed_buffers_tail = from._tail; - } else { - assert(_completed_buffers_head != NULL, "Well formedness"); - _completed_buffers_tail->set_next(from._head); - _completed_buffers_tail = from._tail; - } - _num_cards += from._entry_count; - - assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL || - _completed_buffers_head != NULL && _completed_buffers_tail != NULL, - "Sanity"); - verify_num_cards(); + if (from._head == NULL_buffer) return; + append_buffers(from._head, from._tail, from._entry_count); } G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards); - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; +#ifdef ASSERT + for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) { + assert(Atomic::load(&_paused[i]._head) == NULL_buffer, "precondition"); + assert(Atomic::load(&_paused[i]._tail) == NULL_buffer, "precondition"); + } +#endif // ASSERT + verify_num_cards(); + NonconcurrentVerifier ncv(this); + G1BufferNodeList result(Atomic::load(&_completed_buffers_head), + Atomic::load(&_completed_buffers_tail), + Atomic::load(&_num_cards)); + clear_completed_buffers(); return result; } @@ -399,7 +606,7 @@ size_t stop_at, size_t* total_refined_cards) { BufferNode* node = get_completed_buffer(stop_at); - if (node == NULL) { + if (node == NULL_buffer) { return false; } else if (refine_buffer(node, worker_id, total_refined_cards)) { assert_fully_consumed(node, buffer_size()); @@ -407,14 +614,15 @@ deallocate_buffer(node); return true; } else { - // Return partially processed buffer to the queue. - enqueue_completed_buffer(node); + // Buffer incompletely processed because there is a pending safepoint. + // Record partially processed buffer, to be finished later. + record_paused_buffer(node); return true; } } void G1DirtyCardQueueSet::abandon_logs() { - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); abandon_completed_buffers(); // Since abandon is done only at safepoints, we can safely manipulate @@ -433,7 +641,7 @@ // Iterate over all the threads, if we find a partial log add it to // the global list of logs. Temporarily turn off the limit on the number // of outstanding buffers. - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); size_t old_limit = max_cards(); set_max_cards(MaxCardsUnlimited); @@ -448,5 +656,7 @@ Threads::threads_do(&closure); G1BarrierSet::shared_dirty_card_queue().flush(); + enqueue_all_paused_buffers(); + verify_num_cards(); set_max_cards(old_limit); } --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-16 01:05:19.052384223 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-16 01:05:18.644362197 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. 
All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,11 +29,12 @@ #include "gc/g1/g1FreeIdSet.hpp" #include "gc/shared/ptrQueue.hpp" #include "memory/allocation.hpp" +#include "memory/padded.hpp" +class G1ConcurrentRefineThread; class G1DirtyCardQueueSet; class G1RedirtyCardsQueueSet; class Thread; -class Monitor; // A ptrQueue whose elements are "oops", pointers to object heads. class G1DirtyCardQueue: public PtrQueue { @@ -66,18 +67,45 @@ }; class G1DirtyCardQueueSet: public PtrQueueSet { - Monitor* _cbl_mon; // Protects the list and count members. - BufferNode* _completed_buffers_head; - BufferNode* _completed_buffers_tail; - - // Number of actual cards in the list of completed buffers. + G1ConcurrentRefineThread* _primary_refinement_thread; + // Add padding for improved performance for shared access. There's only + // one instance of this class, so using a little extra space is fine. + // _completed_buffers_{head,tail} and _num_cards are isolated to their + // own cache lines. Other members are not, even if shared access, because + // they aren't as critical to performance. + DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(G1ConcurrentRefineThread*)); + BufferNode* volatile _completed_buffers_head; + DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode* volatile)); + BufferNode* volatile _completed_buffers_tail; + DEFINE_PAD_MINUS_SIZE(3, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode* volatile)); volatile size_t _num_cards; + DEFINE_PAD_MINUS_SIZE(4, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile size_t)); - size_t _process_cards_threshold; - volatile bool _process_completed_buffers; + DEBUG_ONLY(mutable volatile int _concurrency;) + class ConcurrentVerifier; + class NonconcurrentVerifier; + + size_t append_buffers(BufferNode* first, BufferNode* last, size_t card_count); + // Verify _num_cards == sum of cards in the completed queue. + void verify_num_cards() const NOT_DEBUG_RETURN; + + struct PauseList { + BufferNode* volatile _head; + BufferNode* _tail; + PauseList() : _head(NULL), _tail(NULL) {} + }; + PauseList _paused[2]; + + void record_paused_buffer(BufferNode* node); + void enqueue_paused_buffers_aux(size_t index); + void enqueue_previous_paused_buffers(); + void enqueue_all_paused_buffers(); + void clear_completed_buffers(); void abandon_completed_buffers(); + size_t _process_cards_threshold; + // Refine the cards in "node" from its index to buffer_size. // Stops processing if SuspendibleThreadSet::should_yield() is true. // Returns true if the entire buffer was processed, false if there @@ -103,9 +131,13 @@ size_t* _mutator_refined_cards_counters; public: - G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator); + G1DirtyCardQueueSet(BufferNode::Allocator* allocator); ~G1DirtyCardQueueSet(); + void set_primary_refinement_thread(G1ConcurrentRefineThread* thread) { + _primary_refinement_thread = thread; + } + // The number of parallel ids that can be claimed to allow collector or // mutator threads to do card-processing work. static uint num_par_ids(); @@ -126,13 +158,6 @@ // The number of cards in completed buffers. Read without synchronization. size_t num_cards() const { return _num_cards; } - // Verify that _num_cards is equal to the sum of actual cards - // in the completed buffers. 
- void verify_num_cards() const NOT_DEBUG_RETURN; - - bool process_completed_buffers() { return _process_completed_buffers; } - void set_process_completed_buffers(bool x) { _process_completed_buffers = x; } - // Get/Set the number of cards that triggers log processing. // Log processing should be done when the number of cards exceeds the // threshold. --- old/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-16 01:05:20.864482048 -0500 +++ new/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-16 01:05:20.420458078 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -210,8 +210,6 @@ return offset_of(BufferNode, _buffer); } - static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } - // Allocate a new BufferNode with the "buffer" having size elements. static BufferNode* allocate(size_t size); @@ -219,6 +217,7 @@ static void deallocate(BufferNode* node); public: + static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } typedef LockFreeStack Stack; BufferNode* next() const { return _next; } --- old/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-16 01:05:22.640577929 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-16 01:05:22.276558278 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -73,7 +73,6 @@ Monitor* STS_lock = NULL; Monitor* FullGCCount_lock = NULL; Monitor* G1OldGCCount_lock = NULL; -Monitor* DirtyCardQ_CBL_mon = NULL; Mutex* Shared_DirtyCardQ_lock = NULL; Mutex* MarkStackFreeList_lock = NULL; Mutex* MarkStackChunkList_lock = NULL; @@ -211,7 +210,6 @@ if (UseG1GC) { def(G1OldGCCount_lock , PaddedMonitor, leaf, true, _safepoint_check_always); - def(DirtyCardQ_CBL_mon , PaddedMonitor, access, true, _safepoint_check_never); def(Shared_DirtyCardQ_lock , PaddedMutex , access + 1, true, _safepoint_check_never); def(FreeList_lock , PaddedMutex , leaf , true, _safepoint_check_never); --- old/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-16 01:05:24.496678130 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-16 01:05:24.112657399 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -69,8 +69,6 @@ extern Monitor* STS_lock; // used for joining/leaving SuspendibleThreadSet. extern Monitor* FullGCCount_lock; // in support of "concurrent" full gc extern Monitor* G1OldGCCount_lock; // in support of "concurrent" full gc -extern Monitor* DirtyCardQ_CBL_mon; // Protects dirty card Q - // completed buffer queue. extern Mutex* Shared_DirtyCardQ_lock; // Lock protecting dirty card // queue shared by // non-Java threads.
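Reviewer aid (not part of the patch): the Semaphore / _should_notify activation protocol introduced in g1ConcurrentRefineThread.{hpp,cpp} above, restated as a minimal standalone C++20 sketch. It uses std::atomic and std::counting_semaphore in place of HotSpot's Atomic:: wrappers and Semaphore class; all names are illustrative only and the refinement work itself is elided.

#include <atomic>
#include <semaphore>

class RefineWorkerSketch {
  std::counting_semaphore<> _notifier{0};
  std::atomic<bool> _should_notify{true};

public:
  // Writer side: request that the worker run. Signals only on the
  // true -> false transition, so the semaphore count stays bounded and
  // redundant activations stay cheap.
  void activate() {
    bool expected = true;
    if (_should_notify.load(std::memory_order_acquire) &&
        _should_notify.compare_exchange_strong(expected, false)) {
      _notifier.release();
    }
  }

  // Reader side: block until some thread has called activate().
  void wait_for_work() {
    while (_should_notify.load(std::memory_order_acquire)) {
      _notifier.acquire();
    }
  }

  // Reader side: called after each refinement step; returns true if the
  // worker should go back to waiting.
  bool maybe_deactivate(bool more_work) {
    if (more_work) {
      // Keep running; suppress redundant notifications in the meantime.
      _should_notify.store(false, std::memory_order_release);
      return false;
    } else if (_should_notify.load(std::memory_order_acquire)) {
      // Notifications were re-enabled on a previous pass and nobody has
      // activated us since: deactivate.
      return true;
    } else {
      // Re-enable notifications and try once more, to close the race with
      // activators whose signals were suppressed; drain stale signals first.
      while (_notifier.try_acquire()) {}
      _should_notify.store(true, std::memory_order_release);
      return false;
    }
  }
};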