--- old/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-31 17:22:31.426541561 -0500 +++ new/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-31 17:22:31.142526282 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,7 +36,6 @@ #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" -#include "runtime/mutexLocker.hpp" #include "runtime/orderAccess.hpp" #include "runtime/thread.inline.hpp" #include "utilities/macros.hpp" @@ -59,7 +58,7 @@ _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize), _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize), _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator), - _dirty_card_queue_set(DirtyCardQ_CBL_mon, &_dirty_card_queue_buffer_allocator), + _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator), _shared_dirty_card_queue(&_dirty_card_queue_set) {} --- old/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-31 17:22:32.418594932 -0500 +++ new/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-31 17:22:32.126579222 -0500 @@ -2778,8 +2778,6 @@ Threads::threads_do(&count_from_threads); G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.verify_num_cards(); - return dcqs.num_cards() + count_from_threads._cards; } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-31 17:22:33.482652177 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-31 17:22:33.190636467 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -89,6 +89,11 @@ } } } + + if (num_max_threads > 0) { + G1BarrierSet::dirty_card_queue_set().set_primary_refinement_thread(_threads[0]); + } + return JNI_OK; } @@ -108,7 +113,7 @@ _threads[worker_id] = create_refinement_thread(worker_id, false); thread_to_activate = _threads[worker_id]; } - if (thread_to_activate != NULL && !thread_to_activate->is_active()) { + if (thread_to_activate != NULL) { thread_to_activate->activate(); } } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-31 17:22:34.478705763 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-31 17:22:34.202690914 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,9 +29,8 @@ #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/shared/suspendibleThreadSet.hpp" #include "logging/log.hpp" -#include "memory/resourceArea.hpp" -#include "runtime/handles.inline.hpp" -#include "runtime/mutexLocker.hpp" +#include "runtime/atomic.hpp" +#include "runtime/thread.hpp" G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : ConcurrentGCThread(), @@ -40,56 +39,53 @@ _total_refinement_time(), _total_refined_cards(0), _worker_id(worker_id), - _active(false), - _monitor(NULL), + _notifier(new Semaphore(0)), + _should_notify(true), _cr(cr) { - // Each thread has its own monitor. The i-th thread is responsible for signaling - // to thread i+1 if the number of buffers in the queue exceeds a threshold for this - // thread. Monitors are also used to wake up the threads during termination. - // The 0th (primary) worker is notified by mutator threads and has a special monitor. - if (!is_primary()) { - _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true, - Monitor::_safepoint_check_never); - } else { - _monitor = DirtyCardQ_CBL_mon; - } - // set name set_name("G1 Refine#%d", worker_id); create_and_start(); } void G1ConcurrentRefineThread::wait_for_completed_buffers() { - MonitorLocker ml(_monitor, Mutex::_no_safepoint_check_flag); - while (!should_terminate() && !is_active()) { - ml.wait(); + assert(this == Thread::current(), "precondition"); + while (Atomic::load_acquire(&_should_notify)) { + _notifier->wait(); } } -bool G1ConcurrentRefineThread::is_active() { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - return is_primary() ? dcqs.process_completed_buffers() : _active; -} - void G1ConcurrentRefineThread::activate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(true); - } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(true); + assert(this != Thread::current(), "precondition"); + // Notify iff transitioning from needing activation to not. This helps + // keep the semaphore count bounded and minimizes the work done by + // activators when the thread is already active. + if (Atomic::load_acquire(&_should_notify) && + Atomic::cmpxchg(&_should_notify, true, false)) { + _notifier->signal(); } - _monitor->notify(); } -void G1ConcurrentRefineThread::deactivate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(false); +bool G1ConcurrentRefineThread::maybe_deactivate(bool more_work) { + assert(this == Thread::current(), "precondition"); + + if (more_work) { + // Suppress unnecessary notifications. + Atomic::release_store(&_should_notify, false); + return false; + } else if (Atomic::load_acquire(&_should_notify)) { + // Deactivate if no notifications since enabled (see below). + return true; } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(false); + // Try for more refinement work with notifications enabled, to close + // race; there could be a plethora of suppressed activation attempts + // after we found no work but before we enable notifications here + // (so there could be lots of work for this thread to do), followed + // by a long time without activation after enabling notifications. + // But first, clear any pending signals to prevent accumulation. + while (_notifier->trywait()) {} + Atomic::release_store(&_should_notify, true); + return false; } } @@ -119,14 +115,13 @@ } Ticks start_time = Ticks::now(); - if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) { - break; // No cards to process. - } + bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards); _total_refinement_time += (Ticks::now() - start_time); + + if (maybe_deactivate(more_work)) break; } } - deactivate(); log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT ", refined cards: " SIZE_FORMAT ", total refined cards: " SIZE_FORMAT, @@ -146,6 +141,5 @@ } void G1ConcurrentRefineThread::stop_service() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - _monitor->notify(); + activate(); } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-31 17:22:35.458758488 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-31 17:22:35.186743854 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -45,24 +45,33 @@ uint _worker_id; - bool _active; - Monitor* _monitor; + // _notifier and _should_notify form a single-reader / multi-writer + // notification mechanism. The owning concurrent refinement thread is the + // single reader. The writers are (other) threads that call activate() on + // the thread. The i-th concurrent refinement thread is responsible for + // activating thread i+1 if the number of buffers in the queue exceeds a + // threshold for that i+1th thread. The 0th (primary) thread is activated + // by threads that add cards to the dirty card queue set when the primary + // thread's threshold is exceeded. activate() is also used to wake up the + // threads during termination, so even the non-primary thread case is + // multi-writer. + Semaphore* _notifier; + volatile bool _should_notify; + + // Called when no refinement work found for this thread. + // Returns true if should deactivate. + bool maybe_deactivate(bool more_work); + G1ConcurrentRefine* _cr; void wait_for_completed_buffers(); - void set_active(bool x) { _active = x; } - // Deactivate this thread. - void deactivate(); - - bool is_primary() { return (_worker_id == 0); } + virtual void run_service(); + virtual void stop_service(); - void run_service(); - void stop_service(); public: G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id); - bool is_active(); // Activate this thread. void activate(); --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-31 17:22:36.414809922 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-31 17:22:36.150795719 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -26,6 +26,7 @@ #include "gc/g1/g1BufferNodeList.hpp" #include "gc/g1/g1CardTableEntryClosure.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1ConcurrentRefineThread.hpp" #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1FreeIdSet.hpp" #include "gc/g1/g1RedirtyCardsQueue.hpp" @@ -33,15 +34,14 @@ #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/g1/heapRegionRemSet.hpp" #include "gc/shared/suspendibleThreadSet.hpp" -#include "gc/shared/workgroup.hpp" #include "memory/iterator.hpp" -#include "runtime/flags/flagSetting.hpp" -#include "runtime/mutexLocker.hpp" -#include "runtime/orderAccess.hpp" +#include "runtime/atomic.hpp" #include "runtime/os.hpp" #include "runtime/safepoint.hpp" #include "runtime/thread.inline.hpp" #include "runtime/threadSMR.hpp" +#include "utilities/globalCounter.inline.hpp" +#include "utilities/macros.hpp" #include "utilities/quickSort.hpp" G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) : @@ -68,18 +68,16 @@ // Assumed to be zero by concurrent threads. static uint par_ids_start() { return 0; } -G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon, - BufferNode::Allocator* allocator) : +G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) : PtrQueueSet(allocator), - _cbl_mon(cbl_mon), - _completed_buffers_head(NULL), - _completed_buffers_tail(NULL), + _primary_refinement_thread(NULL), _num_cards(0), + _completed(), + _paused(), + _free_ids(par_ids_start(), num_par_ids()), _process_cards_threshold(ProcessCardsThresholdNever), - _process_completed_buffers(false), _max_cards(MaxCardsUnlimited), _max_cards_padding(0), - _free_ids(par_ids_start(), num_par_ids()), _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC)) { ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t)); @@ -108,75 +106,304 @@ G1ThreadLocalData::dirty_card_queue(t).handle_zero_index(); } -void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - cbn->set_next(NULL); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = cbn; - _completed_buffers_tail = cbn; +#ifdef ASSERT +G1DirtyCardQueueSet::Queue::~Queue() { + assert(_head == NULL, "precondition"); + assert(_tail == NULL, "precondition"); +} +#endif // ASSERT + +BufferNode* G1DirtyCardQueueSet::Queue::top() const { + return Atomic::load(&_head); +} + +// An append operation atomically exchanges the new tail with the queue tail. +// It then sets the "next" value of the old tail to the head of the list being +// appended; it is an invariant that the old tail's "next" value is NULL. +// But if the old tail is NULL then the queue was empty. In this case the +// head of the list being appended is instead stored in the queue head; it is +// an invariant that the queue head is NULL in this case. +// +// This means there is a period between the exchange and the old tail update +// where the queue sequence is split into two parts, the list from the queue +// head to the old tail, and the list being appended. If there are concurrent +// push/append operations, each may introduce another such segment. But they +// all eventually get resolved by their respective updates of their old tail's +// "next" value. This also means that pop operations must handle a buffer +// with a NULL "next" value specially. +// +// A push operation is just a degenerate append, where the buffer being pushed +// is both the head and the tail of the list being appended. +void G1DirtyCardQueueSet::Queue::append(BufferNode& first, BufferNode& last) { + assert(last.next() == NULL, "precondition"); + BufferNode* old_tail = Atomic::xchg(&_tail, &last); + if (old_tail == NULL) { // Was empty. + assert(Atomic::load(&_head) == NULL, "invariant"); + Atomic::store(&_head, &first); } else { - _completed_buffers_tail->set_next(cbn); - _completed_buffers_tail = cbn; + assert(old_tail->next() == NULL, "invariant"); + old_tail->set_next(&first); } - _num_cards += buffer_size() - cbn->index(); +} - if (!process_completed_buffers() && - (num_cards() > process_cards_threshold())) { - set_process_completed_buffers(true); - ml.notify_all(); +// pop gets the queue head as the candidate result (returning NULL if the +// queue head was NULL), and then gets that result node's "next" value. If +// that "next" value is NULL and the queue head hasn't changed, then there +// is only one element in the accessible part of the list (the sequence from +// head to a node with a NULL "next" value). We can't return that element, +// because it may be the old tail of a concurrent push/append that has not +// yet had its "next" field set to the new tail. So return NULL in this case. +// Otherwise, attempt to cmpxchg that "next" value into the queue head, +// retrying the whole operation if that fails. This is the "usual" lock-free +// pop from the head of a singly linked list, with the additional restriction +// on taking the last element. +BufferNode* G1DirtyCardQueueSet::Queue::pop() { + Thread* current_thread = Thread::current(); + while (true) { + // Use a critical section per iteration, rather than over the whole + // operation. We're not guaranteed to make progress, because of possible + // contention on the queue head. Lingering in one CS the whole time could + // lead to excessive allocation of buffers, because the CS blocks return + // of released buffers to the free list for reuse. + GlobalCounter::CriticalSection cs(current_thread); + + BufferNode* result = Atomic::load_acquire(&_head); + // Check for empty queue. Only needs to be done on first iteration, + // since we never take the last element, but it's messy to make use + // of that and we expect one iteration to be the common case. + if (result == NULL) return NULL; + + BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result)); + if (next != NULL) { + next = Atomic::cmpxchg(&_head, result, next); + if (next == result) { + // Former head successfully taken; it is not the last. + assert(Atomic::load(&_tail) != result, "invariant"); + assert(result->next() != NULL, "invariant"); + result->set_next(NULL); + return result; + } + // cmpxchg failed; try again. + } else if (result == Atomic::load_acquire(&_head)) { + // If follower of head is NULL and head hasn't changed, then only + // the one element is currently accessible. We don't take the last + // accessible element, because there may be a concurrent add using it. + // The check for unchanged head isn't needed for correctness, but the + // retry on change may sometimes let us get a buffer after all. + return NULL; + } + // Head changed; try again. + } +} + +G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::Queue::take_all() { + assert_at_safepoint(); + HeadTail result(Atomic::load(&_head), Atomic::load(&_tail)); + Atomic::store(&_head, (BufferNode*)NULL); + Atomic::store(&_tail, (BufferNode*)NULL); + return result; +} + +void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { + assert(cbn != NULL, "precondition"); + // Increment _num_cards before adding to queue, so queue removal doesn't + // need to deal with _num_cards possibly going negative. + size_t new_num_cards = Atomic::add(&_num_cards, buffer_size() - cbn->index()); + _completed.push(*cbn); + if ((new_num_cards > process_cards_threshold()) && + (_primary_refinement_thread != NULL)) { + _primary_refinement_thread->activate(); } - verify_num_cards(); } BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); + enqueue_previous_paused_buffers(); - if (num_cards() <= stop_at) { + // Check for unsufficient cards to satisfy request. We only do this once, + // up front, rather than on each iteration below, since the test is racy + // regardless of when we do it. + if (Atomic::load_acquire(&_num_cards) <= stop_at) { return NULL; } - assert(num_cards() > 0, "invariant"); - assert(_completed_buffers_head != NULL, "invariant"); - assert(_completed_buffers_tail != NULL, "invariant"); - - BufferNode* bn = _completed_buffers_head; - _num_cards -= buffer_size() - bn->index(); - _completed_buffers_head = bn->next(); - if (_completed_buffers_head == NULL) { - assert(num_cards() == 0, "invariant"); - _completed_buffers_tail = NULL; - set_process_completed_buffers(false); + BufferNode* result = _completed.pop(); + if (result != NULL) { + Atomic::sub(&_num_cards, buffer_size() - result->index()); } - verify_num_cards(); - bn->set_next(NULL); - return bn; + return result; } #ifdef ASSERT void G1DirtyCardQueueSet::verify_num_cards() const { size_t actual = 0; - BufferNode* cur = _completed_buffers_head; - while (cur != NULL) { + BufferNode* cur = _completed.top(); + for ( ; cur != NULL; cur = cur->next()) { actual += buffer_size() - cur->index(); - cur = cur->next(); } - assert(actual == _num_cards, + assert(actual == Atomic::load(&_num_cards), "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT, - _num_cards, actual); + Atomic::load(&_num_cards), actual); } -#endif +#endif // ASSERT -void G1DirtyCardQueueSet::abandon_completed_buffers() { - BufferNode* buffers_to_delete = NULL; +G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList() : + _head(NULL), _tail(NULL), + _safepoint_id(SafepointSynchronize::safepoint_id()) +{} + +#ifdef ASSERT +G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() { + assert(Atomic::load(&_head) == NULL, "precondition"); + assert(_tail == NULL, "precondition"); +} +#endif // ASSERT + +bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const { + assert_not_at_safepoint(); + return _safepoint_id == SafepointSynchronize::safepoint_id(); +} + +void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) { + assert_not_at_safepoint(); + assert(is_next(), "precondition"); + BufferNode* old_head = Atomic::xchg(&_head, node); + if (old_head == NULL) { + assert(_tail == NULL, "invariant"); + _tail = node; + } else { + node->set_next(old_head); + } +} + +G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() { + BufferNode* head = Atomic::load(&_head); + BufferNode* tail = _tail; + Atomic::store(&_head, (BufferNode*)NULL); + _tail = NULL; + return HeadTail(head, tail); +} + +G1DirtyCardQueueSet::PausedBuffers::PausedBuffers() : _plist(NULL) {} + +#ifdef ASSERT +G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() { + assert(empty(), "invariant"); +} +#endif // ASSERT + +bool G1DirtyCardQueueSet::PausedBuffers::empty() const { + return Atomic::load(&_plist) == NULL; +} + +void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) { + assert_not_at_safepoint(); + PausedList* plist = Atomic::load_acquire(&_plist); + if (plist != NULL) { + // Already have a next list, so use it. We know it's a next list because + // of the precondition that take_previous() has already been called. + assert(plist->is_next(), "invariant"); + } else { + // Try to install a new next list. + plist = new PausedList(); + PausedList* old_plist = Atomic::cmpxchg(&_plist, (PausedList*)NULL, plist); + if (old_plist != NULL) { + // Some other thread installed a new next list. Use it instead. + delete plist; + plist = old_plist; + } + } + plist->add(node); +} + +G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() { + assert_not_at_safepoint(); + PausedList* previous; { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - buffers_to_delete = _completed_buffers_head; - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; - set_process_completed_buffers(false); + // Deal with plist in a critical section, to prevent it from being + // deleted out from under us by a concurrent take_previous(). + GlobalCounter::CriticalSection cs(Thread::current()); + previous = Atomic::load_acquire(&_plist); + if ((previous == NULL) || // Nothing to take. + previous->is_next() || // Not from a previous safepoint. + // Some other thread stole it. + (Atomic::cmpxchg(&_plist, previous, (PausedList*)NULL) != previous)) { + return HeadTail(); + } + } + // We now own previous. + HeadTail result = previous->take(); + // There might be other threads examining previous (in concurrent + // take_previous()). Synchronize to wait until any such threads are + // done with such examination before deleting. + GlobalCounter::write_synchronize(); + delete previous; + return result; +} + +G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() { + assert_at_safepoint(); + HeadTail result; + PausedList* plist = Atomic::load(&_plist); + if (plist != NULL) { + Atomic::store(&_plist, (PausedList*)NULL); + result = plist->take(); + delete plist; + } + return result; +} + +void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) { + assert_not_at_safepoint(); + assert(node->next() == NULL, "precondition"); + // Cards for paused buffers are included in count, to contribute to + // notification checking after the coming safepoint if it doesn't GC. + // Note that this means the queue's _num_cards differs from the number + // of cards in the queued buffers when there are paused buffers. + Atomic::add(&_num_cards, buffer_size() - node->index()); + _paused.add(node); +} + +void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) { + if (paused._head != NULL) { + assert(paused._tail != NULL, "invariant"); + // Cards from paused buffers are already recorded in the queue count. + _completed.append(*paused._head, *paused._tail); + } +} + +void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() { + assert_not_at_safepoint(); + // The fast-path still satisfies the precondition for record_paused_buffer + // and PausedBuffers::add, even with a racy test. If there are paused + // buffers from a previous safepoint, empty() will return false; there will + // have been a safepoint between recording and test, so there can't be a + // false negative (empty() returns true) while such buffers are present. + // If empty() is false, there are two cases: + // + // (1) There were paused buffers from a previous safepoint. A concurrent + // caller may take and enqueue them first, but that's okay; the precondition + // for a possible later record_paused_buffer by this thread will still hold. + // + // (2) There are paused buffers for a requested next safepoint. + // + // In each of those cases some effort may be spent detecting and dealing + // with those circumstances; any wasted effort in such cases is expected to + // be well compensated by the fast path. + if (!_paused.empty()) { + enqueue_paused_buffers_aux(_paused.take_previous()); } +} + +void G1DirtyCardQueueSet::enqueue_all_paused_buffers() { + assert_at_safepoint(); + enqueue_paused_buffers_aux(_paused.take_all()); +} + +void G1DirtyCardQueueSet::abandon_completed_buffers() { + enqueue_all_paused_buffers(); + verify_num_cards(); + G1BufferNodeList list = take_all_completed_buffers(); + BufferNode* buffers_to_delete = list._head; while (buffers_to_delete != NULL) { BufferNode* bn = buffers_to_delete; buffers_to_delete = bn->next(); @@ -186,46 +413,30 @@ } void G1DirtyCardQueueSet::notify_if_necessary() { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (num_cards() > process_cards_threshold()) { - set_process_completed_buffers(true); - ml.notify_all(); + if ((_primary_refinement_thread != NULL) && + (num_cards() > process_cards_threshold())) { + _primary_refinement_thread->activate(); } } -// Merge lists of buffers. Notify the processing threads. -// The source queue is emptied as a result. The queues -// must share the monitor. +// Merge lists of buffers. The source queue set is emptied as a +// result. The queue sets must share the same allocator. void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { assert(allocator() == src->allocator(), "precondition"); const G1BufferNodeList from = src->take_all_completed_buffers(); - if (from._head == NULL) return; - - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = from._head; - _completed_buffers_tail = from._tail; - } else { - assert(_completed_buffers_head != NULL, "Well formedness"); - _completed_buffers_tail->set_next(from._head); - _completed_buffers_tail = from._tail; - } - _num_cards += from._entry_count; - - assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL || - _completed_buffers_head != NULL && _completed_buffers_tail != NULL, - "Sanity"); - verify_num_cards(); + if (from._head != NULL) { + Atomic::add(&_num_cards, from._entry_count); + _completed.append(*from._head, *from._tail); + } } G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards); - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; - return result; + enqueue_all_paused_buffers(); + verify_num_cards(); + HeadTail buffers = _completed.take_all(); + size_t num_cards = Atomic::load(&_num_cards); + Atomic::store(&_num_cards, size_t(0)); + return G1BufferNodeList(buffers._head, buffers._tail, num_cards); } class G1RefineBufferedCards : public StackObj { @@ -368,14 +579,20 @@ bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node) { if (Thread::current()->is_Java_thread()) { // If the number of buffers exceeds the limit, make this Java - // thread do the processing itself. We don't lock to access - // buffer count or padding; it is fine to be imprecise here. The - // add of padding could overflow, which is treated as unlimited. + // thread do the processing itself. Calculation is racy but we + // don't need precision here. The add of padding could overflow, + // which is treated as unlimited. size_t limit = max_cards() + max_cards_padding(); if ((num_cards() > limit) && (limit >= max_cards())) { if (mut_process_buffer(node)) { return true; } + // Buffer was incompletely processed because of a pending safepoint + // request. Unlike with refinement thread processing, for mutator + // processing the buffer did not come from the completed buffer queue, + // so it is okay to add it to the queue rather than to the paused set. + // Indeed, it can't be added to the paused set because we didn't pass + // through enqueue_previous_paused_buffers. } } enqueue_completed_buffer(node); @@ -407,14 +624,15 @@ deallocate_buffer(node); return true; } else { - // Return partially processed buffer to the queue. - enqueue_completed_buffer(node); + // Buffer incompletely processed because there is a pending safepoint. + // Record partially processed buffer, to be finished later. + record_paused_buffer(node); return true; } } void G1DirtyCardQueueSet::abandon_logs() { - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); abandon_completed_buffers(); // Since abandon is done only at safepoints, we can safely manipulate @@ -433,7 +651,7 @@ // Iterate over all the threads, if we find a partial log add it to // the global list of logs. Temporarily turn off the limit on the number // of outstanding buffers. - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); size_t old_limit = max_cards(); set_max_cards(MaxCardsUnlimited); @@ -448,5 +666,7 @@ Threads::threads_do(&closure); G1BarrierSet::shared_dirty_card_queue().flush(); + enqueue_all_paused_buffers(); + verify_num_cards(); set_max_cards(old_limit); } --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-31 17:22:37.438865015 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-31 17:22:37.162850166 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,11 +29,12 @@ #include "gc/g1/g1FreeIdSet.hpp" #include "gc/shared/ptrQueue.hpp" #include "memory/allocation.hpp" +#include "memory/padded.hpp" +class G1ConcurrentRefineThread; class G1DirtyCardQueueSet; class G1RedirtyCardsQueueSet; class Thread; -class Monitor; // A ptrQueue whose elements are "oops", pointers to object heads. class G1DirtyCardQueue: public PtrQueue { @@ -66,15 +67,178 @@ }; class G1DirtyCardQueueSet: public PtrQueueSet { - Monitor* _cbl_mon; // Protects the list and count members. - BufferNode* _completed_buffers_head; - BufferNode* _completed_buffers_tail; - - // Number of actual cards in the list of completed buffers. + // Head and tail of a list of BufferNodes, linked through their next() + // fields. Similar to G1BufferNodeList, but without the _entry_count. + struct HeadTail { + BufferNode* _head; + BufferNode* _tail; + HeadTail() : _head(NULL), _tail(NULL) {} + HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {} + }; + + // A lock-free FIFO of BufferNodes, linked through their next() fields. + // This class has a restriction that pop() cannot return the last buffer + // in the queue, or what was the last buffer for a concurrent push/append + // operation. It is expected that there will be a later push/append that + // will make that buffer available to a future pop(), or there will + // eventually be a complete transfer via take_all(). + class Queue { + BufferNode* volatile _head; + DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*)); + BufferNode* volatile _tail; + DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*)); + + NONCOPYABLE(Queue); + + public: + Queue() : _head(NULL), _tail(NULL) {} + DEBUG_ONLY(~Queue();) + + // Return the first buffer in the queue. + // Thread-safe, but the result may change immediately. + BufferNode* top() const; + + // Thread-safe add the buffer to the end of the queue. + void push(BufferNode& node) { append(node, node); } + + // Thread-safe add the buffers from first to last to the end of the queue. + void append(BufferNode& first, BufferNode& last); + + // Thread-safe attempt to remove and return the first buffer in the queue. + // Returns NULL if the queue is empty, or if only one buffer is found. + // Uses GlobalCounter critical sections to address the ABA problem; this + // works with the buffer allocator's use of GlobalCounter synchronization. + BufferNode* pop(); + + // Take all the buffers from the queue, leaving the queue empty. + // Not thread-safe. + HeadTail take_all(); + }; + + // Concurrent refinement may stop processing in the middle of a buffer if + // there is a pending safepoint, to avoid long delays to safepoint. A + // partially processed buffer needs to be recorded for processing by the + // safepoint if it's a GC safepoint; otherwise it needs to be recorded for + // further concurrent refinement work after the safepoint. But if the + // buffer was obtained from the completed buffer queue then it can't simply + // be added back to the queue, as that would introduce a new source of ABA + // for the queue. + // + // The PausedBuffer object is used to record such buffers for the upcoming + // safepoint, and provides access to the buffers recorded for previous + // safepoints. Before obtaining a buffer from the completed buffers queue, + // we first transfer any buffers from previous safepoints to the queue. + // This is ABA-safe because threads cannot be in the midst of a queue pop + // across a safepoint. + // + // The paused buffers are conceptually an extension of the completed buffers + // queue, and operations which need to deal with all of the queued buffers + // (such as concatenate_logs) also need to deal with any paused buffers. In + // general, if a safepoint performs a GC then the paused buffers will be + // processed as part of it, and there won't be any paused buffers after a + // GC safepoint. + class PausedBuffers { + class PausedList : public CHeapObj { + BufferNode* volatile _head; + BufferNode* _tail; + size_t _safepoint_id; + + NONCOPYABLE(PausedList); + + public: + PausedList(); + DEBUG_ONLY(~PausedList();) + + // Return true if this list was created to hold buffers for the + // next safepoint. + // precondition: not at safepoint. + bool is_next() const; + + // Thread-safe add the buffer to the list. + // precondition: not at safepoint. + // precondition: is_next(). + void add(BufferNode* node); + + // Take all the buffers from the list. Not thread-safe. + HeadTail take(); + }; + + // The most recently created list, which might be for either the next or + // a previous safepoint, or might be NULL if the next list hasn't been + // created yet. We only need one list because of the requirement that + // threads calling add() must first ensure there are no paused buffers + // from a previous safepoint. There might be many list instances existing + // at the same time though; there can be many threads competing to create + // and install the next list, and meanwhile there can be a thread dealing + // with the previous list. + PausedList* volatile _plist; + DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(PausedList*)); + + NONCOPYABLE(PausedBuffers); + + public: + PausedBuffers(); + DEBUG_ONLY(~PausedBuffers();) + + // Test whether there are any paused lists. + // Thread-safe, but the answer may change immediately. + bool empty() const; + + // Thread-safe add the buffer to paused list for next safepoint. + // precondition: not at safepoint. + // precondition: does not have paused buffers from a previous safepoint. + void add(BufferNode* node); + + // Thread-safe take all paused buffers for previous safepoints. + // precondition: not at safepoint. + HeadTail take_previous(); + + // Take all the paused buffers. + // precondition: at safepoint. + HeadTail take_all(); + }; + + // The primary refinement thread, for activation when the processing + // threshold is reached. NULL if there aren't any refinement threads. + G1ConcurrentRefineThread* _primary_refinement_thread; + DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(G1ConcurrentRefineThread*)); + // Upper bound on the number of cards in the completed and paused buffers. volatile size_t _num_cards; + DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(size_t)); + // Buffers ready for refinement. + Queue _completed; // Has inner padding, including trailer. + // Buffers for which refinement is temporarily paused. + PausedBuffers _paused; // Has inner padding, including trailer. + G1FreeIdSet _free_ids; + + // Activation threshold for the primary refinement thread. size_t _process_cards_threshold; - volatile bool _process_completed_buffers; + + // If the queue contains more cards than configured here, the + // mutator must start doing some of the concurrent refinement work. + size_t _max_cards; + size_t _max_cards_padding; + static const size_t MaxCardsUnlimited = SIZE_MAX; + + // Array of cumulative dirty cards refined by mutator threads. + // Array has an entry per id in _free_ids. + size_t* _mutator_refined_cards_counters; + + // Verify _num_cards == sum of cards in the completed queue. + void verify_num_cards() const NOT_DEBUG_RETURN; + + // Thread-safe add a buffer to paused list for next safepoint. + // precondition: not at safepoint. + // precondition: does not have paused buffers from a previous safepoint. + void record_paused_buffer(BufferNode* node); + void enqueue_paused_buffers_aux(const HeadTail& paused); + // Thread-safe transfer paused buffers for previous safepoints to the queue. + // precondition: not at safepoint. + void enqueue_previous_paused_buffers(); + // Transfer all paused buffers to the queue. + // precondition: at safepoint. + void enqueue_all_paused_buffers(); void abandon_completed_buffers(); @@ -90,22 +254,18 @@ bool mut_process_buffer(BufferNode* node); - // If the queue contains more cards than configured here, the - // mutator must start doing some of the concurrent refinement work. - size_t _max_cards; - size_t _max_cards_padding; - static const size_t MaxCardsUnlimited = SIZE_MAX; - - G1FreeIdSet _free_ids; - - // Array of cumulative dirty cards refined by mutator threads. - // Array has an entry per id in _free_ids. - size_t* _mutator_refined_cards_counters; + // If the number of completed buffers is > stop_at, then remove and + // return a completed buffer from the list. Otherwise, return NULL. + BufferNode* get_completed_buffer(size_t stop_at = 0); public: - G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator); + G1DirtyCardQueueSet(BufferNode::Allocator* allocator); ~G1DirtyCardQueueSet(); + void set_primary_refinement_thread(G1ConcurrentRefineThread* thread) { + _primary_refinement_thread = thread; + } + // The number of parallel ids that can be claimed to allow collector or // mutator threads to do card-processing work. static uint num_par_ids(); @@ -119,20 +279,11 @@ virtual void enqueue_completed_buffer(BufferNode* node); - // If the number of completed buffers is > stop_at, then remove and - // return a completed buffer from the list. Otherwise, return NULL. - BufferNode* get_completed_buffer(size_t stop_at = 0); - - // The number of cards in completed buffers. Read without synchronization. + // Upper bound on the number of cards currently in in this queue set. + // Read without synchronization. The value may be high because there + // is a concurrent modification of the set of buffers. size_t num_cards() const { return _num_cards; } - // Verify that _num_cards is equal to the sum of actual cards - // in the completed buffers. - void verify_num_cards() const NOT_DEBUG_RETURN; - - bool process_completed_buffers() { return _process_completed_buffers; } - void set_process_completed_buffers(bool x) { _process_completed_buffers = x; } - // Get/Set the number of cards that triggers log processing. // Log processing should be done when the number of cards exceeds the // threshold. @@ -156,8 +307,8 @@ // false. // // Stops processing a buffer if SuspendibleThreadSet::should_yield(), - // returning the incompletely processed buffer to the completed buffer - // list, for later processing of the remainder. + // recording the incompletely processed buffer for later processing of + // the remainder. // // Increments *total_refined_cards by the number of cards processed and // removed from the buffer. --- old/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-31 17:22:38.438918816 -0500 +++ new/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-31 17:22:38.146903106 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -210,8 +210,6 @@ return offset_of(BufferNode, _buffer); } - static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } - // Allocate a new BufferNode with the "buffer" having size elements. static BufferNode* allocate(size_t size); @@ -219,6 +217,7 @@ static void deallocate(BufferNode* node); public: + static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } typedef LockFreeStack Stack; BufferNode* next() const { return _next; } --- old/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-31 17:22:39.394970250 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-31 17:22:39.098954325 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -73,7 +73,6 @@ Monitor* STS_lock = NULL; Monitor* FullGCCount_lock = NULL; Monitor* G1OldGCCount_lock = NULL; -Monitor* DirtyCardQ_CBL_mon = NULL; Mutex* Shared_DirtyCardQ_lock = NULL; Mutex* MarkStackFreeList_lock = NULL; Mutex* MarkStackChunkList_lock = NULL; @@ -211,7 +210,6 @@ if (UseG1GC) { def(G1OldGCCount_lock , PaddedMonitor, leaf, true, _safepoint_check_always); - def(DirtyCardQ_CBL_mon , PaddedMonitor, access, true, _safepoint_check_never); def(Shared_DirtyCardQ_lock , PaddedMutex , access + 1, true, _safepoint_check_never); def(FreeList_lock , PaddedMutex , leaf , true, _safepoint_check_never); --- old/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-31 17:22:40.431025988 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-31 17:22:40.139010278 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -69,8 +69,6 @@ extern Monitor* STS_lock; // used for joining/leaving SuspendibleThreadSet. extern Monitor* FullGCCount_lock; // in support of "concurrent" full gc extern Monitor* G1OldGCCount_lock; // in support of "concurrent" full gc -extern Monitor* DirtyCardQ_CBL_mon; // Protects dirty card Q - // completed buffer queue. extern Mutex* Shared_DirtyCardQ_lock; // Lock protecting dirty card // queue shared by // non-Java threads.