--- old/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-16 01:05:07.811777407 -0500 +++ new/src/hotspot/share/gc/g1/g1BarrierSet.cpp 2020-01-16 01:05:07.367753437 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,7 +36,6 @@ #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" -#include "runtime/mutexLocker.hpp" #include "runtime/orderAccess.hpp" #include "runtime/thread.inline.hpp" #include "utilities/macros.hpp" @@ -59,7 +58,7 @@ _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize), _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize), _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator), - _dirty_card_queue_set(DirtyCardQ_CBL_mon, &_dirty_card_queue_buffer_allocator), + _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator), _shared_dirty_card_queue(&_dirty_card_queue_set) {} --- old/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-16 01:05:10.055898554 -0500 +++ new/src/hotspot/share/gc/g1/g1CollectedHeap.cpp 2020-01-16 01:05:09.627875448 -0500 @@ -2778,8 +2778,6 @@ Threads::threads_do(&count_from_threads); G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.verify_num_cards(); - return dcqs.num_cards() + count_from_threads._cards; } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-16 01:05:12.436027044 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp 2020-01-16 01:05:12.056006529 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -89,6 +89,11 @@ } } } + + if (num_max_threads > 0) { + G1BarrierSet::dirty_card_queue_set().set_primary_refinement_thread(_threads[0]); + } + return JNI_OK; } @@ -108,7 +113,7 @@ _threads[worker_id] = create_refinement_thread(worker_id, false); thread_to_activate = _threads[worker_id]; } - if (thread_to_activate != NULL && !thread_to_activate->is_active()) { + if (thread_to_activate != NULL) { thread_to_activate->activate(); } } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-16 01:05:14.068115151 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp 2020-01-16 01:05:13.692094852 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -29,9 +29,8 @@ #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/shared/suspendibleThreadSet.hpp" #include "logging/log.hpp" -#include "memory/resourceArea.hpp" -#include "runtime/handles.inline.hpp" -#include "runtime/mutexLocker.hpp" +#include "runtime/atomic.hpp" +#include "runtime/thread.hpp" G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : ConcurrentGCThread(), @@ -40,56 +39,59 @@ _total_refinement_time(), _total_refined_cards(0), _worker_id(worker_id), - _active(false), - _monitor(NULL), + _notifier(new Semaphore(0)), + _should_notify(true), _cr(cr) { - // Each thread has its own monitor. The i-th thread is responsible for signaling - // to thread i+1 if the number of buffers in the queue exceeds a threshold for this - // thread. Monitors are also used to wake up the threads during termination. - // The 0th (primary) worker is notified by mutator threads and has a special monitor. - if (!is_primary()) { - _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true, - Monitor::_safepoint_check_never); - } else { - _monitor = DirtyCardQ_CBL_mon; - } - // set name set_name("G1 Refine#%d", worker_id); create_and_start(); } +// _notifier and _should_notify form a single-reader / multi-writer +// notification mechanism. The thread is the single reader. The writers +// are (other) threads that call activate() on the thread. + void G1ConcurrentRefineThread::wait_for_completed_buffers() { - MonitorLocker ml(_monitor, Mutex::_no_safepoint_check_flag); - while (!should_terminate() && !is_active()) { - ml.wait(); + assert(this == Thread::current(), "precondition"); + while (Atomic::load_acquire(&_should_notify)) { + _notifier->wait(); } } -bool G1ConcurrentRefineThread::is_active() { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - return is_primary() ? dcqs.process_completed_buffers() : _active; -} - void G1ConcurrentRefineThread::activate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(true); - } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(true); + assert(this != Thread::current(), "precondition"); + // Notify iff transitioning from needing activation to not. This helps + // keep the semaphore count bounded and minimizes the work done by + // activators when the thread is already active. + if (Atomic::load_acquire(&_should_notify) && + Atomic::cmpxchg(&_should_notify, true, false)) { + _notifier->signal(); } - _monitor->notify(); } -void G1ConcurrentRefineThread::deactivate() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - if (!is_primary()) { - set_active(false); +// Called when no refinement work found for this thread. +// Returns true if should deactivate. +bool G1ConcurrentRefineThread::maybe_deactivate(bool more_work) { + assert(this == Thread::current(), "precondition"); + + if (more_work) { + // Suppress unnecessary notifications. + Atomic::release_store(&_should_notify, false); + return false; + } else if (Atomic::load_acquire(&_should_notify)) { + // Deactivate if no notifications since enabled (see below). 
+ return true; } else { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - dcqs.set_process_completed_buffers(false); + // Try for more refinement work with notifications enabled, to close + // race; there could be a plethora of suppressed activation attempts + // after we found no work but before we enable notifications here + // (so there could be lots of work for this thread to do), followed + // by a long time without activation after enabling notifications. + // But first, clear any pending signals to prevent accumulation. + while (_notifier->trywait()) {} + Atomic::release_store(&_should_notify, true); + return false; } } @@ -119,14 +121,13 @@ } Ticks start_time = Ticks::now(); - if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) { - break; // No cards to process. - } + bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards); _total_refinement_time += (Ticks::now() - start_time); + + if (maybe_deactivate(more_work)) break; } } - deactivate(); log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT ", current: " SIZE_FORMAT ", refined cards: " SIZE_FORMAT ", total refined cards: " SIZE_FORMAT, @@ -146,6 +147,5 @@ } void G1ConcurrentRefineThread::stop_service() { - MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag); - _monitor->notify(); + activate(); } --- old/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-16 01:05:15.656200883 -0500 +++ new/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp 2020-01-16 01:05:15.324182959 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -45,24 +45,25 @@ uint _worker_id; - bool _active; - Monitor* _monitor; + // Each thread has its own semaphore. The i-th thread is responsible for + // signaling thread i+1 if the number of buffers in the queue exceeds a + // threshold for that thread. Semaphores are also used to wake up the + // threads during termination. The 0th (primary) worker is notified by + // mutator threads. + Semaphore* _notifier; + volatile bool _should_notify; + bool maybe_deactivate(bool more_work); + G1ConcurrentRefine* _cr; void wait_for_completed_buffers(); - void set_active(bool x) { _active = x; } - // Deactivate this thread. - void deactivate(); - - bool is_primary() { return (_worker_id == 0); } + virtual void run_service(); + virtual void stop_service(); - void run_service(); - void stop_service(); public: G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id); - bool is_active(); // Activate this thread. void activate(); --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-16 01:05:17.368293309 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp 2020-01-16 01:05:16.968271714 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -26,6 +26,7 @@ #include "gc/g1/g1BufferNodeList.hpp" #include "gc/g1/g1CardTableEntryClosure.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1ConcurrentRefineThread.hpp" #include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1FreeIdSet.hpp" #include "gc/g1/g1RedirtyCardsQueue.hpp" @@ -42,7 +43,10 @@ #include "runtime/safepoint.hpp" #include "runtime/thread.inline.hpp" #include "runtime/threadSMR.hpp" +#include "utilities/globalCounter.inline.hpp" +#include "utilities/macros.hpp" #include "utilities/quickSort.hpp" +#include G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) : // Dirty card queues are always active, so we create them with their @@ -54,6 +58,8 @@ flush(); } +BufferNode* const NULL_buffer = NULL; + void G1DirtyCardQueue::handle_completed_buffer() { assert(_buf != NULL, "precondition"); BufferNode* node = BufferNode::make_node_from_buffer(_buf, index()); @@ -68,15 +74,15 @@ // Assumed to be zero by concurrent threads. static uint par_ids_start() { return 0; } -G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon, - BufferNode::Allocator* allocator) : +G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) : PtrQueueSet(allocator), - _cbl_mon(cbl_mon), - _completed_buffers_head(NULL), - _completed_buffers_tail(NULL), + _primary_refinement_thread(NULL), + _completed_buffers_head(NULL_buffer), + _completed_buffers_tail(NULL_buffer), _num_cards(0), + DEBUG_ONLY(_concurrency(0) COMMA) + _paused(), _process_cards_threshold(ProcessCardsThresholdNever), - _process_completed_buffers(false), _max_cards(MaxCardsUnlimited), _max_cards_padding(0), _free_ids(par_ids_start(), num_par_ids()), @@ -108,123 +114,324 @@ G1ThreadLocalData::dirty_card_queue(t).handle_zero_index(); } -void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - cbn->set_next(NULL); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = cbn; - _completed_buffers_tail = cbn; +// _concurrency is an int that is used in debug-only context to verify +// we're not overlapping queue operations that support concurrency with +// those which don't. The value is initially zero, meaning there are no +// relevant operations in progress. A "no concurrency" context is entered +// by atomically changing the value from 0 to -1, with an assert on failure. +// It is similarly exited by reverting the value back to 0. A "concurrent" +// context is entered by atomically incrementing the value and verifying the +// result is greater than zero (so we weren't in a "no concurrency" context). +// It is similarly exited by atomically decrementing the value and verifying +// the result is at least zero (so no mismatches). +// +// ConcurrentVerifier and NonconcurrentVerifier are helper classes to +// establish and remove such contexts. 
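// (Reviewer aid, not part of the patch.) A minimal standalone sketch of the
// debug-only protocol described above, using std::atomic and <cassert> in
// place of HotSpot's Atomic:: wrappers and assert() machinery; all names in
// this sketch are illustrative only. The patch's ConcurrentVerifier and
// NonconcurrentVerifier RAII helpers, which follow, implement the same idea
// against the _concurrency member.
#include <atomic>
#include <cassert>

// 0 = no relevant operation in progress, >0 = that many concurrent
// operations, -1 = one exclusive ("no concurrency") operation.
static std::atomic<int> concurrency_sketch{0};

struct ConcurrentScopeSketch {
  ConcurrentScopeSketch() {
    int v = concurrency_sketch.fetch_add(1) + 1;
    assert(v > 0 && "overlaps an exclusive operation"); (void)v;
  }
  ~ConcurrentScopeSketch() {
    int v = concurrency_sketch.fetch_sub(1) - 1;
    assert(v >= 0 && "unbalanced concurrent scope"); (void)v;
  }
};

struct ExclusiveScopeSketch {
  ExclusiveScopeSketch() {
    int expected = 0;
    bool ok = concurrency_sketch.compare_exchange_strong(expected, -1);
    assert(ok && "entered while other operations in progress"); (void)ok;
  }
  ~ExclusiveScopeSketch() {
    int expected = -1;
    bool ok = concurrency_sketch.compare_exchange_strong(expected, 0);
    assert(ok && "unbalanced exclusive scope"); (void)ok;
  }
};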
+ +class G1DirtyCardQueueSet::ConcurrentVerifier : public StackObj { +#ifdef ASSERT + const G1DirtyCardQueueSet* _dcqs; + +public: + ~ConcurrentVerifier() { + assert(Atomic::sub(&_dcqs->_concurrency, 1) >= 0, "invariant"); + } +#endif // ASSERT + +public: + ConcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) { + assert(Atomic::add(&_dcqs->_concurrency, 1) > 0, "invariant"); + } +}; + +class G1DirtyCardQueueSet::NonconcurrentVerifier : public StackObj { +#ifdef ASSERT + const G1DirtyCardQueueSet* _dcqs; + +public: + ~NonconcurrentVerifier() { + assert(Atomic::cmpxchg(&_dcqs->_concurrency, -1, 0) == -1, "invariant"); + } +#endif // ASSERT + +public: + NonconcurrentVerifier(const G1DirtyCardQueueSet* dcqs) DEBUG_ONLY(: _dcqs(dcqs)) { + assert(Atomic::cmpxchg(&_dcqs->_concurrency, 0, -1) == 0, "invariant"); + } +}; + +// _completed_buffers_{head,tail} and _num_cards provide a lock-free FIFO +// of buffers, linked through their next() fields. +// +// The key idea to make this work is that pop (get_completed_buffer) never +// returns an element of the queue if it is the only accessible element, +// e.g. its "next" value is NULL. It is expected that there will be a +// later push/append that will make that element available to a future pop, +// or there will eventually be a complete transfer (take_all_completed_buffers). +// +// An append operation atomically exchanges the new tail with the queue tail. +// It then sets the "next" value of the old tail to the head of the list being +// appended. (It is an invariant that the old tail's "next" value is NULL.) +// But if the old tail is NULL then the queue was empty. In this case the +// head of the list being appended is instead stored in the queue head (which +// must be NULL). +// +// A push operation is just a degenerate append, where the buffer being pushed +// is both the head and the tail of the list being appended. +// +// This means there is a period between the exchange and the old tail update +// where the queue sequence is split into two parts, the list from the queue +// head to the old tail, and the list being appended. If there are concurrent +// push/append operations, each may introduce another such segment. But they +// all eventually get resolved by their respective updates of their old tail's +// "next" value. +// +// pop gets the queue head as the candidate result (returning NULL if the +// queue head was NULL), and then gets that result node's "next" value. If +// that "next" value is NULL and the queue head hasn't changed, then there +// is only one element in the (accessible) list. We can't return that +// element, because it may be the old tail of a concurrent push/append. So +// return NULL in this case. Otherwise, attempt to cmpxchg that "next" +// value into the queue head, retrying the whole operation if that fails. +// This is the "usual" lock-free pop from head of slist, with the additional +// restriction on taking the last element. +// +// In order to address the ABA problem for pop, a pop operation protects its +// access to the head of the list with a GlobalCounter critical section. This +// works with the buffer allocator's use of GlobalCounter synchronization to +// prevent ABA from arising in the normal buffer usage cycle. The paused +// buffer handling prevents another ABA source (see record_paused_buffer and +// enqueue_previous_paused_buffers). 
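// (Reviewer aid, not part of the patch.) A minimal standalone sketch of the
// append/pop protocol described above, using std::atomic rather than
// HotSpot's Atomic:: wrappers and a simplified node type; names are
// illustrative only. The GlobalCounter critical sections, card accounting,
// and paused-buffer handling of the real append_buffers / get_completed_buffer
// below are deliberately omitted, so this sketch by itself is still exposed
// to ABA; it only illustrates the "never take the last accessible element"
// rule.
#include <atomic>

struct NodeSketch {
  std::atomic<NodeSketch*> next{nullptr};
};

class QueueSketch {
  std::atomic<NodeSketch*> _head{nullptr};
  std::atomic<NodeSketch*> _tail{nullptr};

public:
  // Append a pre-linked list [first, last]; last->next must be null.
  void append(NodeSketch* first, NodeSketch* last) {
    NodeSketch* old_tail = _tail.exchange(last);
    if (old_tail == nullptr) {
      _head.store(first);           // queue was empty; publish the head
    } else {
      old_tail->next.store(first);  // splice; the queue is briefly split here
    }
  }

  // Pop the head, but never the last accessible element, since it may be the
  // old tail of an in-progress append. Returns null if nothing can be taken.
  NodeSketch* try_pop() {
    while (true) {
      NodeSketch* result = _head.load();
      if (result == nullptr) return nullptr;        // empty
      NodeSketch* next = result->next.load();
      if (next != nullptr) {
        if (_head.compare_exchange_strong(result, next)) {
          result->next.store(nullptr);
          return result;                            // took a non-last node
        }
        // cmpxchg failed: head changed; retry.
      } else if (result == _head.load()) {
        return nullptr;  // only one accessible element; leave it for later
      }
      // Head changed since we read it; retry.
    }
  }
};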
+ +size_t G1DirtyCardQueueSet::append_buffers(BufferNode* first, + BufferNode* last, + size_t card_count) { + assert(last->next() == NULL_buffer, "precondition"); + ConcurrentVerifier cv(this); + // Increment _num_cards before adding to queue, so queue removal doesn't + // need to deal with _num_cards possibly going negative. + size_t new_num_cards = Atomic::add(&_num_cards, card_count); + BufferNode* old_tail = Atomic::xchg(&_completed_buffers_tail, last); + if (old_tail == NULL_buffer) { // Empty list. + assert(Atomic::load(&_completed_buffers_head) == NULL_buffer, "invariant"); + Atomic::store(&_completed_buffers_head, first); } else { - _completed_buffers_tail->set_next(cbn); - _completed_buffers_tail = cbn; + assert(old_tail->next() == NULL_buffer, "invariant"); + old_tail->set_next(first); } - _num_cards += buffer_size() - cbn->index(); + return new_num_cards; +} - if (!process_completed_buffers() && - (num_cards() > process_cards_threshold())) { - set_process_completed_buffers(true); - ml.notify_all(); +void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { + assert(cbn != NULL_buffer, "precondition"); + size_t new_num_cards = append_buffers(cbn, cbn, buffer_size() - cbn->index()); + if ((_primary_refinement_thread != NULL) && + (new_num_cards > process_cards_threshold())) { + _primary_refinement_thread->activate(); } - verify_num_cards(); } BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); + enqueue_previous_paused_buffers(); - if (num_cards() <= stop_at) { - return NULL; - } + ConcurrentVerifier cv(this); - assert(num_cards() > 0, "invariant"); - assert(_completed_buffers_head != NULL, "invariant"); - assert(_completed_buffers_tail != NULL, "invariant"); - - BufferNode* bn = _completed_buffers_head; - _num_cards -= buffer_size() - bn->index(); - _completed_buffers_head = bn->next(); - if (_completed_buffers_head == NULL) { - assert(num_cards() == 0, "invariant"); - _completed_buffers_tail = NULL; - set_process_completed_buffers(false); + // Check for insufficient cards to satisfy request. We only do this once, + // up front, rather than on each iteration below, since the test is racy + // regardless of when we do it. + if (Atomic::load_acquire(&_num_cards) <= stop_at) { + return NULL_buffer; + } + + Thread* current_thread = Thread::current(); + + while (true) { + // Use a critical section per iteration, rather than over the whole + // operation. We're not guaranteed to make progress, because of possible + // contention on the queue head. Lingering in one CS the whole time could + // lead to excessive allocation of buffers, because the CS blocks return + // of released buffers to the free list for reuse. + GlobalCounter::CriticalSection cs(current_thread); + + BufferNode* result = Atomic::load_acquire(&_completed_buffers_head); + // Check for empty queue. Only needs to be done on first iteration, + // since we never take the last element, but it's messy to make use + // of that and we expect one iteration to be the common case. + if (result == NULL_buffer) return result; + + BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result)); + if (next != NULL_buffer) { + next = Atomic::cmpxchg(&_completed_buffers_head, result, next); + if (next == result) { + // Former head successfully taken; it is not the last.
+ assert(Atomic::load(&_completed_buffers_tail) != result, "invariant"); + assert(result->next() != NULL_buffer, "invariant"); + result->set_next(NULL_buffer); + Atomic::sub(&_num_cards, buffer_size() - result->index()); + return result; + } + // cmpxchg failed; try again. + } else if (result == Atomic::load_acquire(&_completed_buffers_head)) { + // If follower of head is NULL and head hasn't changed, then only + // the one element is currently accessible. We don't take the last + // accessible element, because there may be a concurrent add using it. + // The check for unchanged head isn't needed for correctness, but the + // retry on change may sometimes let us get a buffer after all. + return NULL_buffer; + } + // Head changed; try again. } - verify_num_cards(); - bn->set_next(NULL); - return bn; + // Unreachable } #ifdef ASSERT void G1DirtyCardQueueSet::verify_num_cards() const { + NonconcurrentVerifier ncv(this); size_t actual = 0; - BufferNode* cur = _completed_buffers_head; - while (cur != NULL) { + BufferNode* cur = Atomic::load(&_completed_buffers_head); + for ( ; cur != NULL_buffer; cur = cur->next()) { actual += buffer_size() - cur->index(); - cur = cur->next(); } - assert(actual == _num_cards, + assert(actual == Atomic::load(&_num_cards), "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT, - _num_cards, actual); + Atomic::load(&_num_cards), actual); } -#endif +#endif // ASSERT -void G1DirtyCardQueueSet::abandon_completed_buffers() { - BufferNode* buffers_to_delete = NULL; - { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - buffers_to_delete = _completed_buffers_head; - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; - set_process_completed_buffers(false); +// Refinement processing stops early if there is a pending safepoint, to +// avoid long delays to safepoint. We need to record the partially +// processed buffer for later continued processing. However, we can't +// simply add it back to the completed buffer queue, as that would introduce +// a new source of ABA for the queue. Instead, we have a pair of buffer +// lists (with each list represented by head and tail), one for each of the +// previous and next safepoints (*). When pausing the processing of a +// buffer for a safepoint, we add the buffer (lock free) to the list for the +// next safepoint. Before attempting to obtain a buffer from the queue we +// first transfer any buffers in the previous safepoint list to the queue. +// This is safe (doesn't introduce ABA) because threads cannot be in the +// midst of a queue pop across a safepoint. +// +// These paused buffer lists are conceptually an extension of the queue, and +// operations which need to deal with all of the queued buffers (such as +// concatenate_logs) also need to deal with any paused buffers. In general, +// if the safepoint performs a GC then the paused buffers will be processed +// as part of it and both lists will be empty afterward. +// +// An alternative would be to directly reenqueue a paused buffer, but only +// after first calling GlobalCounter::write_synchronize. However, that +// might noticeably delay the pending safepoint. +// +// A single paused list and a safepoint cleanup action to perform the transfer +// doesn't work because cleanup actions are not invoked for every safepoint. +// +// (*) If the safepoint does not perform a GC, the next list becomes the +// previous list after the safepoint. 
Since buffers are only added to the +// next list if there were threads performing refinement work, there will +// likely be refinement work done after the safepoint, which will transfer +// those buffers to the queue. However, multiple non-GC safepoints in +// succession, without intervening refinement work to perform a transfer +// (possibly through lack of time), can result in old buffers being present +// and inaccessible in the next list. This doesn't affect correctness, but +// might affect performance. The alternatives discussed above don't have +// this problem, but have problems of their own. + +static size_t next_paused_buffer_list_index() { + return SafepointSynchronize::safepoint_id() & 1; +} + +static size_t previous_paused_buffer_list_index() { + return next_paused_buffer_list_index() ^ 1; +} + +void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) { + assert_not_at_safepoint(); + assert(node->next() == NULL_buffer, "precondition"); + size_t next_index = next_paused_buffer_list_index(); + // Cards for paused buffers are included in count, to contribute to + // notification checking after the coming safepoint if it doesn't GC. + Atomic::add(&_num_cards, buffer_size() - node->index()); + BufferNode* old_head = Atomic::xchg(&_paused[next_index]._head, node); + if (old_head == NULL_buffer) { + assert(_paused[next_index]._tail == NULL, "invariant"); + _paused[next_index]._tail = node; + } else { + node->set_next(old_head); } - while (buffers_to_delete != NULL) { +} + +void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(size_t index) { + if (Atomic::load(&_paused[index]._head) != NULL_buffer) { + BufferNode* paused = Atomic::xchg(&_paused[index]._head, NULL_buffer); + if (paused != NULL_buffer) { + BufferNode* tail = _paused[index]._tail; + assert(tail != NULL, "invariant"); + _paused[index]._tail = NULL_buffer; + append_buffers(paused, tail, 0); // Cards already counted when recorded. + } + } +} + +void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() { + size_t previous_index = previous_paused_buffer_list_index(); + enqueue_paused_buffers_aux(previous_index); +} + +void G1DirtyCardQueueSet::enqueue_all_paused_buffers() { + assert_at_safepoint(); + for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) { + enqueue_paused_buffers_aux(i); + } +} + +void G1DirtyCardQueueSet::clear_completed_buffers() { + Atomic::store(&_completed_buffers_head, NULL_buffer); + Atomic::store(&_completed_buffers_tail, NULL_buffer); + Atomic::store(&_num_cards, size_t(0)); +} + +void G1DirtyCardQueueSet::abandon_completed_buffers() { + enqueue_all_paused_buffers(); + verify_num_cards(); + NonconcurrentVerifier ncv(this); + BufferNode* buffers_to_delete = Atomic::load(&_completed_buffers_head); + clear_completed_buffers(); + while (buffers_to_delete != NULL_buffer) { BufferNode* bn = buffers_to_delete; buffers_to_delete = bn->next(); - bn->set_next(NULL); + bn->set_next(NULL_buffer); deallocate_buffer(bn); } } void G1DirtyCardQueueSet::notify_if_necessary() { - MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (num_cards() > process_cards_threshold()) { - set_process_completed_buffers(true); - ml.notify_all(); + if ((_primary_refinement_thread != NULL) && + (num_cards() > process_cards_threshold())) { + _primary_refinement_thread->activate(); } } -// Merge lists of buffers. Notify the processing threads. -// The source queue is emptied as a result. The queues -// must share the monitor. +// Merge lists of buffers. The source queue set is emptied as a +// result. 
The queue sets must share the same allocator. void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { assert(allocator() == src->allocator(), "precondition"); const G1BufferNodeList from = src->take_all_completed_buffers(); - if (from._head == NULL) return; - - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - if (_completed_buffers_tail == NULL) { - assert(_completed_buffers_head == NULL, "Well-formedness"); - _completed_buffers_head = from._head; - _completed_buffers_tail = from._tail; - } else { - assert(_completed_buffers_head != NULL, "Well formedness"); - _completed_buffers_tail->set_next(from._head); - _completed_buffers_tail = from._tail; - } - _num_cards += from._entry_count; - - assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL || - _completed_buffers_head != NULL && _completed_buffers_tail != NULL, - "Sanity"); - verify_num_cards(); + if (from._head == NULL_buffer) return; + append_buffers(from._head, from._tail, from._entry_count); } G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { - MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag); - G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards); - _completed_buffers_head = NULL; - _completed_buffers_tail = NULL; - _num_cards = 0; +#ifdef ASSERT + for (size_t i = 0; i < ARRAY_SIZE(_paused); ++i) { + assert(Atomic::load(&_paused[i]._head) == NULL_buffer, "precondition"); + assert(Atomic::load(&_paused[i]._tail) == NULL_buffer, "precondition"); + } +#endif // ASSERT + verify_num_cards(); + NonconcurrentVerifier ncv(this); + G1BufferNodeList result(Atomic::load(&_completed_buffers_head), + Atomic::load(&_completed_buffers_tail), + Atomic::load(&_num_cards)); + clear_completed_buffers(); return result; } @@ -399,7 +606,7 @@ size_t stop_at, size_t* total_refined_cards) { BufferNode* node = get_completed_buffer(stop_at); - if (node == NULL) { + if (node == NULL_buffer) { return false; } else if (refine_buffer(node, worker_id, total_refined_cards)) { assert_fully_consumed(node, buffer_size()); @@ -407,14 +614,15 @@ deallocate_buffer(node); return true; } else { - // Return partially processed buffer to the queue. - enqueue_completed_buffer(node); + // Buffer incompletely processed because there is a pending safepoint. + // Record partially processed buffer, to be finished later. + record_paused_buffer(node); return true; } } void G1DirtyCardQueueSet::abandon_logs() { - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); abandon_completed_buffers(); // Since abandon is done only at safepoints, we can safely manipulate @@ -433,7 +641,7 @@ // Iterate over all the threads, if we find a partial log add it to // the global list of logs. Temporarily turn off the limit on the number // of outstanding buffers. - assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint."); + assert_at_safepoint(); size_t old_limit = max_cards(); set_max_cards(MaxCardsUnlimited); @@ -448,5 +656,7 @@ Threads::threads_do(&closure); G1BarrierSet::shared_dirty_card_queue().flush(); + enqueue_all_paused_buffers(); + verify_num_cards(); set_max_cards(old_limit); } --- old/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-16 01:05:19.052384223 -0500 +++ new/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp 2020-01-16 01:05:18.644362197 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. 
All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,11 +29,12 @@ #include "gc/g1/g1FreeIdSet.hpp" #include "gc/shared/ptrQueue.hpp" #include "memory/allocation.hpp" +#include "memory/padded.hpp" +class G1ConcurrentRefineThread; class G1DirtyCardQueueSet; class G1RedirtyCardsQueueSet; class Thread; -class Monitor; // A ptrQueue whose elements are "oops", pointers to object heads. class G1DirtyCardQueue: public PtrQueue { @@ -66,18 +67,45 @@ }; class G1DirtyCardQueueSet: public PtrQueueSet { - Monitor* _cbl_mon; // Protects the list and count members. - BufferNode* _completed_buffers_head; - BufferNode* _completed_buffers_tail; - - // Number of actual cards in the list of completed buffers. + G1ConcurrentRefineThread* _primary_refinement_thread; + // Add padding for improved performance for shared access. There's only + // one instance of this class, so using a little extra space is fine. + // _completed_buffers_{head,tail} and _num_cards are isolated to their + // own cache lines. Other members are not, even if shared access, because + // they aren't as critical to performance. + DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(G1ConcurrentRefineThread*)); + BufferNode* volatile _completed_buffers_head; + DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode* volatile)); + BufferNode* volatile _completed_buffers_tail; + DEFINE_PAD_MINUS_SIZE(3, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode* volatile)); volatile size_t _num_cards; + DEFINE_PAD_MINUS_SIZE(4, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile size_t)); - size_t _process_cards_threshold; - volatile bool _process_completed_buffers; + DEBUG_ONLY(mutable volatile int _concurrency;) + class ConcurrentVerifier; + class NonconcurrentVerifier; + + size_t append_buffers(BufferNode* first, BufferNode* last, size_t card_count); + // Verify _num_cards == sum of cards in the completed queue. + void verify_num_cards() const NOT_DEBUG_RETURN; + + struct PauseList { + BufferNode* volatile _head; + BufferNode* _tail; + PauseList() : _head(NULL), _tail(NULL) {} + }; + PauseList _paused[2]; + + void record_paused_buffer(BufferNode* node); + void enqueue_paused_buffers_aux(size_t index); + void enqueue_previous_paused_buffers(); + void enqueue_all_paused_buffers(); + void clear_completed_buffers(); void abandon_completed_buffers(); + size_t _process_cards_threshold; + // Refine the cards in "node" from its index to buffer_size. // Stops processing if SuspendibleThreadSet::should_yield() is true. // Returns true if the entire buffer was processed, false if there @@ -103,9 +131,13 @@ size_t* _mutator_refined_cards_counters; public: - G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator); + G1DirtyCardQueueSet(BufferNode::Allocator* allocator); ~G1DirtyCardQueueSet(); + void set_primary_refinement_thread(G1ConcurrentRefineThread* thread) { + _primary_refinement_thread = thread; + } + // The number of parallel ids that can be claimed to allow collector or // mutator threads to do card-processing work. static uint num_par_ids(); @@ -126,13 +158,6 @@ // The number of cards in completed buffers. Read without synchronization. size_t num_cards() const { return _num_cards; } - // Verify that _num_cards is equal to the sum of actual cards - // in the completed buffers. 
- void verify_num_cards() const NOT_DEBUG_RETURN; - - bool process_completed_buffers() { return _process_completed_buffers; } - void set_process_completed_buffers(bool x) { _process_completed_buffers = x; } - // Get/Set the number of cards that triggers log processing. // Log processing should be done when the number of cards exceeds the // threshold. --- old/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-16 01:05:20.864482048 -0500 +++ new/src/hotspot/share/gc/shared/ptrQueue.hpp 2020-01-16 01:05:20.420458078 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -210,8 +210,6 @@ return offset_of(BufferNode, _buffer); } - static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } - // Allocate a new BufferNode with the "buffer" having size elements. static BufferNode* allocate(size_t size); @@ -219,6 +217,7 @@ static void deallocate(BufferNode* node); public: + static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; } typedef LockFreeStack Stack; BufferNode* next() const { return _next; } --- old/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-16 01:05:22.640577929 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.cpp 2020-01-16 01:05:22.276558278 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -73,7 +73,6 @@ Monitor* STS_lock = NULL; Monitor* FullGCCount_lock = NULL; Monitor* G1OldGCCount_lock = NULL; -Monitor* DirtyCardQ_CBL_mon = NULL; Mutex* Shared_DirtyCardQ_lock = NULL; Mutex* MarkStackFreeList_lock = NULL; Mutex* MarkStackChunkList_lock = NULL; @@ -211,7 +210,6 @@ if (UseG1GC) { def(G1OldGCCount_lock , PaddedMonitor, leaf, true, _safepoint_check_always); - def(DirtyCardQ_CBL_mon , PaddedMonitor, access, true, _safepoint_check_never); def(Shared_DirtyCardQ_lock , PaddedMutex , access + 1, true, _safepoint_check_never); def(FreeList_lock , PaddedMutex , leaf , true, _safepoint_check_never); --- old/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-16 01:05:24.496678130 -0500 +++ new/src/hotspot/share/runtime/mutexLocker.hpp 2020-01-16 01:05:24.112657399 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -69,8 +69,6 @@ extern Monitor* STS_lock; // used for joining/leaving SuspendibleThreadSet. extern Monitor* FullGCCount_lock; // in support of "concurrent" full gc extern Monitor* G1OldGCCount_lock; // in support of "concurrent" full gc -extern Monitor* DirtyCardQ_CBL_mon; // Protects dirty card Q - // completed buffer queue. extern Mutex* Shared_DirtyCardQ_lock; // Lock protecting dirty card // queue shared by // non-Java threads.
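Reviewer aid (not part of the patch): the Semaphore / _should_notify activation protocol introduced in g1ConcurrentRefineThread.{hpp,cpp} above, restated as a minimal standalone C++20 sketch. It uses std::atomic and std::counting_semaphore in place of HotSpot's Atomic:: wrappers and Semaphore class; all names are illustrative only and the refinement work itself is elided.

#include <atomic>
#include <semaphore>

class RefineWorkerSketch {
  std::counting_semaphore<> _notifier{0};
  std::atomic<bool> _should_notify{true};

public:
  // Writer side: request that the worker run. Signals only on the
  // true -> false transition, so the semaphore count stays bounded and
  // redundant activations stay cheap.
  void activate() {
    bool expected = true;
    if (_should_notify.load(std::memory_order_acquire) &&
        _should_notify.compare_exchange_strong(expected, false)) {
      _notifier.release();
    }
  }

  // Reader side: block until some thread has called activate().
  void wait_for_work() {
    while (_should_notify.load(std::memory_order_acquire)) {
      _notifier.acquire();
    }
  }

  // Reader side: called after each refinement step; returns true if the
  // worker should go back to waiting.
  bool maybe_deactivate(bool more_work) {
    if (more_work) {
      // Keep running; suppress redundant notifications in the meantime.
      _should_notify.store(false, std::memory_order_release);
      return false;
    } else if (_should_notify.load(std::memory_order_acquire)) {
      // Notifications were re-enabled on a previous pass and nobody has
      // activated us since: deactivate.
      return true;
    } else {
      // Re-enable notifications and try once more, to close the race with
      // activators whose signals were suppressed; drain stale signals first.
      while (_notifier.try_acquire()) {}
      _should_notify.store(true, std::memory_order_release);
      return false;
    }
  }
};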