--- old/src/hotspot/share/gc/shared/ptrQueue.cpp	2019-01-15 17:45:45.298047544 -0500
+++ new/src/hotspot/share/gc/shared/ptrQueue.cpp	2019-01-15 17:45:45.042034023 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,12 +24,15 @@
 #include "precompiled.hpp"
 #include "gc/shared/ptrQueue.hpp"
+#include "logging/log.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/mutex.hpp"
 #include "runtime/mutexLocker.hpp"
+#include "runtime/orderAccess.hpp"
 #include "runtime/thread.inline.hpp"
+#include "utilities/globalCounter.inline.hpp"
 #include <new>
@@ -85,20 +88,29 @@
   FREE_C_HEAP_ARRAY(char, node);
 }
 
-BufferNode::Allocator::Allocator(size_t buffer_size, Mutex* lock) :
+BufferNode::Allocator::Allocator(const char* name, size_t buffer_size) :
   _buffer_size(buffer_size),
-  _lock(lock),
-  _free_list(NULL),
-  _free_count(0)
+  _pending_list(),
+  _free_list(),
+  _pending_count(0),
+  _free_count(0),
+  _transfer_lock(false)
 {
-  assert(lock != NULL, "precondition");
+  strncpy(_name, name, sizeof(_name));
+  _name[sizeof(_name) - 1] = '\0';
 }
 
 BufferNode::Allocator::~Allocator() {
-  while (_free_list != NULL) {
-    BufferNode* node = _free_list;
-    _free_list = node->next();
-    BufferNode::deallocate(node);
+  delete_list(_free_list.pop_all());
+  delete_list(_pending_list.pop_all());
+}
+
+void BufferNode::Allocator::delete_list(BufferNode* list) {
+  while (list != NULL) {
+    BufferNode* next = list->next();
+    DEBUG_ONLY(list->set_next(NULL);)
+    BufferNode::deallocate(list);
+    list = next;
   }
 }
@@ -107,55 +119,109 @@
 }
 
 BufferNode* BufferNode::Allocator::allocate() {
-  BufferNode* node = NULL;
+  BufferNode* node;
   {
-    MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-    node = _free_list;
-    if (node != NULL) {
-      _free_list = node->next();
-      --_free_count;
-      node->set_next(NULL);
-      node->set_index(0);
-      return node;
-    }
-  }
-  return BufferNode::allocate(_buffer_size);
-}
-
+    // Protect against ABA; see release().
+    GlobalCounter::CriticalSection cs(Thread::current());
+    node = _free_list.pop();
+  }
+  if (node == NULL) {
+    node = BufferNode::allocate(_buffer_size);
+  } else {
+    // Decrement count after getting buffer from free list. This, along
+    // with incrementing count before adding to free list, ensures count
+    // never underflows.
+    size_t count = Atomic::sub(1u, &_free_count);
+    assert((count + 1) != 0, "_free_count underflow");
+  }
+  return node;
+}
+
+// To solve the ABA problem for lock-free stack pop, allocate does the
+// pop inside a critical section, and release synchronizes on the
+// critical sections before adding to the _free_list. But we don't
+// want to make every release have to do a synchronize. Instead, we
+// initially place released nodes on the _pending_list, and transfer
+// them to the _free_list in batches. Only one transfer at a time is
+// permitted, with a lock bit to control access to that phase. A
+// transfer takes all the nodes from the _pending_list, synchronizes on
+// the _free_list pops, and then adds the former pending nodes to the
+// _free_list. While that's happening, other threads might be adding
+// other nodes to the _pending_list, to be dealt with by some later
+// transfer.
 void BufferNode::Allocator::release(BufferNode* node) {
-  MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-  node->set_next(_free_list);
-  _free_list = node;
-  ++_free_count;
+  assert(node != NULL, "precondition");
+  assert(node->next() == NULL, "precondition");
+
+  // Desired minimum transfer batch size. There is relatively little
+  // importance to the specific number. It shouldn't be too big, else
+  // we're wasting space when the release rate is low. If the release
+  // rate is high, we might accumulate more than this before being
+  // able to start a new transfer, but that's okay. Also note that
+  // the allocation rate and the release rate are going to be fairly
+  // similar, due to how the buffers are used.
+  const size_t trigger_transfer = 10;
+
+  // Add to pending list. Update count first so no underflow in transfer.
+  size_t pending_count = Atomic::add(1u, &_pending_count);
+  _pending_list.push(*node);
+  if (pending_count > trigger_transfer) {
+    try_transfer_pending();
+  }
 }
 
-void BufferNode::Allocator::reduce_free_list() {
-  BufferNode* head = NULL;
-  {
-    MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-    // For now, delete half.
-    size_t remove = _free_count / 2;
-    if (remove > 0) {
-      head = _free_list;
-      BufferNode* tail = head;
-      BufferNode* prev = NULL;
-      for (size_t i = 0; i < remove; ++i) {
-        assert(tail != NULL, "free list size is wrong");
-        prev = tail;
-        tail = tail->next();
-      }
-      assert(prev != NULL, "invariant");
-      assert(prev->next() == tail, "invariant");
-      prev->set_next(NULL);
-      _free_list = tail;
-      _free_count -= remove;
+// Try to transfer nodes from _pending_list to _free_list, with a
+// synchronization delay for any in-progress pops from the _free_list,
+// to solve ABA there. Return true if performed a (possibly empty)
+// transfer, false if blocked from doing so by some other thread's
+// in-progress transfer.
+bool BufferNode::Allocator::try_transfer_pending() {
+  // Attempt to claim the lock.
+  if (Atomic::load(&_transfer_lock) || // Skip CAS if likely to fail.
+      Atomic::cmpxchg(true, &_transfer_lock, false)) {
+    return false;
+  }
+  // Have the lock; perform the transfer.
+
+  // Claim all the pending nodes.
+  BufferNode* first = _pending_list.pop_all();
+  if (first != NULL) {
+    // Prepare to add the claimed nodes, and update _pending_count.
+    BufferNode* last = first;
+    size_t count = 1;
+    for (BufferNode* next = first->next(); next != NULL; next = next->next()) {
+      last = next;
+      ++count;
     }
+    Atomic::sub(count, &_pending_count);
+
+    // Wait for any in-progress pops, to avoid ABA for them.
+    GlobalCounter::write_synchronize();
+
+    // Add synchronized nodes to _free_list.
+    // Update count first so no underflow in allocate().
+    Atomic::add(count, &_free_count);
+    _free_list.prepend(*first, *last);
+    log_trace(gc, ptrqueue, freelist)
+             ("Transferred %s pending to free: " SIZE_FORMAT, name(), count);
+  }
+  OrderAccess::release_store(&_transfer_lock, false);
+  return true;
+}
+
+size_t BufferNode::Allocator::reduce_free_list(size_t remove_goal) {
+  try_transfer_pending();
+  size_t removed = 0;
+  for ( ; removed < remove_goal; ++removed) {
+    BufferNode* node = _free_list.pop();
+    if (node == NULL) break;
+    BufferNode::deallocate(node);
   }
-  while (head != NULL) {
-    BufferNode* next = head->next();
-    BufferNode::deallocate(head);
-    head = next;
-  }
+  size_t new_count = Atomic::sub(removed, &_free_count);
+  log_debug(gc, ptrqueue, freelist)
+            ("Reduced %s free list by " SIZE_FORMAT " to " SIZE_FORMAT,
+             name(), removed, new_count);
+  return removed;
 }
 
 PtrQueueSet::PtrQueueSet(bool notify_when_complete) :
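
Editor's aside (not part of the patch): the comment block above describes the allocate/release/transfer scheme in prose; the following is a minimal standalone sketch of that shape using only the C++ standard library. GlobalCounter's RCU-style critical sections and write_synchronize() are replaced by a crude active-reader count, and every name below (FreeListAllocator, ReaderCount, LockFreeStack, Node) is a hypothetical stand-in rather than a HotSpot API.

// Illustrative sketch only -- not HotSpot code. Mirrors the shape of the
// scheme described in the comment block: pop inside a read section,
// release to a pending list, batched transfer after a synchronization wait.
#include <atomic>
#include <cassert>
#include <cstddef>

struct Node { std::atomic<Node*> next{nullptr}; };

// Treiber stack. pop() is ABA-safe here only because callers run it inside a
// ReaderCount::Section and recycled nodes are re-added after synchronize().
class LockFreeStack {
  std::atomic<Node*> _top{nullptr};
public:
  void push(Node& n) { prepend(n, n); }
  void prepend(Node& first, Node& last) {
    Node* old_top = _top.load();
    do {
      last.next.store(old_top);
    } while (!_top.compare_exchange_weak(old_top, &first));
  }
  Node* pop() {
    Node* n = _top.load();
    while (n != nullptr && !_top.compare_exchange_weak(n, n->next.load())) {}
    return n;
  }
  Node* pop_all() { return _top.exchange(nullptr); }
};

// Crude stand-in for GlobalCounter: readers announce themselves, and
// synchronize() spins until no pop is in flight.
class ReaderCount {
  std::atomic<size_t> _active{0};
public:
  struct Section {
    ReaderCount& rc;
    explicit Section(ReaderCount& r) : rc(r) { rc._active.fetch_add(1); }
    ~Section() { rc._active.fetch_sub(1); }
  };
  void synchronize() const { while (_active.load() != 0) { /* spin */ } }
};

class FreeListAllocator {
  LockFreeStack _free_list;
  LockFreeStack _pending_list;
  std::atomic<size_t> _free_count{0};
  std::atomic<size_t> _pending_count{0};
  std::atomic<bool> _transfer_lock{false};
  ReaderCount _readers;
  static constexpr size_t trigger_transfer = 10;  // batch threshold, as in the patch

public:
  Node* allocate() {
    Node* n;
    {
      // Pop inside a read section, so a concurrent transfer cannot recycle a
      // node onto the free list while this pop may still be examining it.
      ReaderCount::Section cs(_readers);
      n = _free_list.pop();
    }
    if (n == nullptr) return new Node();  // free list empty: make a new node
    _free_count.fetch_sub(1);             // decrement after a successful pop
    n->next.store(nullptr);
    return n;
  }

  void release(Node* n) {
    assert(n != nullptr && n->next.load() == nullptr);
    size_t pending = _pending_count.fetch_add(1) + 1;  // count first: no underflow
    _pending_list.push(*n);
    if (pending > trigger_transfer) try_transfer_pending();
  }

  bool try_transfer_pending() {
    bool expected = false;                // only one transfer at a time
    if (_transfer_lock.load() ||
        !_transfer_lock.compare_exchange_strong(expected, true)) {
      return false;
    }
    Node* first = _pending_list.pop_all();
    if (first != nullptr) {
      Node* last = first;
      size_t count = 1;
      for (Node* p = last->next.load(); p != nullptr; p = p->next.load()) {
        last = p;
        ++count;
      }
      _pending_count.fetch_sub(count);
      _readers.synchronize();             // wait out in-flight pops (ABA guard)
      _free_count.fetch_add(count);       // increment before nodes become poppable
      _free_list.prepend(*first, *last);
    }
    _transfer_lock.store(false);
    return true;
  }
};

Batching the transfer amortizes the synchronization wait over many released nodes, so individual release() calls stay cheap; only the thread that wins the transfer lock ever waits on the synchronize step.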