--- old/src/share/vm/gc/g1/dirtyCardQueue.cpp	2016-03-01 21:48:22.694923932 -0500
+++ new/src/share/vm/gc/g1/dirtyCardQueue.cpp	2016-03-01 21:48:22.598923456 -0500
@@ -115,9 +115,8 @@
                                    uint worker_i) {
   bool res = true;
   if (_buf != NULL) {
-    res = apply_closure_to_buffer(cl, _buf, _index, _sz,
-                                  consume,
-                                  worker_i);
+    BufferNode* node = BufferNode::make_node_from_buffer(_buf, _index);
+    res = apply_closure_to_buffer(cl, node, _sz, consume, worker_i);
     if (res && consume) {
       _index = _sz;
     }
@@ -126,25 +125,27 @@
 }
 
 bool DirtyCardQueue::apply_closure_to_buffer(CardTableEntryClosure* cl,
-                                             void** buf,
-                                             size_t index, size_t sz,
+                                             BufferNode* node, size_t sz,
                                              bool consume,
                                              uint worker_i) {
   if (cl == NULL) return true;
+  void** buf = BufferNode::make_buffer_from_node(node);
   size_t limit = byte_index_to_index(sz);
-  for (size_t i = byte_index_to_index(index); i < limit; ++i) {
+  for (size_t i = byte_index_to_index(node->index()); i < limit; ++i) {
     jbyte* card_ptr = static_cast<jbyte*>(buf[i]);
-    if (card_ptr != NULL) {
-      // Set the entry to null, so we don't do it again (via the test
-      // above) if we reconsider this buffer.
+    assert(card_ptr != NULL, "invariant");
+    if (!cl->do_card_ptr(card_ptr, worker_i)) {
       if (consume) {
-        buf[i] = NULL;
-      }
-      if (!cl->do_card_ptr(card_ptr, worker_i)) {
-        return false;
+        size_t new_index = index_to_byte_index(i + 1);
+        assert(new_index <= sz, "invariant");
+        node->set_index(new_index);
       }
+      return false;
     }
   }
+  if (consume) {
+    node->set_index(sz);
+  }
   return true;
 }
 
@@ -188,14 +189,15 @@
   t->dirty_card_queue().handle_zero_index();
 }
 
-bool DirtyCardQueueSet::mut_process_buffer(void** buf) {
+bool DirtyCardQueueSet::mut_process_buffer(BufferNode* node) {
   guarantee(_free_ids != NULL, "must be");
 
   // claim a par id
   uint worker_i = _free_ids->claim_par_id();
 
-  bool b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure, buf, 0,
-                                                   _sz, true, worker_i);
+  bool b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure,
+                                                   node, _sz,
+                                                   true, worker_i);
   if (b) {
     Atomic::inc(&_processed_buffers_mut);
   }
@@ -239,49 +241,30 @@
   if (nd == NULL) {
     return false;
   } else {
-    void** buf = BufferNode::make_buffer_from_node(nd);
-    size_t index = nd->index();
-    if (DirtyCardQueue::apply_closure_to_buffer(cl,
-                                                buf, index, _sz,
-                                                true, worker_i)) {
+    if (DirtyCardQueue::apply_closure_to_buffer(cl, nd, _sz, true, worker_i)) {
       // Done with fully processed buffer.
-      deallocate_buffer(buf);
+      deallocate_buffer(nd);
       Atomic::inc(&_processed_buffers_rs_thread);
       return true;
     } else {
       // Return partially processed buffer to the queue.
-      enqueue_complete_buffer(buf, index);
+      enqueue_complete_buffer(nd);
       return false;
     }
   }
 }
 
-void DirtyCardQueueSet::apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
-  BufferNode* nd = _completed_buffers_head;
-  while (nd != NULL) {
-    bool b =
-      DirtyCardQueue::apply_closure_to_buffer(cl,
-                                              BufferNode::make_buffer_from_node(nd),
-                                              0, _sz, false);
-    guarantee(b, "Should not stop early.");
-    nd = nd->next();
-  }
-}
-
 void DirtyCardQueueSet::par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
   BufferNode* nd = _cur_par_buffer_node;
   while (nd != NULL) {
-    BufferNode* next = (BufferNode*)nd->next();
-    BufferNode* actual = (BufferNode*)Atomic::cmpxchg_ptr((void*)next, (volatile void*)&_cur_par_buffer_node, (void*)nd);
+    BufferNode* next = nd->next();
+    void* actual = Atomic::cmpxchg_ptr(next, &_cur_par_buffer_node, nd);
     if (actual == nd) {
-      bool b =
-        DirtyCardQueue::apply_closure_to_buffer(cl,
-                                                BufferNode::make_buffer_from_node(actual),
-                                                0, _sz, false);
+      bool b = DirtyCardQueue::apply_closure_to_buffer(cl, nd, _sz, false);
       guarantee(b, "Should not stop early.");
       nd = next;
     } else {
-      nd = actual;
+      nd = static_cast<BufferNode*>(actual);
     }
   }
 }
@@ -304,7 +287,7 @@
   while (buffers_to_delete != NULL) {
     BufferNode* nd = buffers_to_delete;
     buffers_to_delete = nd->next();
-    deallocate_buffer(BufferNode::make_buffer_from_node(nd));
+    deallocate_buffer(nd);
   }
 
 }
@@ -320,6 +303,13 @@
   shared_dirty_card_queue()->reset();
 }
 
+void DirtyCardQueueSet::concatenate_log(DirtyCardQueue& dcq) {
+  if (!dcq.is_empty()) {
+    enqueue_complete_buffer(
+      BufferNode::make_node_from_buffer(dcq.get_buf(), dcq.get_index()));
+    dcq.reinitialize();
+  }
+}
 
 void DirtyCardQueueSet::concatenate_logs() {
   // Iterate over all the threads, if we find a partial log add it to
@@ -329,23 +319,9 @@
   _max_completed_queue = max_jint;
   assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
   for (JavaThread* t = Threads::first(); t; t = t->next()) {
-    DirtyCardQueue& dcq = t->dirty_card_queue();
-    if (dcq.size() != 0) {
-      void** buf = dcq.get_buf();
-      // We must NULL out the unused entries, then enqueue.
-      size_t limit = dcq.byte_index_to_index(dcq.get_index());
-      for (size_t i = 0; i < limit; ++i) {
-        buf[i] = NULL;
-      }
-      enqueue_complete_buffer(dcq.get_buf(), dcq.get_index());
-      dcq.reinitialize();
-    }
-  }
-  if (_shared_dirty_card_queue.size() != 0) {
-    enqueue_complete_buffer(_shared_dirty_card_queue.get_buf(),
-                            _shared_dirty_card_queue.get_index());
-    _shared_dirty_card_queue.reinitialize();
+    concatenate_log(t->dirty_card_queue());
   }
+  concatenate_log(_shared_dirty_card_queue);
   // Restore the completed buffer queue limit.
   _max_completed_queue = save_max_completed_queue;
 }
--- old/src/share/vm/gc/g1/dirtyCardQueue.hpp	2016-03-01 21:48:23.254926709 -0500
+++ new/src/share/vm/gc/g1/dirtyCardQueue.hpp	2016-03-01 21:48:23.154926213 -0500
@@ -52,21 +52,22 @@
   // Process queue entries and release resources.
   void flush() { flush_impl(); }
 
-  // Apply the closure to all elements, and reset the index to make the
-  // buffer empty.  If a closure application returns "false", return
-  // "false" immediately, halting the iteration.  If "consume" is true,
-  // deletes processed entries from logs.
+  // Apply the closure to all active elements, from index to size.  If
+  // all closure applications return true, then returns true.  Stops
+  // processing after the first false closure application and returns
+  // false.  If "consume" is true, index is updated to follow the last
+  // processed element.
   bool apply_closure(CardTableEntryClosure* cl,
                      bool consume = true,
                      uint worker_i = 0);
 
-  // Apply the closure to all elements of "buf", down to "index"
-  // (inclusive.)  If returns "false", then a closure application returned
-  // "false", and we return immediately.  If "consume" is true, entries are
-  // set to NULL as they are processed, so they will not be processed again
-  // later.
+  // Apply the closure to all active elements of "node", from it's
+  // index to sz.  If all closure applications return true, then
+  // returns true.  Stops processing after the first false closure
+  // application and returns false.  If "consume" is true, the node's
+  // index is updated to follow the last processed element.
   static bool apply_closure_to_buffer(CardTableEntryClosure* cl,
-                                      void** buf, size_t index, size_t sz,
+                                      BufferNode* node, size_t sz,
                                       bool consume = true,
                                       uint worker_i = 0);
   void **get_buf() { return _buf;}
@@ -95,7 +96,7 @@
   DirtyCardQueue _shared_dirty_card_queue;
 
   // Override.
-  bool mut_process_buffer(void** buf);
+  bool mut_process_buffer(BufferNode* node);
 
   // Protected by the _cbl_mon.
   FreeIdSet* _free_ids;
@@ -107,6 +108,9 @@
 
   // Current buffer node used for parallel iteration.
   BufferNode* volatile _cur_par_buffer_node;
+
+  void concatenate_log(DirtyCardQueue& dcq);
+
 public:
   DirtyCardQueueSet(bool notify_when_complete = true);
 
@@ -126,12 +130,13 @@
   static void handle_zero_index_for_thread(JavaThread* t);
 
   // If there exists some completed buffer, pop it, then apply the
-  // specified closure to all its elements, nulling out those elements
-  // processed.  If all elements are processed, returns "true".  If no
-  // completed buffers exist, returns false.  If a completed buffer exists,
-  // but is only partially completed before a "yield" happens, the
-  // partially completed buffer (with its processed elements set to NULL)
-  // is returned to the completed buffer set, and this call returns false.
+  // specified closure to its active elements.  If all active elements
+  // are processed, returns "true".  If no completed buffers exist,
+  // returns false.  If a completed buffer exists, but is only
+  // partially completed before a "yield" happens, the partially
+  // completed buffer (with its index updated to exclude the processed
+  // eleemnts) is returned to the completed buffer set, and this call
+  // returns false.
   bool apply_closure_to_completed_buffer(CardTableEntryClosure* cl,
                                          uint worker_i,
                                          size_t stop_at,
@@ -139,13 +144,10 @@
 
   BufferNode* get_completed_buffer(size_t stop_at);
 
-  // Applies the current closure to all completed buffers,
-  // non-consumptively.
-  void apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
-
   void reset_for_par_iteration() { _cur_par_buffer_node = _completed_buffers_head; }
   // Applies the current closure to all completed buffers, non-consumptively.
-  // Parallel version.
+  // Can be used in parallel, all callers using the iteration state initialized
+  // by reset_for_par_iteration.
   void par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
 
   DirtyCardQueue* shared_dirty_card_queue() {
--- old/src/share/vm/gc/g1/ptrQueue.cpp	2016-03-01 21:48:23.806929447 -0500
+++ new/src/share/vm/gc/g1/ptrQueue.cpp	2016-03-01 21:48:23.710928971 -0500
@@ -43,16 +43,12 @@
 
 void PtrQueue::flush_impl() {
   if (!_permanent && _buf != NULL) {
+    BufferNode* node = BufferNode::make_node_from_buffer(_buf, _index);
     if (_index == _sz) {
       // No work to do.
-      qset()->deallocate_buffer(_buf);
+      qset()->deallocate_buffer(node);
     } else {
-      // We must NULL out the unused entries, then enqueue.
-      size_t limit = byte_index_to_index(_index);
-      for (size_t i = 0; i < limit; ++i) {
-        _buf[i] = NULL;
-      }
-      qset()->enqueue_complete_buffer(_buf);
+      qset()->enqueue_complete_buffer(node);
     }
     _buf = NULL;
     _index = 0;
@@ -74,7 +70,7 @@
   assert(_index <= _sz, "Invariant.");
 }
 
-void PtrQueue::locking_enqueue_completed_buffer(void** buf) {
+void PtrQueue::locking_enqueue_completed_buffer(BufferNode* node) {
   assert(_lock->owned_by_self(), "Required.");
 
   // We have to unlock _lock (which may be Shared_DirtyCardQ_lock) before
@@ -82,7 +78,7 @@
   // have the same rank and we may get the "possible deadlock" message
   _lock->unlock();
 
-  qset()->enqueue_complete_buffer(buf);
+  qset()->enqueue_complete_buffer(node);
   // We must relock only because the caller will unlock, for the normal
   // case.
   _lock->lock_without_safepoint_check();
@@ -157,10 +153,9 @@
   return BufferNode::make_buffer_from_node(node);
 }
 
-void PtrQueueSet::deallocate_buffer(void** buf) {
+void PtrQueueSet::deallocate_buffer(BufferNode* node) {
   assert(_sz > 0, "Didn't set a buffer size.");
   MutexLockerEx x(_fl_owner->_fl_lock, Mutex::_no_safepoint_check_flag);
-  BufferNode *node = BufferNode::make_node_from_buffer(buf);
   node->set_next(_fl_owner->_buf_free_list);
   _fl_owner->_buf_free_list = node;
   _fl_owner->_buf_free_list_sz++;
@@ -211,10 +206,10 @@
       // preventing the subsequent the multiple enqueue, and
       // install a newly allocated buffer below.
 
-      void** buf = _buf;   // local pointer to completed buffer
+      BufferNode* node = BufferNode::make_node_from_buffer(_buf, _index);
       _buf = NULL;         // clear shared _buf field
 
-      locking_enqueue_completed_buffer(buf);  // enqueue completed buffer
+      locking_enqueue_completed_buffer(node); // enqueue completed buffer
 
       // While the current thread was enqueueing the buffer another thread
       // may have a allocated a new buffer and inserted it into this pointer
@@ -224,9 +219,11 @@
 
       if (_buf != NULL) return;
     } else {
-      if (qset()->process_or_enqueue_complete_buffer(_buf)) {
+      BufferNode* node = BufferNode::make_node_from_buffer(_buf, _index);
+      if (qset()->process_or_enqueue_complete_buffer(node)) {
         // Recycle the buffer. No allocation.
-        _sz = qset()->buffer_size();
+        assert(_buf == BufferNode::make_buffer_from_node(node), "invariant");
+        assert(_sz == qset()->buffer_size(), "invariant");
         _index = _sz;
         return;
       }
@@ -238,12 +235,12 @@
   _index = _sz;
 }
 
-bool PtrQueueSet::process_or_enqueue_complete_buffer(void** buf) {
+bool PtrQueueSet::process_or_enqueue_complete_buffer(BufferNode* node) {
   if (Thread::current()->is_Java_thread()) {
     // We don't lock. It is fine to be epsilon-precise here.
     if (_max_completed_queue == 0 || _max_completed_queue > 0 &&
         _n_completed_buffers >= _max_completed_queue + _completed_queue_padding) {
-      bool b = mut_process_buffer(buf);
+      bool b = mut_process_buffer(node);
       if (b) {
         // True here means that the buffer hasn't been deallocated and the caller may reuse it.
         return true;
@@ -251,14 +248,12 @@
     }
   }
   // The buffer will be enqueued. The caller will have to get a new one.
-  enqueue_complete_buffer(buf);
+  enqueue_complete_buffer(node);
   return false;
 }
 
-void PtrQueueSet::enqueue_complete_buffer(void** buf, size_t index) {
+void PtrQueueSet::enqueue_complete_buffer(BufferNode* cbn) {
   MutexLockerEx x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  BufferNode* cbn = BufferNode::make_node_from_buffer(buf);
-  cbn->set_index(index);
   cbn->set_next(NULL);
   if (_completed_buffers_tail == NULL) {
     assert(_completed_buffers_head == NULL, "Well-formedness");
--- old/src/share/vm/gc/g1/ptrQueue.hpp	2016-03-01 21:48:24.358932184 -0500
+++ new/src/share/vm/gc/g1/ptrQueue.hpp	2016-03-01 21:48:24.254931668 -0500
@@ -33,6 +33,7 @@
 // the addresses of modified old-generation objects.  This type supports
 // this operation.
 
+class BufferNode;
 class PtrQueueSet;
 class PtrQueue VALUE_OBJ_CLASS_SPEC {
   friend class VMStructs;
@@ -104,7 +105,7 @@
   // get into an infinite loop).
   virtual bool should_enqueue_buffer() { return true; }
   void handle_zero_index();
-  void locking_enqueue_completed_buffer(void** buf);
+  void locking_enqueue_completed_buffer(BufferNode* node);
 
   void enqueue_known_active(void* ptr);
 
@@ -136,6 +137,10 @@
     return ind / sizeof(void*);
   }
 
+  static size_t index_to_byte_index(size_t ind) {
+    return ind * sizeof(void*);
+  }
+
   // To support compiler.
 
 protected:
@@ -186,10 +191,13 @@
   // Free a BufferNode.
   static void deallocate(BufferNode* node);
 
-  // Return the BufferNode containing the buffer.
-  static BufferNode* make_node_from_buffer(void** buffer) {
-    return reinterpret_cast<BufferNode*>(
-      reinterpret_cast<char*>(buffer) - buffer_offset());
+  // Return the BufferNode containing the buffer, after setting its index.
+  static BufferNode* make_node_from_buffer(void** buffer, size_t index) {
+    BufferNode* node =
+      reinterpret_cast<BufferNode*>(
+        reinterpret_cast<char*>(buffer) - buffer_offset());
+    node->set_index(index);
+    return node;
   }
 
   // Return the buffer for node.
@@ -243,7 +251,7 @@
   // A mutator thread does the the work of processing a buffer.
   // Returns "true" iff the work is complete (and the buffer may be
   // deallocated).
-  virtual bool mut_process_buffer(void** buf) {
+  virtual bool mut_process_buffer(BufferNode* node) {
     ShouldNotReachHere();
     return false;
   }
@@ -267,13 +275,13 @@
 
   // Return an empty buffer to the free list.  The "buf" argument is
   // required to be a pointer to the head of an array of length "_sz".
-  void deallocate_buffer(void** buf);
+  void deallocate_buffer(BufferNode* node);
 
   // Declares that "buf" is a complete buffer.
-  void enqueue_complete_buffer(void** buf, size_t index = 0);
+  void enqueue_complete_buffer(BufferNode* node);
 
   // To be invoked by the mutator.
-  bool process_or_enqueue_complete_buffer(void** buf);
+  bool process_or_enqueue_complete_buffer(BufferNode* node);
 
   bool completed_buffers_exist_dirty() {
     return _n_completed_buffers > 0;
--- old/src/share/vm/gc/g1/satbMarkQueue.cpp	2016-03-01 21:48:24.938935060 -0500
+++ new/src/share/vm/gc/g1/satbMarkQueue.cpp	2016-03-01 21:48:24.842934584 -0500
@@ -100,6 +100,10 @@
   return true;
 }
 
+inline bool retain_entry(const void* entry, G1CollectedHeap* heap) {
+  return requires_marking(entry, heap) && !heap->isMarkedNext((oop)entry);
+}
+
 // This method removes entries from a SATB buffer that will not be
 // useful to the concurrent marking threads.  Entries are retained if
 // they require marking and are not already marked. Retained entries
@@ -114,43 +118,23 @@
     return;
   }
 
-  // Used for sanity checking at the end of the loop.
-  DEBUG_ONLY(size_t entries = 0; size_t retained = 0;)
-
   assert(_index <= _sz, "invariant");
-  void** limit = &buf[byte_index_to_index(_index)];
-  void** src = &buf[byte_index_to_index(_sz)];
-  void** dst = src;
-
-  while (limit < src) {
-    DEBUG_ONLY(entries += 1;)
-    --src;
+
+  // Two-fingered compaction toward the end.
+  void** src = &buf[byte_index_to_index(_index)];
+  void** dst = &buf[byte_index_to_index(_sz)];
+  for ( ; src < dst; ++src) {
+    // Search low to high for an entry to keep.
     void* entry = *src;
-    // NULL the entry so that unused parts of the buffer contain NULLs
-    // at the end. If we are going to retain it we will copy it to its
-    // final place. If we have retained all entries we have visited so
-    // far, we'll just end up copying it to the same place.
-    *src = NULL;
-
-    if (requires_marking(entry, g1h) && !g1h->isMarkedNext((oop)entry)) {
-      --dst;
-      assert(*dst == NULL, "filtering destination should be clear");
-      *dst = entry;
-      DEBUG_ONLY(retained += 1;);
+    if (retain_entry(entry, g1h)) {
+      // Found keeper.  Search high to low for an entry to discard.
+      while ((src < --dst) && retain_entry(*dst, g1h)) { }
+      if (src >= dst) break;    // Done if no discard found.
+      *dst = entry;             // Replace discard with keeper.
     }
   }
-  size_t new_index = pointer_delta(dst, buf, 1);
-
-#ifdef ASSERT
-  size_t entries_calc = (_sz - _index) / sizeof(void*);
-  assert(entries == entries_calc, "the number of entries we counted "
-         "should match the number of entries we calculated");
-  size_t retained_calc = (_sz - new_index) / sizeof(void*);
-  assert(retained == retained_calc, "the number of retained entries we counted "
-         "should match the number of retained entries we calculated");
-#endif // ASSERT
-
-  _index = new_index;
+  assert(src == dst, "invariant");
+  _index = pointer_delta(dst, buf, 1);
 }
 
 // This method will first apply the above filtering to the buffer. If
@@ -286,19 +270,11 @@
   }
   if (nd != NULL) {
     void **buf = BufferNode::make_buffer_from_node(nd);
-    // Skip over NULL entries at beginning (e.g. push end) of buffer.
-    // Filtering can result in non-full completed buffers; see
-    // should_enqueue_buffer.
-    assert(_sz % sizeof(void*) == 0, "invariant");
-    size_t limit = SATBMarkQueue::byte_index_to_index(_sz);
-    for (size_t i = 0; i < limit; ++i) {
-      if (buf[i] != NULL) {
-        // Found the end of the block of NULLs; process the remainder.
-        cl->do_buffer(buf + i, limit - i);
-        break;
-      }
-    }
-    deallocate_buffer(buf);
+    size_t index = SATBMarkQueue::byte_index_to_index(nd->index());
+    size_t size = SATBMarkQueue::byte_index_to_index(_sz);
+    assert(index <= size, "invariant");
+    cl->do_buffer(buf + index, size - index);
+    deallocate_buffer(nd);
     return true;
   } else {
     return false;
@@ -355,7 +331,7 @@
   while (buffers_to_delete != NULL) {
     BufferNode* nd = buffers_to_delete;
     buffers_to_delete = nd->next();
-    deallocate_buffer(BufferNode::make_buffer_from_node(nd));
+    deallocate_buffer(nd);
   }
   assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
   // So we can safely manipulate these queues.
--- old/src/share/vm/gc/g1/satbMarkQueue.hpp	2016-03-01 21:48:25.486937777 -0500
+++ new/src/share/vm/gc/g1/satbMarkQueue.hpp	2016-03-01 21:48:25.390937301 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -115,9 +115,8 @@
 
   // If there exists some completed buffer, pop and process it, and
   // return true.  Otherwise return false.  Processing a buffer
-  // consists of applying the closure to the buffer range starting
-  // with the first non-NULL entry to the end of the buffer; the
-  // leading entries may be NULL due to filtering.
+  // consists of applying the closure to the active range of the
+  // buffer; the leading entries may be excluded due to filtering.
   bool apply_closure_to_completed_buffer(SATBBufferClosure* cl);
 
 #ifndef PRODUCT