--- old/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	2015-03-05 15:35:36.726410439 +0100
+++ new/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	2015-03-05 15:35:36.648408121 +0100
@@ -46,6 +46,7 @@
 #include "gc_implementation/g1/g1MarkSweep.hpp"
 #include "gc_implementation/g1/g1OopClosures.inline.hpp"
 #include "gc_implementation/g1/g1ParScanThreadState.inline.hpp"
+#include "gc_implementation/g1/g1EvacStats.hpp"
 #include "gc_implementation/g1/g1RegionToSpaceMapper.hpp"
 #include "gc_implementation/g1/g1RemSet.inline.hpp"
 #include "gc_implementation/g1/g1StringDedup.hpp"
@@ -668,7 +669,7 @@
   check_bitmaps("Humongous Region Allocation", first_hr);
 
   assert(first_hr->used() == word_size * HeapWordSize, "invariant");
-  _allocator->increase_used(first_hr->used());
+  increase_used(first_hr->used());
   _humongous_set.add(first_hr);
 
   return new_obj;
@@ -851,22 +852,20 @@
     {
       MutexLockerEx x(Heap_lock);
 
-      result = _allocator->mutator_alloc_region(context)->attempt_allocation_locked(word_size,
-                                                                                    false /* bot_updates */);
+      result = _allocator->par_allocate_during_mutator_locked(word_size, false /* bot_updates */, context);
       if (result != NULL) {
         return result;
       }
 
       // If we reach here, attempt_allocation_locked() above failed to
       // allocate a new region. So the mutator alloc region should be NULL.
-      assert(_allocator->mutator_alloc_region(context)->get() == NULL, "only way to get here");
+//      assert(_allocator->mutator_alloc_region(context)->get() == NULL, "only way to get here");
 
       if (GC_locker::is_active_and_needs_gc()) {
         if (g1_policy()->can_expand_young_list()) {
           // No need for an ergo verbose message here,
           // can_expand_young_list() does this when it returns true.
-          result = _allocator->mutator_alloc_region(context)->attempt_allocation_force(word_size,
-                                                                                       false /* bot_updates */);
+          result = _allocator->par_allocate_during_mutator_force(word_size, false /* bot_updates */, context);
           if (result != NULL) {
             return result;
           }
@@ -926,8 +925,7 @@
     // first attempt (without holding the Heap_lock) here and the
     // follow-on attempt will be at the start of the next loop
     // iteration (after taking the Heap_lock).
-    result = _allocator->mutator_alloc_region(context)->attempt_allocation(word_size,
-                                                                           false /* bot_updates */);
+    result = _allocator->par_allocate_during_mutator(word_size, false /* bot_updates */, context);
     if (result != NULL) {
       return result;
     }
@@ -1066,13 +1064,13 @@
                                                            AllocationContext_t context,
                                                            bool expect_null_mutator_alloc_region) {
   assert_at_safepoint(true /* should_be_vm_thread */);
+/*
   assert(_allocator->mutator_alloc_region(context)->get() == NULL ||
          !expect_null_mutator_alloc_region,
          "the current alloc region was unexpectedly found to be non-NULL");
-
+*/
   if (!is_humongous(word_size)) {
-    return _allocator->mutator_alloc_region(context)->attempt_allocation_locked(word_size,
-                                                                                false /* bot_updates */);
+    return _allocator->par_allocate_during_mutator_locked(word_size, false /* bot_updates */, context);
   } else {
     HeapWord* result = humongous_obj_allocate(word_size, context);
     if (result != NULL && g1_policy()->need_to_start_conc_mark("STW humongous allocation")) {
@@ -1783,8 +1781,6 @@
   _free_regions_coming(false),
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
-  _survivor_plab_stats(YoungPLABSize, PLABWeight),
-  _old_plab_stats(OldPLABSize, PLABWeight),
   _expand_heap_after_alloc_failure(true),
   _surviving_young_words(NULL),
   _old_marking_cycles_started(0),
@@ -2218,11 +2214,11 @@
 
 // Computes the sum of the storage used by the various regions.
 size_t G1CollectedHeap::used() const {
-  return _allocator->used();
+  return _summary_bytes_used + _allocator->used_in_alloc_regions();
 }
 
 size_t G1CollectedHeap::used_unlocked() const {
-  return _allocator->used_unlocked();
+  return _summary_bytes_used;
 }
 
 class SumUsedClosure: public HeapRegionClosure {
@@ -2716,20 +2712,7 @@
 }
 
 size_t G1CollectedHeap::unsafe_max_tlab_alloc(Thread* ignored) const {
-  // Return the remaining space in the cur alloc region, but not less than
-  // the min TLAB size.
-
-  // Also, this value can be at most the humongous object threshold,
-  // since we can't allow tlabs to grow big enough to accommodate
-  // humongous objects.
-
-  HeapRegion* hr = _allocator->mutator_alloc_region(AllocationContext::current())->get();
-  size_t max_tlab = max_tlab_size() * wordSize;
-  if (hr == NULL) {
-    return max_tlab;
-  } else {
-    return MIN2(MAX2(hr->free(), (size_t) MinTLABSize), max_tlab);
-  }
+  return _allocator->unsafe_max_tlab_alloc();
 }
 
 size_t G1CollectedHeap::max_capacity() const {
@@ -3938,7 +3921,7 @@
       _young_list->reset_auxilary_lists();
 
       if (evacuation_failed()) {
-        _allocator->set_used(recalculate_used());
+        set_used(recalculate_used());
         uint n_queues = MAX2((int)ParallelGCThreads, 1);
         for (uint i = 0; i < n_queues; i++) {
           if (_evacuation_failed_info_array[i].has_failed()) {
@@ -3948,7 +3931,7 @@
       } else {
         // The "used" of the the collection set have already been subtracted
        // when they were freed.  Add in the bytes evacuated.
-        _allocator->increase_used(g1_policy()->bytes_copied_during_gc());
+        increase_used(g1_policy()->bytes_copied_during_gc());
       }
 
       if (g1_policy()->during_initial_mark_pause()) {
@@ -4155,7 +4138,7 @@
     // Forward-to-self succeeded.
     assert(_par_scan_state != NULL, "par scan state");
     OopsInHeapRegionClosure* cl = _par_scan_state->evac_failure_closure();
-    uint queue_num = _par_scan_state->queue_num();
+    uint queue_num = _par_scan_state->worker_queue_id();
 
     _evacuation_failed = true;
     _evacuation_failed_info_array[queue_num].register_copy_failure(old->size());
@@ -4257,7 +4240,7 @@
 
   oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
 
-  assert(_worker_id == _par_scan_state->queue_num(), "sanity");
+  assert(_worker_id == _par_scan_state->worker_queue_id(), "sanity");
 
   const InCSetState state = _g1->in_cset_state(obj);
   if (state.is_in_cset()) {
@@ -4299,6 +4282,13 @@
 template void G1ParCopyClosure<G1BarrierEvac, G1MarkNone>::do_oop_work(narrowOop* p);
 
 class G1ParEvacuateFollowersClosure : public VoidClosure {
+private:
+  double _start_term;
+  double _term_time;
+  size_t _term_attempts;
+
+  void start_term_time() { _term_attempts++; _start_term = os::elapsedTime(); }
+  void end_term_time() { _term_time += os::elapsedTime() - _start_term; }
 protected:
   G1CollectedHeap* _g1h;
   G1ParScanThreadState* _par_scan_state;
@@ -4315,19 +4305,23 @@
                                 RefToScanQueueSet* queues,
                                 ParallelTaskTerminator* terminator)
     : _g1h(g1h), _par_scan_state(par_scan_state),
-      _queues(queues), _terminator(terminator) {}
+      _queues(queues), _terminator(terminator),
+      _start_term(0.0), _term_time(0.0), _term_attempts(0) {}
 
   void do_void();
 
+  double term_time() const { return _term_time; }
+  size_t term_attempts() const { return _term_attempts; }
+
 private:
   inline bool offer_termination();
 };
 
 bool G1ParEvacuateFollowersClosure::offer_termination() {
   G1ParScanThreadState* const pss = par_scan_state();
-  pss->start_term_time();
+  start_term_time();
   const bool res = terminator()->offer_termination();
-  pss->end_term_time();
+  end_term_time();
   return res;
 }
 
@@ -4418,20 +4412,18 @@
 class G1ParTask : public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
-  RefToScanQueueSet *_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet* _queues;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
-  Mutex _stats_lock;
-  Mutex* stats_lock() { return &_stats_lock; }
-
 public:
-  G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues)
+  G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** pss, RefToScanQueueSet *task_queues)
     : AbstractGangTask("G1 collection"),
       _g1h(g1h),
+      _pss(pss),
       _queues(task_queues),
-      _terminator(0, _queues),
-      _stats_lock(Mutex::leaf, "parallel G1 stats lock", true)
+      _terminator(0, _queues)
   {}
 
   RefToScanQueueSet* queues() { return _queues; }
@@ -4495,26 +4487,24 @@
 
       ReferenceProcessor*             rp = _g1h->ref_processor_stw();
 
-      G1ParScanThreadState            pss(_g1h, worker_id, rp);
-      G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp);
-
-      pss.set_evac_failure_closure(&evac_failure_cl);
+      G1ParScanThreadState*           pss = _pss[worker_id];
+      pss->set_ref_processor(rp);
 
       bool only_young = _g1h->g1_policy()->gcs_are_young();
 
       // Non-IM young GC.
-      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkNone>                                scan_only_cld_cl(&scan_only_root_cl,
                                                                                only_young, // Only process dirty klasses.
                                                                                false);     // No need to claim CLDs.
       // IM young GC.
       //    Strong roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkFromRoot>                            scan_mark_cld_cl(&scan_mark_root_cl,
                                                                                false, // Process all klasses.
                                                                                true); // Need to claim CLDs.
       //    Weak roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkPromotedFromRoot>                    scan_mark_weak_cld_cl(&scan_mark_weak_root_cl,
                                                                                     false, // Process all klasses.
                                                                                     true); // Need to claim CLDs.
@@ -4550,9 +4540,9 @@
         }
       }
 
-      G1ParPushHeapRSClosure          push_heap_rs_cl(_g1h, &pss);
+      G1ParPushHeapRSClosure          push_heap_rs_cl(_g1h, pss);
 
-      pss.start_strong_roots();
+      double start_strong_roots = os::elapsedTime();
       _g1h->g1_process_roots(strong_root_cl,
                              weak_root_cl,
                              &push_heap_rs_cl,
@@ -4560,28 +4550,36 @@
                              weak_cld_cl,
                              strong_code_cl,
                              worker_id);
-
-      pss.end_strong_roots();
-
+      double strong_roots_time = os::elapsedTime() - start_strong_roots;
+
+      double evac_term_time = 0.0;
+      size_t evac_term_attempts = 0;
       {
         double start = os::elapsedTime();
-        G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator);
+        G1ParEvacuateFollowersClosure evac(_g1h, pss, _queues, &_terminator);
         evac.do_void();
+
         double elapsed_ms = (os::elapsedTime()-start)*1000.0;
-        double term_ms = pss.term_time()*1000.0;
-        _g1h->g1_policy()->phase_times()->add_obj_copy_time(worker_id, elapsed_ms-term_ms);
-        _g1h->g1_policy()->phase_times()->record_termination(worker_id, term_ms, pss.term_attempts());
-      }
-      _g1h->g1_policy()->record_thread_age_table(pss.age_table());
-      _g1h->update_surviving_young_words(pss.surviving_young_words()+1);
+        evac_term_attempts = evac.term_attempts();
+        evac_term_time = evac.term_time();
+        double term_ms = evac_term_time * 1000.0;
+        _g1h->g1_policy()->phase_times()->add_obj_copy_time(worker_id, elapsed_ms - term_ms);
+        _g1h->g1_policy()->phase_times()->record_termination(worker_id, term_ms, evac.term_attempts());
+      }
+      _g1h->g1_policy()->record_thread_age_table(pss->age_table());
+      _g1h->update_surviving_young_words(pss->surviving_young_words()+1);
+
+      assert(pss->queue_is_empty(), "should be empty");
 
       if (PrintTerminationStats) {
-        MutexLocker x(stats_lock());
-        pss.print_termination_stats(worker_id);
+        MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
+        _g1h->print_termination_stats(gclog_or_tty,
+                                      worker_id,
+                                      os::elapsedTime() * 1000.0 - start_time_ms, /* elapsed time */
+                                      strong_roots_time * 1000.0,                 /* strong roots time */
+                                      evac_term_time * 1000.0,                    /* evac term time */
+                                      evac_term_attempts,                         /* evac term attempts */
+                                      pss->lab_waste(),                           /* alloc buffer waste */
+                                      pss->lab_undo_waste()                       /* undo waste */
+                                      );
       }
-
-      assert(pss.queue_is_empty(), "should be empty");
 
       // Close the inner scope so that the ResourceMark and HandleMark
       // destructors are executed here and are included as part of the
      // "GC Worker Time".
@@ -4592,6 +4590,31 @@
   }
 };
 
+void G1CollectedHeap::print_termination_stats_hdr(outputStream* const st) {
+  st->print_raw_cr("GC Termination Stats");
+  st->print_raw_cr("     elapsed  --strong roots-- -------termination------- ------waste (KiB)------");
+  st->print_raw_cr("thr     ms        ms      %        ms      %    attempts  total   alloc    undo");
+  st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------");
+}
+
+void G1CollectedHeap::print_termination_stats(outputStream* const st,
+                                              uint worker_id,
+                                              double elapsed_ms,
+                                              double strong_roots_ms,
+                                              double term_ms,
+                                              size_t term_attempts,
+                                              size_t alloc_buffer_waste,
+                                              size_t undo_waste) const {
+  st->print_cr("%3d %9.2f %9.2f %6.2f "
+               "%9.2f %6.2f " SIZE_FORMAT_W(8) " "
+               SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7),
+               worker_id, elapsed_ms, strong_roots_ms, strong_roots_ms * 100 / elapsed_ms,
+               term_ms, term_ms * 100 / elapsed_ms, term_attempts,
+               (alloc_buffer_waste + undo_waste) * HeapWordSize / K,
+               alloc_buffer_waste * HeapWordSize / K,
+               undo_waste * HeapWordSize / K);
+}
+
 // *** Common G1 Evacuation Stuff
 
 // This method is run in a GC worker.
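
[Review note] The hunks above move per-worker termination bookkeeping out of G1ParScanThreadState and into G1ParEvacuateFollowersClosure itself, and route the printout through G1CollectedHeap under the shared ParGCRareEvent_lock instead of a dedicated per-task stats mutex. A minimal standalone sketch of the accumulation pattern the closure now uses is below; TerminationStats and main() are illustrative stand-ins, not HotSpot API:

    #include <chrono>
    #include <cstddef>
    #include <cstdio>

    // Illustrative stand-in for the accounting added to
    // G1ParEvacuateFollowersClosure: every termination offer bumps the
    // attempt count and accumulates the time spent waiting.
    class TerminationStats {
      typedef std::chrono::steady_clock clock;
      clock::time_point _start;      // start of the current termination attempt
      double            _term_time;  // accumulated seconds waiting to terminate
      size_t            _attempts;   // number of offer_termination() calls

    public:
      TerminationStats() : _term_time(0.0), _attempts(0) {}

      void start_term_time() { _attempts++; _start = clock::now(); }
      void end_term_time() {
        _term_time += std::chrono::duration<double>(clock::now() - _start).count();
      }

      double term_time() const { return _term_time; }  // seconds
      size_t attempts() const  { return _attempts; }
    };

    int main() {
      TerminationStats stats;
      stats.start_term_time();
      // ... the terminator wait would happen here ...
      stats.end_term_time();
      std::printf("termination: %.3f ms over %zu attempts\n",
                  stats.term_time() * 1000.0, stats.attempts());
      return 0;
    }

Each offer_termination() call brackets only the terminator wait, so term_time() excludes object-copy work and can be subtracted from the elapsed phase time, which is exactly how add_obj_copy_time() is fed in the hunk above.
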
@@ -5238,17 +5261,20 @@
 
 class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
 private:
-  G1CollectedHeap*   _g1h;
-  RefToScanQueueSet* _queues;
-  FlexibleWorkGang*  _workers;
-  int                _active_workers;
+  G1CollectedHeap*       _g1h;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet*     _queues;
+  FlexibleWorkGang*      _workers;
+  int                    _active_workers;
 
 public:
   G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
-                           FlexibleWorkGang* workers,
-                           RefToScanQueueSet *task_queues,
-                           int n_workers) :
+                           G1ParScanThreadState** pss,
+                           FlexibleWorkGang* workers,
+                           RefToScanQueueSet *task_queues,
+                           int n_workers) :
     _g1h(g1h),
+    _pss(pss),
     _queues(task_queues),
     _workers(workers),
     _active_workers(n_workers)
@@ -5267,17 +5293,20 @@
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   ProcessTask&     _proc_task;
   G1CollectedHeap* _g1h;
-  RefToScanQueueSet *_task_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet* _task_queues;
   ParallelTaskTerminator* _terminator;
 
 public:
   G1STWRefProcTaskProxy(ProcessTask& proc_task,
-                     G1CollectedHeap* g1h,
-                     RefToScanQueueSet *task_queues,
-                     ParallelTaskTerminator* terminator) :
+                        G1CollectedHeap* g1h,
+                        G1ParScanThreadState** pss,
+                        RefToScanQueueSet *task_queues,
+                        ParallelTaskTerminator* terminator) :
     AbstractGangTask("Process reference objects in parallel"),
     _proc_task(proc_task),
     _g1h(g1h),
+    _pss(pss),
     _task_queues(task_queues),
     _terminator(terminator)
   {}
 
@@ -5289,14 +5318,12 @@
 
     G1STWIsAliveClosure is_alive(_g1h);
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
-
-    pss.set_evac_failure_closure(&evac_failure_cl);
+    G1ParScanThreadState*           pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
 
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
 
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5306,10 +5333,10 @@
     }
 
     // Keep alive closure.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     // Complete GC closure
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _task_queues, _terminator);
 
     // Call the reference processing task's work routine.
     _proc_task.work(worker_id, is_alive, keep_alive, drain_queue);
@@ -5328,7 +5355,7 @@
   assert(_workers != NULL, "Need parallel worker threads.");
 
   ParallelTaskTerminator terminator(_active_workers, _queues);
-  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator);
+  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _pss, _queues, &terminator);
 
   _g1h->set_par_threads(_active_workers);
   _workers->run_task(&proc_task_proxy);
@@ -5375,14 +5402,16 @@
 class G1ParPreserveCMReferentsTask: public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
-  RefToScanQueueSet *_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet *_queues;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
 public:
-  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h,int workers, RefToScanQueueSet *task_queues) :
+  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** pss, int workers, RefToScanQueueSet *task_queues) :
     AbstractGangTask("ParPreserveCMReferents"),
     _g1h(g1h),
+    _pss(pss),
     _queues(task_queues),
     _terminator(workers, _queues),
     _n_workers(workers)
@@ -5392,16 +5421,13 @@
     ResourceMark rm;
     HandleMark   hm;
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
+    G1ParScanThreadState*           pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
+    assert(pss->queue_is_empty(), "both queue and overflow should be empty");
 
-    pss.set_evac_failure_closure(&evac_failure_cl);
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
 
-    assert(pss.queue_is_empty(), "both queue and overflow should be empty");
-
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
-
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5415,7 +5441,7 @@
 
     // Copying keep alive closure. Applied to referent objects that need
     // to be copied.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     ReferenceProcessor* rp = _g1h->ref_processor_cm();
 
@@ -5448,15 +5474,15 @@
     }
 
     // Drain the queue - which may cause stealing
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _queues, &_terminator);
     drain_queue.do_void();
     // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure
-    assert(pss.queue_is_empty(), "should be");
+    assert(pss->queue_is_empty(), "should be");
   }
 };
 
 // Weak Reference processing during an evacuation pause (part 1).
-void G1CollectedHeap::process_discovered_references(uint no_of_gc_workers) {
+void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** pss_, uint no_of_gc_workers) {
   double ref_proc_start = os::elapsedTime();
 
   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5487,6 +5513,7 @@
     set_par_threads(no_of_gc_workers);
     G1ParPreserveCMReferentsTask keep_cm_referents(this,
+                                                   pss_,
                                                    no_of_gc_workers,
                                                    _task_queues);
 
@@ -5503,20 +5530,17 @@
   // JNI refs.
 
   // Use only a single queue for this PSS.
-  G1ParScanThreadState            pss(this, 0, NULL);
+  G1ParScanThreadState*           pss = pss_[0];
+  pss->set_ref_processor(NULL);
+  assert(pss->queue_is_empty(), "pre-condition");
 
   // We do not embed a reference processor in the copying/scanning
   // closures while we're actually processing the discovered
   // reference objects.
-  G1ParScanHeapEvacFailureClosure evac_failure_cl(this, &pss, NULL);
-
-  pss.set_evac_failure_closure(&evac_failure_cl);
-
-  assert(pss.queue_is_empty(), "pre-condition");
-
-  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, &pss, NULL);
+  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, pss, NULL);
 
-  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL);
+  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, pss, NULL);
 
   OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5526,10 +5550,10 @@
   }
 
   // Keep alive closure.
-  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, &pss);
+  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, pss);
 
   // Serial Complete GC closure
-  G1STWDrainQueueClosure drain_queue(this, &pss);
+  G1STWDrainQueueClosure drain_queue(this, pss);
 
   // Setup the soft refs policy...
   rp->setup_policy(false);
@@ -5548,7 +5572,7 @@
     assert(rp->num_q() == no_of_gc_workers, "sanity");
     assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
 
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers);
+    G1STWRefProcTaskExecutor par_task_executor(this, pss_, workers(), _task_queues, no_of_gc_workers);
 
     stats = rp->process_discovered_references(&is_alive,
                                               &keep_alive,
                                               &drain_queue,
@@ -5560,7 +5584,7 @@
   _gc_tracer_stw->report_gc_reference_stats(stats);
 
   // We have completed copying any necessary live referent objects.
-  assert(pss.queue_is_empty(), "both queue and overflow should be empty");
+  assert(pss->queue_is_empty(), "both queue and overflow should be empty");
 
   double ref_proc_time = os::elapsedTime() - ref_proc_start;
   g1_policy()->phase_times()->record_ref_proc_time(ref_proc_time * 1000.0);
@@ -5586,7 +5610,7 @@
   assert(rp->num_q() == no_of_gc_workers, "sanity");
   assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
 
-  G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers);
+  G1STWRefProcTaskExecutor par_task_executor(this, NULL, workers(), _task_queues, no_of_gc_workers);
 
   rp->enqueue_discovered_references(&par_task_executor);
 }
@@ -5627,7 +5651,12 @@
   workers()->set_active_workers(n_workers);
   set_par_threads(n_workers);
 
-  G1ParTask g1_par_task(this, _task_queues);
+  G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC);
+  for (uint i = 0; i < n_workers; i++) {
+    per_thread_states[i] = new G1ParScanThreadState(this, i);
+  }
+
+  G1ParTask g1_par_task(this, per_thread_states, _task_queues);
 
   init_for_evac_failure(NULL);
 
@@ -5643,7 +5672,9 @@
   }
 
   // The individual threads will set their evac-failure closures.
-  if (PrintTerminationStats) G1ParScanThreadState::print_termination_stats_hdr();
+  if (PrintTerminationStats) {
+    print_termination_stats_hdr(gclog_or_tty);
+  }
 
   // These tasks use ShareHeap::_process_strong_tasks
   assert(UseDynamicNumberOfGCThreads ||
          workers()->active_workers() == workers()->total_workers(),
@@ -5672,7 +5703,7 @@
   // as we may have to copy some 'reachable' referent
   // objects (and their reachable sub-graphs) that were
   // not copied during the pause.
-  process_discovered_references(n_workers);
+  process_discovered_references(per_thread_states, n_workers);
 
   if (G1StringDedup::is_enabled()) {
     G1STWIsAliveClosure is_alive(this);
@@ -5680,6 +5711,13 @@
     G1StringDedup::unlink_or_oops_do(&is_alive, &keep_alive);
   }
 
+  for (uint i = 0; i < n_workers; i++) {
+    delete per_thread_states[i];
+  }
+  FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states);
+
+  record_obj_copy_mem_stats();
+
   _allocator->release_gc_alloc_regions(n_workers, evacuation_info);
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
@@ -5715,6 +5753,31 @@
   COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
 }
 
+void G1CollectedHeap::record_obj_copy_mem_stats() {
+  record_obj_copy_mem_stats(InCSetState::Young);
+  record_obj_copy_mem_stats(InCSetState::Old);
+}
+
+void G1CollectedHeap::record_obj_copy_mem_stats(InCSetState which) {
+  G1EvacStats* stats = _allocator->evac_stats(which);
+
+  EventGCG1EvacuationMemoryStatistics e;
+  if (e.should_commit()) {
+    e.set_gcId(GCId::peek().id());
+    e.set_gen(InCSetState::to_gen_number(which));
+    e.set_allocated(stats->allocated() * HeapWordSize);
+    e.set_wasted(stats->wasted() * HeapWordSize);
+    e.set_used(stats->used() * HeapWordSize);
+    e.set_undo_waste(stats->undo_waste() * HeapWordSize);
+    e.set_region_end_waste(stats->region_end_waste() * HeapWordSize);
+    e.set_regions_refilled(stats->regions_refilled());
+    e.set_inline_allocated(stats->inline_allocated() * HeapWordSize);
+    e.set_failure_used(stats->failure_used() * HeapWordSize);
+    e.set_failure_waste(stats->failure_waste() * HeapWordSize);
+    e.commit();
+  }
+}
+
 void G1CollectedHeap::free_region(HeapRegion* hr,
                                   FreeRegionList* free_list,
                                   bool par,
@@ -5781,7 +5844,7 @@
 }
 
 void G1CollectedHeap::decrement_summary_bytes(size_t bytes) {
-  _allocator->decrease_used(bytes);
+  decrease_used(bytes);
 }
 
 class G1ParCleanupCTTask : public AbstractGangTask {
@@ -6115,6 +6178,11 @@
         cur->set_evacuation_failed(false);
         // The region is now considered to be old.
         cur->set_old();
+        // Do some allocation statistics accounting. Regions that failed evacuation
+        // are always made old, so there is no need to update anything in the young
+        // gen statistics.
+        size_t used_words = cur->marked_bytes() / HeapWordSize;
+        _allocator->evac_stats(InCSetState::Old)->add_failure_used_and_waste(used_words, HeapRegion::GrainWords - used_words);
         _old_set.add(cur);
         evacuation_info.increment_collectionset_used_after(cur->used());
       }
@@ -6497,12 +6565,12 @@
   heap_region_iterate(&cl);
 
   if (!free_list_only) {
-    _allocator->set_used(cl.total_used());
+    set_used(cl.total_used());
   }
-  assert(_allocator->used_unlocked() == recalculate_used(),
+  assert(used_unlocked() == recalculate_used(),
          err_msg("inconsistent _allocator->used_unlocked(), "
                  "value: "SIZE_FORMAT" recalculated: "SIZE_FORMAT,
-                 _allocator->used_unlocked(), recalculate_used()));
+                 used_unlocked(), recalculate_used()));
 }
 
 void G1CollectedHeap::set_refine_cte_cl_concurrency(bool concurrent) {
@@ -6542,7 +6610,7 @@
   assert(alloc_region->is_eden(), "all mutator alloc regions should be eden");
 
   g1_policy()->add_region_to_incremental_cset_lhs(alloc_region);
-  _allocator->increase_used(allocated_bytes);
+  increase_used(allocated_bytes);
   _hr_printer.retire(alloc_region);
   // We update the eden sizes here, when the region is retired,
   // instead of when it's allocated, since this is the point that its
   // used space has been recored in it.
   g1mm()->update_eden_size();
 }
 
-void G1CollectedHeap::set_par_threads() {
-  // Don't change the number of workers.  Use the value previously set
-  // in the workgroup.
-  uint n_workers = workers()->active_workers();
-  assert(UseDynamicNumberOfGCThreads ||
-         n_workers == workers()->total_workers(),
-         "Otherwise should be using the total number of workers");
-  if (n_workers == 0) {
-    assert(false, "Should have been set in prior evacuation pause.");
-    n_workers = ParallelGCThreads;
-    workers()->set_active_workers(n_workers);
-  }
-  set_par_threads(n_workers);
-}
-
 // Methods for the GC alloc regions
 
 HeapRegion* G1CollectedHeap::new_gc_alloc_region(size_t word_size,
@@ -6613,6 +6666,21 @@
   _hr_printer.retire(alloc_region);
 }
 
+void G1CollectedHeap::set_par_threads() {
+  // Don't change the number of workers.  Use the value previously set
+  // in the workgroup.
+  uint n_workers = workers()->active_workers();
+  assert(UseDynamicNumberOfGCThreads ||
+         n_workers == workers()->total_workers(),
+         "Otherwise should be using the total number of workers");
+  if (n_workers == 0) {
+    assert(false, "Should have been set in prior evacuation pause.");
+    n_workers = ParallelGCThreads;
+    workers()->set_active_workers(n_workers);
+  }
+  set_par_threads(n_workers);
+}
+
 // Heap region set verification
 
 class VerifyRegionListsClosure : public HeapRegionClosure {
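
[Review note] Taken together, the patch changes G1ParScanThreadState from a stack object created inside each task's work() into an array of per-worker objects owned by the evacuation pause: evacuate_collection_set() allocates one state per worker, every parallel phase (G1ParTask, G1ParPreserveCMReferentsTask, STW reference processing) indexes the shared array by worker_id, and the states are deleted after process_discovered_references(). A minimal sketch of that ownership pattern follows; WorkerState, run_phases, and evacuation_pause are placeholders for illustration, not HotSpot API:

    #include <cstddef>

    // Placeholder for G1ParScanThreadState: per-worker scratch state that must
    // now outlive several parallel phases within one pause.
    struct WorkerState {
      explicit WorkerState(unsigned worker_id) : _worker_id(worker_id) {}
      unsigned _worker_id;
    };

    // Each phase receives the shared array and uses states[worker_id],
    // mirroring how G1ParTask and G1STWRefProcTaskProxy consume _pss.
    void run_phases(WorkerState** states, unsigned n_workers) {
      for (unsigned i = 0; i < n_workers; i++) {
        // ... phase work against states[i] ...
      }
    }

    // The owner allocates up front, shares the array across phases, and frees
    // it exactly once at the end of the pause, as evacuate_collection_set()
    // does after process_discovered_references().
    void evacuation_pause(unsigned n_workers) {
      WorkerState** states = new WorkerState*[n_workers];
      for (unsigned i = 0; i < n_workers; i++) {
        states[i] = new WorkerState(i);
      }

      run_phases(states, n_workers);  // root scanning and evacuation
      run_phases(states, n_workers);  // reference processing reuses the states

      for (unsigned i = 0; i < n_workers; i++) {
        delete states[i];
      }
      delete[] states;
    }

Keeping the states alive across phases is what lets the new print_termination_stats() and record_obj_copy_mem_stats() read per-worker allocation and waste counters after the parallel work has finished; note that the enqueue path passes NULL for the array, since no copying happens during reference enqueueing.
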