# HG changeset patch
# User rkennke
# Date 1495572270 -7200
#      Tue May 23 22:44:30 2017 +0200
# Node ID 61b7c1b90b36a96d9b830f555f9e7402f9a3092d
# Parent  eca62a5498ce2c230f7c9f05d79bd1bc3cb256d7
[mq]: parallel_sp_cleaning.patch

diff --git a/src/share/vm/gc/shared/collectedHeap.cpp b/src/share/vm/gc/shared/collectedHeap.cpp
--- a/src/share/vm/gc/shared/collectedHeap.cpp
+++ b/src/share/vm/gc/shared/collectedHeap.cpp
@@ -622,23 +622,6 @@
   // Default implementation does nothing.
 }
 
-void CollectedHeap::deflate_idle_monitors_all_threads() {
-  ObjectSynchronizer::deflate_idle_monitors_all_threads();
-}
-
-class DeflateIdleMonitorsThreadClosure : public ThreadClosure {
-public:
-  void do_thread(Thread* thread) {
-    ObjectSynchronizer::deflate_idle_monitors_and_oops_do(thread, NULL);
-  }
-};
-
-void CollectedHeap::parallel_deflate_idle_monitors(WorkGang* workers) {
-  StrongRootsScope(workers->active_workers());
-  DeflateIdleMonitorsThreadClosure cl;
-  Threads::parallel_java_threads_do(&cl);
-}
-
 #ifndef CC_INTERP
 void CollectedHeap::compile_prepare_oop(MacroAssembler* masm, Register obj) {
   // Default implementation does nothing.
diff --git a/src/share/vm/gc/shared/collectedHeap.hpp b/src/share/vm/gc/shared/collectedHeap.hpp
--- a/src/share/vm/gc/shared/collectedHeap.hpp
+++ b/src/share/vm/gc/shared/collectedHeap.hpp
@@ -601,31 +601,6 @@
   // Accumulate additional statistics from GCLABs.
   virtual void accumulate_statistics_all_gclabs();
 
-  // Return true if GC supports per-thread monitor deflation.
-  // In this case, idle monitors will not get deflated when entering
-  // a safepoint, but instead will get deflated when the GC
-  // calls into Thread::oops_do() or Thread::possibly_parallel_oops_do().
-  // This allows for better parallelization and cache behaviour.
-  //
-  // NOTICE that monitor deflation requires the mark words to be intact,
-  // which means that this can only be supported by GCs that don't stow
-  // away the mark word in order to temporarily store a forwarding pointer
-  // to it.
-  virtual bool supports_per_thread_monitor_deflation() const {
-    return false;
-  }
-
-  // This is called by ObjectSynchronizer::deflate_idle_monitors() when
-  // the above supports_per_thread_monitor_deflation() returns false,
-  // or on special non-GC cleanup safepoints (even if the above returns true).
-  // It gives the GC a chance to deflate idle monitors using its GC worker
-  // threads, and thus support parallelization of monitor deflation.
-  // The default implementation simply deflates idle monitors single-threaded,
-  // using the calling (VM) thread.
-  virtual void deflate_idle_monitors_all_threads();
-
-  void parallel_deflate_idle_monitors(WorkGang* workers);
-
   // Non product verification and debugging.
 #ifndef PRODUCT
   // Support for PromotionFailureALot.  Return true if it's time to cause a
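(Context: under the scheme removed above, a GC opted in by overriding the hook pair on CollectedHeap; ShenandoahHeap did exactly this, see the next hunks. A minimal sketch of the removed contract; MyHeap is hypothetical and elides the other overrides a real heap needs, and workers() is assumed to be the heap's WorkGang accessor:)

    // Hypothetical CollectedHeap subclass showing the removed opt-in contract.
    class MyHeap : public CollectedHeap {
    public:
      // Skip monitor deflation at safepoint entry; it happens during the
      // GC's own thread scan instead. Requires intact mark words.
      virtual bool supports_per_thread_monitor_deflation() const { return true; }
      // Hook for cleanup safepoints: deflate on the GC worker gang.
      virtual void deflate_idle_monitors_all_threads() {
        parallel_deflate_idle_monitors(workers()); // workers(): assumed accessor
      }
      // ... remaining CollectedHeap overrides elided ...
    };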
diff --git a/src/share/vm/gc/shenandoah/shenandoahHeap.cpp b/src/share/vm/gc/shenandoah/shenandoahHeap.cpp
--- a/src/share/vm/gc/shenandoah/shenandoahHeap.cpp
+++ b/src/share/vm/gc/shenandoah/shenandoahHeap.cpp
@@ -2564,7 +2564,3 @@
     }
   }
 }
-
-void ShenandoahHeap::deflate_idle_monitors_all_threads() {
-  parallel_deflate_idle_monitors(workers());
-}
diff --git a/src/share/vm/gc/shenandoah/shenandoahHeap.hpp b/src/share/vm/gc/shenandoah/shenandoahHeap.hpp
--- a/src/share/vm/gc/shenandoah/shenandoahHeap.hpp
+++ b/src/share/vm/gc/shenandoah/shenandoahHeap.hpp
@@ -512,12 +512,6 @@
   void start_deferred_recycling();
   void defer_recycle(ShenandoahHeapRegion* r);
   void finish_deferred_recycle();
-
-  bool supports_per_thread_monitor_deflation() const {
-    return true;
-  }
-
-  void deflate_idle_monitors_all_threads();
 };
 
 #endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHHEAP_HPP
diff --git a/src/share/vm/gc/shenandoah/shenandoahRootProcessor.cpp b/src/share/vm/gc/shenandoah/shenandoahRootProcessor.cpp
--- a/src/share/vm/gc/shenandoah/shenandoahRootProcessor.cpp
+++ b/src/share/vm/gc/shenandoah/shenandoahRootProcessor.cpp
@@ -31,9 +31,11 @@
 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
 #include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
 #include "gc/shenandoah/shenandoahPhaseTimes.hpp"
+#include "gc/shenandoah/vm_operations_shenandoah.hpp"
 #include "memory/allocation.inline.hpp"
 #include "runtime/fprofiler.hpp"
 #include "runtime/mutex.hpp"
+#include "runtime/sweeper.hpp"
 #include "services/management.hpp"
 
 ShenandoahRootProcessor::ShenandoahRootProcessor(ShenandoahHeap* heap, uint n_workers,
@@ -42,14 +44,21 @@
   _srs(n_workers),
   _phase(phase),
   _codecache_iterator(CodeCache::parallel_iterator()),
-  _om_iterator(ObjectSynchronizer::parallel_iterator())
+  _om_iterator(ObjectSynchronizer::parallel_iterator()),
+  _threads_nmethods_cl(NULL)
 {
   heap->shenandoahPolicy()->record_workers_start(_phase);
+  VM_ShenandoahOperation* op = (VM_ShenandoahOperation*) VMThread::vm_operation();
+  if (! op->_safepoint_cleanup_done) {
+    _threads_nmethods_cl = NMethodSweeper::prepare_mark_active_nmethods();
+  }
 }
 
 ShenandoahRootProcessor::~ShenandoahRootProcessor() {
   delete _process_strong_tasks;
   ShenandoahHeap::heap()->shenandoahPolicy()->record_workers_end(_phase);
+  VM_ShenandoahOperation* op = (VM_ShenandoahOperation*) VMThread::vm_operation();
+  op->_safepoint_cleanup_done = true;
 }
 
 void ShenandoahRootProcessor::process_strong_roots(OopClosure* oops,
@@ -58,7 +67,7 @@
                                                    CodeBlobClosure* blobs,
                                                    uint worker_id) {
 
-  process_java_roots(oops, clds, NULL, blobs, worker_id);
+  process_java_roots(oops, clds, NULL, NULL, _threads_nmethods_cl, worker_id);
   process_vm_roots(oops, NULL, weak_oops, worker_id);
 
   _process_strong_tasks->all_tasks_completed(n_workers());
@@ -71,7 +80,7 @@
                                                 uint worker_id) {
   ShenandoahPhaseTimes* phase_times = ShenandoahHeap::heap()->shenandoahPolicy()->phase_times();
 
-  process_java_roots(oops, clds, clds, NULL, worker_id);
+  process_java_roots(oops, clds, clds, blobs, _threads_nmethods_cl, worker_id);
   process_vm_roots(oops, oops, weak_oops, worker_id);
 
   if (blobs != NULL) {
@@ -86,6 +95,7 @@
                                                 CLDClosure* strong_clds,
                                                 CLDClosure* weak_clds,
                                                 CodeBlobClosure* strong_code,
+                                                CodeBlobClosure* nmethods_cl,
                                                 uint worker_id)
 {
   ShenandoahPhaseTimes* phase_times = ShenandoahHeap::heap()->shenandoahPolicy()->phase_times();
@@ -101,7 +111,7 @@
     ShenandoahParPhaseTimesTracker timer(phase_times, ShenandoahPhaseTimes::ThreadRoots, worker_id);
     bool is_par = n_workers() > 1;
     ResourceMark rm;
-    Threads::possibly_parallel_oops_do(is_par, strong_roots, strong_code);
+    Threads::possibly_parallel_oops_do(is_par, strong_roots, strong_code, nmethods_cl);
   }
 }
 
@@ -170,14 +180,21 @@
   _process_strong_tasks(new SubTasksDone(SHENANDOAH_RP_PS_NumElements)),
   _srs(n_workers),
   _phase(phase),
-  _codecache_iterator(CodeCache::parallel_iterator())
+  _codecache_iterator(CodeCache::parallel_iterator()),
+  _threads_nmethods_cl(NULL)
 {
   heap->shenandoahPolicy()->record_workers_start(_phase);
+  VM_ShenandoahOperation* op = (VM_ShenandoahOperation*) VMThread::vm_operation();
+  if (! op->_safepoint_cleanup_done) {
+    _threads_nmethods_cl = NMethodSweeper::prepare_mark_active_nmethods();
+  }
 }
 
 ShenandoahRootEvacuator::~ShenandoahRootEvacuator() {
   delete _process_strong_tasks;
   ShenandoahHeap::heap()->shenandoahPolicy()->record_workers_end(_phase);
+  VM_ShenandoahOperation* op = (VM_ShenandoahOperation*) VMThread::vm_operation();
+  op->_safepoint_cleanup_done = true;
 }
 
 void ShenandoahRootEvacuator::process_evacuate_roots(OopClosure* oops,
@@ -189,7 +206,8 @@
     bool is_par = n_workers() > 1;
     ResourceMark rm;
     ShenandoahParPhaseTimesTracker timer(phase_times, ShenandoahPhaseTimes::ThreadRoots, worker_id);
-    Threads::possibly_parallel_oops_do(is_par, oops, NULL);
+
+    Threads::possibly_parallel_oops_do(is_par, oops, NULL, _threads_nmethods_cl);
   }
 
   {
diff --git a/src/share/vm/gc/shenandoah/shenandoahRootProcessor.hpp b/src/share/vm/gc/shenandoah/shenandoahRootProcessor.hpp
--- a/src/share/vm/gc/shenandoah/shenandoahRootProcessor.hpp
+++ b/src/share/vm/gc/shenandoah/shenandoahRootProcessor.hpp
@@ -72,11 +72,13 @@
   ParallelCLDRootIterator _cld_iterator;
   ParallelCodeCacheIterator _codecache_iterator;
   ParallelObjectSynchronizerIterator _om_iterator;
+  CodeBlobClosure* _threads_nmethods_cl;
 
   void process_java_roots(OopClosure* scan_non_heap_roots,
                           CLDClosure* scan_strong_clds,
                           CLDClosure* scan_weak_clds,
                           CodeBlobClosure* scan_strong_code,
+                          CodeBlobClosure* nmethods_cl,
                           uint worker_i);
 
   void process_vm_roots(OopClosure* scan_non_heap_roots,
@@ -110,6 +112,7 @@
   StrongRootsScope _srs;
   ShenandoahCollectorPolicy::TimingPhase _phase;
   ParallelCodeCacheIterator _codecache_iterator;
+  CodeBlobClosure* _threads_nmethods_cl;
 
 public:
   ShenandoahRootEvacuator(ShenandoahHeap* heap, uint n_workers,
diff --git a/src/share/vm/gc/shenandoah/shenandoah_globals.hpp b/src/share/vm/gc/shenandoah/shenandoah_globals.hpp
--- a/src/share/vm/gc/shenandoah/shenandoah_globals.hpp
+++ b/src/share/vm/gc/shenandoah/shenandoah_globals.hpp
@@ -217,6 +217,9 @@
   experimental(bool, ShenandoahFastSyncRoots, true,                         \
           "Enable fast synchronizer roots scanning")                        \
                                                                             \
+  experimental(bool, ShenandoahMergeSafepointCleanup, false,                \
+          "Do safepoint cleanup piggy-backed on thread scans")              \
+                                                                            \
   diagnostic(bool, ShenandoahSATBBarrier, true,                             \
           "Turn on/off SATB barriers in Shenandoah")                        \
                                                                             \
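(Note: ShenandoahMergeSafepointCleanup is declared experimental, so it has to be unlocked explicitly before it can be set. A sketch of an invocation that exercises this path; the application name is a placeholder:)

    java -XX:+UnlockExperimentalVMOptions -XX:+UseShenandoahGC \
         -XX:+ShenandoahMergeSafepointCleanup MyApp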
diff --git a/src/share/vm/gc/shenandoah/vm_operations_shenandoah.hpp b/src/share/vm/gc/shenandoah/vm_operations_shenandoah.hpp
--- a/src/share/vm/gc/shenandoah/vm_operations_shenandoah.hpp
+++ b/src/share/vm/gc/shenandoah/vm_operations_shenandoah.hpp
@@ -42,7 +42,10 @@
 protected:
   uint _gc_id;
 public:
-  VM_ShenandoahOperation() : _gc_id(GCId::current()) {};
+  bool _safepoint_cleanup_done;
+  VM_ShenandoahOperation() : _gc_id(GCId::current()), _safepoint_cleanup_done(false) {};
+  virtual bool deflates_idle_monitors() { return ShenandoahMergeSafepointCleanup && ! _safepoint_cleanup_done; }
+  virtual bool marks_nmethods() { return ShenandoahMergeSafepointCleanup && ! _safepoint_cleanup_done; }
 };
 
 class VM_ShenandoahReferenceOperation : public VM_ShenandoahOperation {
@@ -92,6 +95,9 @@
   VM_Operation::VMOp_Type type() const { return VMOp_ShenandoahInitUpdateRefs; }
   const char* name() const { return "Shenandoah Init Update References"; }
   virtual void doit();
+  // Init update refs does not scan threads, so it cannot take over deflation and nmethod marking.
+  bool deflates_idle_monitors() { return false; }
+  bool marks_nmethods() { return false; }
 };
 
 class VM_ShenandoahFinalUpdateRefs: public VM_ShenandoahOperation {
@@ -108,6 +114,9 @@
   VM_Operation::VMOp_Type type() const { return VMOp_ShenandoahVerifyHeapAfterEvacuation; }
   const char* name() const { return "Shenandoah verify heap after evacuation"; }
   virtual void doit();
+  // A debug-only verification pause; no need to take over cleanup here.
+  bool deflates_idle_monitors() { return false; }
+  bool marks_nmethods() { return false; }
 };
 
 #endif //SHARE_VM_GC_SHENANDOAH_VM_OPERATIONS_SHENANDOAH_HPP
diff --git a/src/share/vm/runtime/globals.hpp b/src/share/vm/runtime/globals.hpp
--- a/src/share/vm/runtime/globals.hpp
+++ b/src/share/vm/runtime/globals.hpp
@@ -1185,9 +1185,11 @@
                                                                             \
   product(bool, MonitorInUseLists, true, "Track Monitors for Deflation")    \
                                                                             \
-  experimental(bool, DeflateIdleMonitorsPerThread, false,                   \
-          "Deflate idle monitors in Java thread before entering "          \
-          "safepoint.")                                                     \
+  experimental(bool, ParallelSafepointCleanup, false,                       \
+          "Enable parallel safepoint cleanup")                              \
+                                                                            \
+  product(uint, ParallelSafepointCleanupThreads, 8,                         \
+          "Number of parallel threads used for safepoint cleanup")          \
                                                                             \
   experimental(intx, SyncFlags, 0, "(Unsafe, Unstable) "                    \
           "Experimental Sync flags")                                        \
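(Note: unlike the Shenandoah-specific flag above, these two are GC-independent. ParallelSafepointCleanup is experimental and gates the new parallel path; ParallelSafepointCleanupThreads is a product flag that sizes the cleanup WorkGang, defaulting to 8. A sketch, application name again a placeholder:)

    java -XX:+UnlockExperimentalVMOptions -XX:+ParallelSafepointCleanup \
         -XX:ParallelSafepointCleanupThreads=4 MyApp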
diff --git a/src/share/vm/runtime/safepoint.cpp b/src/share/vm/runtime/safepoint.cpp
--- a/src/share/vm/runtime/safepoint.cpp
+++ b/src/share/vm/runtime/safepoint.cpp
@@ -72,6 +72,9 @@
 // --------------------------------------------------------------------------------------------------
 // Implementation of Safepoint begin/end
 
+WorkGang* SafepointSynchronize::_cleanup_workers = NULL;
+SubTasksDone* SafepointSynchronize::_cleanup_subtasks = NULL;
+
 SafepointSynchronize::SynchronizeState volatile SafepointSynchronize::_state = SafepointSynchronize::_not_synchronized;
 volatile int  SafepointSynchronize::_waiting_to_block = 0;
 volatile int SafepointSynchronize::_safepoint_counter = 0;
@@ -543,11 +546,24 @@
 
 // Various cleaning tasks that should be done periodically at safepoints
 void SafepointSynchronize::do_cleanup_tasks() {
+  VM_Operation* op = VMThread::vm_operation();
+  // If the current VM operation takes over both monitor deflation and
+  // nmethod marking itself, we don't bother firing up the worker gang.
+  bool op_does_cleanup = op != NULL && op->marks_nmethods() && op->deflates_idle_monitors();
+  if (ParallelSafepointCleanup && ! op_does_cleanup) {
+    parallel_cleanup();
+  } else {
+    serial_cleanup();
+  }
+}
+
+void SafepointSynchronize::serial_cleanup() {
+  VM_Operation* op = VMThread::vm_operation();
   {
     const char* name = "deflating idle monitors";
     EventSafepointCleanupTask event;
     TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
-    ObjectSynchronizer::deflate_idle_monitors();
+    ObjectSynchronizer::deflate_idle_monitors(op == NULL || ! op->deflates_idle_monitors());
     event_safepoint_cleanup_task_commit(event, name);
   }
 
@@ -566,7 +582,7 @@
     event_safepoint_cleanup_task_commit(event, name);
   }
 
-  {
+  if (op == NULL || ! op->marks_nmethods()) {
     const char* name = "mark nmethods";
     EventSafepointCleanupTask event;
     TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
@@ -575,11 +591,14 @@
   }
 
   if (SymbolTable::needs_rehashing()) {
+    double start = os::elapsedTime();
     const char* name = "rehashing symbol table";
     EventSafepointCleanupTask event;
     TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
     SymbolTable::rehash_table();
     event_safepoint_cleanup_task_commit(event, name);
+    double end = os::elapsedTime();
+    tty->print_cr("vmthread took: %f ms for symbol table rehash", (end - start) * 1000.0);
   }
 
   if (StringTable::needs_rehashing()) {
@@ -601,6 +620,117 @@
   }
 }
 
+class ParallelSPCleanupThreadClosure : public ThreadClosure {
+private:
+  bool _do_deflate_idle_monitors;
+  CodeBlobClosure* _nmethod_cl;
+
+public:
+  ParallelSPCleanupThreadClosure() {
+    VM_Operation* op = VMThread::vm_operation();
+    _do_deflate_idle_monitors = op == NULL || ! op->deflates_idle_monitors();
+    if (op == NULL || ! op->marks_nmethods()) {
+      _nmethod_cl = NMethodSweeper::prepare_mark_active_nmethods();
+    } else {
+      _nmethod_cl = NULL;
+    }
+  }
+
+  void do_thread(Thread* thread) {
+    if (_do_deflate_idle_monitors) {
+      ObjectSynchronizer::deflate_idle_monitors_and_oops_do(thread, NULL);
+    }
+    if (_nmethod_cl != NULL && thread->is_Java_thread() &&
+        ! thread->is_Code_cache_sweeper_thread()) {
+      JavaThread* jt = (JavaThread*) thread;
+      jt->nmethods_do(_nmethod_cl);
+    }
+  }
+};
+
+class ParallelSPCleanupTask : public AbstractGangTask {
+private:
+  SubTasksDone* _subtasks;
+  ParallelSPCleanupThreadClosure _cleanup_threads_cl;
+public:
+  ParallelSPCleanupTask(SubTasksDone* subtasks) :
+    AbstractGangTask("Parallel Safepoint Cleanup"),
+    _subtasks(subtasks),
+    _cleanup_threads_cl() {}
+
+  void work(uint worker_id) {
+    // All threads deflate monitors and mark nmethods (if necessary).
+    Threads::parallel_java_threads_do(&_cleanup_threads_cl);
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_DEFLATE_MONITORS)) {
+      const char* name = "deflating idle monitors";
+      EventSafepointCleanupTask event;
+      TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+      ObjectSynchronizer::deflate_idle_monitors(false);
+      event_safepoint_cleanup_task_commit(event, name);
+    }
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_UPDATE_INLINE_CACHES)) {
+      const char* name = "updating inline caches";
+      EventSafepointCleanupTask event;
+      TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+      InlineCacheBuffer::update_inline_caches();
+      event_safepoint_cleanup_task_commit(event, name);
+    }
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_COMPILATION_POLICY)) {
+      const char* name = "compilation policy safepoint handler";
+      EventSafepointCleanupTask event;
+      TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+      CompilationPolicy::policy()->do_safepoint_work();
+      event_safepoint_cleanup_task_commit(event, name);
+    }
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_SYMBOL_TABLE_REHASH)) {
+      if (SymbolTable::needs_rehashing()) {
+        const char* name = "rehashing symbol table";
+        EventSafepointCleanupTask event;
+        TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+        SymbolTable::rehash_table();
+        event_safepoint_cleanup_task_commit(event, name);
+      }
+    }
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_STRING_TABLE_REHASH)) {
+      if (StringTable::needs_rehashing()) {
+        const char* name = "rehashing string table";
+        EventSafepointCleanupTask event;
+        TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+        StringTable::rehash_table();
+        event_safepoint_cleanup_task_commit(event, name);
+      }
+    }
+
+    if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_CLD_PURGE)) {
+      // CMS delays purging the CLDG until the beginning of the next safepoint and to
+      // make sure concurrent sweep is done
+      const char* name = "purging class loader data graph";
+      EventSafepointCleanupTask event;
+      TraceTime timer(name, TRACETIME_LOG(Info, safepoint, cleanup));
+      ClassLoaderDataGraph::purge_if_needed();
+      event_safepoint_cleanup_task_commit(event, name);
+    }
+    _subtasks->all_tasks_completed(ParallelSafepointCleanupThreads);
+  }
+};
+
+void SafepointSynchronize::parallel_cleanup() {
+  // Deferred init: the worker gang and subtask state are created once and reused.
+  if (_cleanup_workers == NULL) {
+    _cleanup_workers = new WorkGang("Parallel Safepoint Cleanup", ParallelSafepointCleanupThreads, false, false);
+    _cleanup_workers->initialize_workers();
+    _cleanup_subtasks = new SubTasksDone(SAFEPOINT_CLEANUP_NUM_TASKS);
+  }
+
+  StrongRootsScope srs(_cleanup_workers->active_workers());
+  ParallelSPCleanupTask cleanup_task(_cleanup_subtasks);
+  _cleanup_workers->run_task(&cleanup_task);
+}
 
 bool SafepointSynchronize::safepoint_safe(JavaThread *thread, JavaThreadState state) {
   switch(state) {
diff --git a/src/share/vm/runtime/safepoint.hpp b/src/share/vm/runtime/safepoint.hpp
--- a/src/share/vm/runtime/safepoint.hpp
+++ b/src/share/vm/runtime/safepoint.hpp
@@ -51,6 +51,8 @@
 class ThreadSafepointState;
 class SnippetCache;
 class nmethod;
+class WorkGang;
+class SubTasksDone;
 
 //
 // Implements roll-forward to safepoint (safepoint synchronization)
@@ -90,6 +92,17 @@
     jlong  _time_to_exec_vmop;                 // total time in millis spent in vm operation itself
   } SafepointStats;
 
+  enum SafepointCleanupTasks {
+    SAFEPOINT_CLEANUP_DEFLATE_MONITORS,
+    SAFEPOINT_CLEANUP_UPDATE_INLINE_CACHES,
+    SAFEPOINT_CLEANUP_COMPILATION_POLICY,
+    SAFEPOINT_CLEANUP_SYMBOL_TABLE_REHASH,
+    SAFEPOINT_CLEANUP_STRING_TABLE_REHASH,
+    SAFEPOINT_CLEANUP_CLD_PURGE,
+    // Leave this one last.
+    SAFEPOINT_CLEANUP_NUM_TASKS
+  };
+
 private:
   static volatile SynchronizeState _state;     // Threads might read this flag directly, without acquiring the Threads_lock
   static volatile int _waiting_to_block;       // number of threads we are waiting for to block
@@ -129,6 +142,10 @@
   // For debug long safepoint
   static void print_safepoint_timeout(SafepointTimeoutReason timeout_reason);
 
+  // Parallel cleanup
+  static WorkGang* _cleanup_workers;
+  static SubTasksDone* _cleanup_subtasks;
+
 public:
 
   // Main entry points
@@ -173,6 +190,11 @@
   static bool is_cleanup_needed();
   static void do_cleanup_tasks();
 
+private:
+  static void serial_cleanup();
+  static void parallel_cleanup();
+
+public:
   // Debugging
   static void print_state()   PRODUCT_RETURN;
   static void safepoint_msg(const char* format, ...) ATTRIBUTE_PRINTF(1, 2) PRODUCT_RETURN;
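(Note on the claiming protocol used by ParallelSPCleanupTask above: SubTasksDone hands each enum-indexed subtask to exactly one worker, and every worker must still call all_tasks_completed() so the claim state resets for the next safepoint. A condensed, self-contained sketch of the pattern; MyCleanupTask is hypothetical:)

    // Hypothetical gang task showing the SubTasksDone claim protocol.
    class MyCleanupTask : public AbstractGangTask {
      SubTasksDone* _subtasks; // sized with SAFEPOINT_CLEANUP_NUM_TASKS
    public:
      MyCleanupTask(SubTasksDone* subtasks) :
        AbstractGangTask("My Cleanup"), _subtasks(subtasks) {}
      void work(uint worker_id) {
        if (! _subtasks->is_task_claimed(SafepointSynchronize::SAFEPOINT_CLEANUP_CLD_PURGE)) {
          // Exactly one worker gets here; the others skip ahead.
        }
        // All workers check in; the last one resets the claims.
        _subtasks->all_tasks_completed(ParallelSafepointCleanupThreads);
      }
    };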
diff --git a/src/share/vm/runtime/sweeper.cpp b/src/share/vm/runtime/sweeper.cpp
--- a/src/share/vm/runtime/sweeper.cpp
+++ b/src/share/vm/runtime/sweeper.cpp
@@ -199,11 +199,20 @@
  * safepoint.
  */
 void NMethodSweeper::mark_active_nmethods() {
+  CodeBlobClosure* cl = prepare_mark_active_nmethods();
+  if (cl != NULL) {
+    Threads::nmethods_do(cl);
+    // TODO: Is this really needed?
+    OrderAccess::storestore();
+  }
+}
+
+CodeBlobClosure* NMethodSweeper::prepare_mark_active_nmethods() {
   assert(SafepointSynchronize::is_at_safepoint(), "must be executed at a safepoint");
   // If we do not want to reclaim not-entrant or zombie methods there is no need
   // to scan stacks
   if (!MethodFlushing) {
-    return;
+    return NULL;
   }
 
   // Increase time so that we can estimate when to invoke the sweeper again.
@@ -231,14 +240,13 @@
     if (PrintMethodFlushing) {
       tty->print_cr("### Sweep: stack traversal %ld", _traversals);
    }
-    Threads::nmethods_do(&mark_activation_closure);
+    return &mark_activation_closure;
 
   } else {
     // Only set hotness counter
-    Threads::nmethods_do(&set_hotness_closure);
+    return &set_hotness_closure;
   }
-
-  OrderAccess::storestore();
 }
 
 /**
diff --git a/src/share/vm/runtime/sweeper.hpp b/src/share/vm/runtime/sweeper.hpp
--- a/src/share/vm/runtime/sweeper.hpp
+++ b/src/share/vm/runtime/sweeper.hpp
@@ -30,6 +30,8 @@
 #include "code/codeCache.hpp"
 #include "utilities/ticks.hpp"
 
+class CodeBlobClosure;
+
 // An NmethodSweeper is an incremental cleaner for:
 //    - cleanup inline caches
 //    - reclamation of nmethods
@@ -114,6 +116,8 @@
 #endif
 
   static void mark_active_nmethods();      // Invoked at the end of each safepoint
+  static CodeBlobClosure* prepare_mark_active_nmethods();
+
   static void sweeper_loop();
   static void notify(int code_blob_type); // Possibly start the sweeper thread.
   static void force_sweep();
diff --git a/src/share/vm/runtime/synchronizer.cpp b/src/share/vm/runtime/synchronizer.cpp
--- a/src/share/vm/runtime/synchronizer.cpp
+++ b/src/share/vm/runtime/synchronizer.cpp
@@ -1686,7 +1686,7 @@
   return deflated_count;
 }
 
-void ObjectSynchronizer::deflate_idle_monitors() {
+void ObjectSynchronizer::deflate_idle_monitors(bool deflate_thread_local_monitors) {
   assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint");
   int nInuse = 0;              // currently associated with objects
   int nInCirculation = 0;      // extant
@@ -1697,19 +1697,25 @@
   ObjectMonitor * freeTailp = NULL;
   TEVENT(deflate_idle_monitors);
 
-  if (MonitorInUseLists) {
-    if (! Universe::heap()->supports_per_thread_monitor_deflation() ||
-        ForceMonitorScavenge == 1) {
-      Universe::heap()->deflate_idle_monitors_all_threads();
-    }
-  }
-
   // Prevent omFlush from changing mids in Thread dtor's during deflation
   // And in case the vm thread is acquiring a lock during a safepoint
   // See e.g. 6320749
   Thread::muxAcquire(&gListLock, "scavenge - return");
 
   if (MonitorInUseLists) {
+    if (deflate_thread_local_monitors) {
+      for (JavaThread* cur = Threads::first(); cur != NULL; cur = cur->next()) {
+        nInCirculation += cur->omInUseCount;
+        int deflated_count = deflate_monitor_list(cur->omInUseList_addr(), &freeHeadp, &freeTailp);
+        cur->omInUseCount -= deflated_count;
+        if (ObjectMonitor::Knob_VerifyInUse) {
+          verifyInUse(cur);
+        }
+        nScavenged += deflated_count;
+        nInuse += cur->omInUseCount;
+      }
+    }
+
     // For moribund threads, scan gOmInUseList
     if (gOmInUseList) {
       nInCirculation += gOmInUseCount;
@@ -1786,8 +1792,6 @@
 void ObjectSynchronizer::deflate_idle_monitors_and_oops_do(Thread* thread, OopClosure* cl) {
   assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint");
   if (! MonitorInUseLists) return;
-  if (ForceMonitorScavenge == 1) return;
-  assert(Universe::heap()->supports_per_thread_monitor_deflation(), "only call this when supported by GC");
 
   ObjectMonitor * freeHeadp = NULL;  // Local SLL of scavenged monitors
   ObjectMonitor * freeTailp = NULL;
diff --git a/src/share/vm/runtime/synchronizer.hpp b/src/share/vm/runtime/synchronizer.hpp
--- a/src/share/vm/runtime/synchronizer.hpp
+++ b/src/share/vm/runtime/synchronizer.hpp
@@ -144,7 +144,9 @@
   // GC: we current use aggressive monitor deflation policy
   // Basically we deflate all monitors that are not busy.
   // An adaptive profile-based deflation policy could be used if needed
-  static void deflate_idle_monitors();
+  // When deflate_thread_local_monitors is true, also deflate thread-local
+  // monitors; otherwise, deflate only the global in-use list.
+  static void deflate_idle_monitors(bool deflate_thread_local_monitors);
   static void deflate_idle_monitors_and_oops_do(Thread* thread, OopClosure* cl);
   static void deflate_idle_monitors_all_threads();
diff --git a/src/share/vm/runtime/thread.cpp b/src/share/vm/runtime/thread.cpp
--- a/src/share/vm/runtime/thread.cpp
+++ b/src/share/vm/runtime/thread.cpp
@@ -791,7 +791,8 @@
   f->do_oop((oop*)&_pending_exception);
   handle_area()->oops_do(f);
   if (MonitorInUseLists) {
-    if (Universe::heap()->supports_per_thread_monitor_deflation()) {
+    VM_Operation* op = VMThread::vm_operation();
+    if (op != NULL && op->deflates_idle_monitors()) {
       ObjectSynchronizer::deflate_idle_monitors_and_oops_do(this, f);
     } else {
       ObjectSynchronizer::thread_local_used_oops_do(this, f);
@@ -4391,11 +4392,14 @@
 }
 #endif // ASSERT
 
-void Threads::possibly_parallel_oops_do(bool is_par, OopClosure* f, CodeBlobClosure* cf) {
+void Threads::possibly_parallel_oops_do(bool is_par, OopClosure* f, CodeBlobClosure* cf, CodeBlobClosure* nmethods_cl) {
   int cp = Threads::thread_claim_parity();
   ALL_JAVA_THREADS(p) {
     if (p->claim_oops_do(is_par, cp)) {
       p->oops_do(f, cf);
+      if (nmethods_cl != NULL && ! p->is_Code_cache_sweeper_thread()) {
+        p->nmethods_do(nmethods_cl);
+      }
     }
   }
   VMThread* vmt = VMThread::vm_thread();
diff --git a/src/share/vm/runtime/thread.hpp b/src/share/vm/runtime/thread.hpp
--- a/src/share/vm/runtime/thread.hpp
+++ b/src/share/vm/runtime/thread.hpp
@@ -2132,7 +2132,7 @@
   // This version may only be called by sequential code.
   static void oops_do(OopClosure* f, CodeBlobClosure* cf);
   // This version may be called by sequential or parallel code.
-  static void possibly_parallel_oops_do(bool is_par, OopClosure* f, CodeBlobClosure* cf);
+  static void possibly_parallel_oops_do(bool is_par, OopClosure* f, CodeBlobClosure* cf, CodeBlobClosure* nmethods_cl = NULL);
   // This creates a list of GCTasks, one per thread.
   static void create_thread_roots_tasks(GCTaskQueue* q);
   // This creates a list of GCTasks, one per thread, for marking objects.
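(Note: with the extended possibly_parallel_oops_do(), a GC's parallel root scan can fold nmethod marking into the threads walk it performs anyway, which is how ShenandoahRootProcessor uses it above. A condensed sketch; oops_cl and code_cl are placeholder closures:)

    // Once, at the safepoint, before the workers start:
    CodeBlobClosure* nm_cl = NMethodSweeper::prepare_mark_active_nmethods(); // NULL if !MethodFlushing
    // In each worker: every claimed thread gets its oops scanned and, when
    // nm_cl is set, its nmethods marked in the same pass.
    Threads::possibly_parallel_oops_do(true /* is_par */, oops_cl, code_cl, nm_cl);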
diff --git a/src/share/vm/runtime/vm_operations.hpp b/src/share/vm/runtime/vm_operations.hpp
--- a/src/share/vm/runtime/vm_operations.hpp
+++ b/src/share/vm/runtime/vm_operations.hpp
@@ -198,6 +198,26 @@
 
   static const char* mode_to_string(Mode mode);
 
+  // Safepoint cleanup support.
+  // Return true if this VM_Operation takes care of idle monitor deflation.
+  // Idle monitor deflation is usually done by the safepoint cleanup phase
+  // in SafepointSynchronize::do_cleanup_tasks(). However, a VM_Operation
+  // may want to take care of this itself, for example when a GC operation
+  // scans the thread stacks anyway and can piggy-back monitor deflation on
+  // that scan. Note that this is only possible if the object mark words are
+  // preserved during the VM operation (most current GCs *don't* preserve
+  // them, but displace the mark word and temporarily use it as a forwarding
+  // pointer).
+  virtual bool deflates_idle_monitors() { return false; }
+
+  // Return true if this VM_Operation takes care of nmethod marking.
+  // NMethod marking is usually done by the safepoint cleanup phase
+  // in SafepointSynchronize::do_cleanup_tasks(). However, a VM_Operation
+  // may want to take care of this itself, for example when a GC operation
+  // scans the thread stacks anyway and can just as well piggy-back nmethod
+  // marking on that scan.
+  virtual bool marks_nmethods() { return false; }
+
   // Debugging
   virtual void print_on_error(outputStream* st) const;
   const char* name() const { return _names[type()]; }
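(To round off: what an operation that takes over both cleanup duties looks like under the new contract. The subclass is hypothetical; the two virtuals are the ones added above, and they are consulted by SafepointSynchronize::do_cleanup_tasks() and by Thread::oops_do():)

    // Hypothetical VM operation that scans all thread stacks and therefore
    // piggy-backs monitor deflation and nmethod marking on that scan.
    class VM_MyThreadScan : public VM_Operation {
     public:
      VMOp_Type type() const { return VMOp_Dummy; } // placeholder type for the sketch
      virtual bool deflates_idle_monitors() { return true; }
      virtual bool marks_nmethods()         { return true; }
      void doit() {
        // Thread::oops_do() sees deflates_idle_monitors() == true and calls
        // ObjectSynchronizer::deflate_idle_monitors_and_oops_do() per thread;
        // serial_cleanup() sees marks_nmethods() == true and skips "mark nmethods".
      }
    };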