--- old/src/hotspot/share/runtime/synchronizer.cpp 2019-10-17 17:29:14.000000000 -0400 +++ new/src/hotspot/share/runtime/synchronizer.cpp 2019-10-17 17:29:13.000000000 -0400 @@ -37,6 +37,7 @@ #include "runtime/atomic.hpp" #include "runtime/biasedLocking.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/handshake.hpp" #include "runtime/interfaceSupport.inline.hpp" #include "runtime/mutexLocker.hpp" #include "runtime/objectMonitor.hpp" @@ -118,21 +119,269 @@ // global list of blocks of monitors PaddedObjectMonitor* volatile ObjectSynchronizer::g_block_list = NULL; +bool volatile ObjectSynchronizer::_is_async_deflation_requested = false; +bool volatile ObjectSynchronizer::_is_special_deflation_requested = false; +jlong ObjectSynchronizer::_last_async_deflation_time_ns = 0; + // Global ObjectMonitor free list. Newly allocated and deflated // ObjectMonitors are prepended here. -ObjectMonitor* volatile ObjectSynchronizer::g_free_list = NULL; +static ObjectMonitor* volatile g_free_list = NULL; // Global ObjectMonitor in-use list. When a JavaThread is exiting, // ObjectMonitors on its per-thread in-use list are prepended here. -ObjectMonitor* volatile ObjectSynchronizer::g_om_in_use_list = NULL; -int ObjectSynchronizer::g_om_in_use_count = 0; // # on g_om_in_use_list - -static volatile intptr_t gListLock = 0; // protects global monitor lists -static volatile int g_om_free_count = 0; // # on g_free_list -static volatile int g_om_population = 0; // # Extant -- in circulation +static ObjectMonitor* volatile g_om_in_use_list = NULL; +// Global ObjectMonitor wait list. If HandshakeAfterDeflateIdleMonitors +// is true, deflated ObjectMonitors wait on this list until after a +// handshake or a safepoint for platforms that don't support handshakes. +// After the handshake or safepoint, the deflated ObjectMonitors are +// prepended to g_free_list. +static ObjectMonitor* volatile g_wait_list = NULL; + +static volatile int g_om_free_count = 0; // # on g_free_list +static volatile int g_om_in_use_count = 0; // # on g_om_in_use_list +static volatile int g_om_population = 0; // # Extant -- in circulation +static volatile int g_om_wait_count = 0; // # on g_wait_list #define CHAINMARKER (cast_to_oop(-1)) +// =====================> List Management functions + +// Return true if the ObjectMonitor's next field is marked. +// Otherwise returns false. +static bool is_next_marked(ObjectMonitor* om) { + return ((intptr_t)OrderAccess::load_acquire(&om->_next_om) & 0x1) != 0; +} + +// Mark an ObjectMonitor* and return it. Note: the om parameter +// may or may not have been marked originally. +static ObjectMonitor* mark_om_ptr(ObjectMonitor* om) { + return (ObjectMonitor*)((intptr_t)om | 0x1); +} + +// Mark the next field in an ObjectMonitor. If marking was successful, +// then the unmarked next field is returned via parameter and true is +// returned. Otherwise false is returned. +static bool mark_next(ObjectMonitor* om, ObjectMonitor** next_p) { + // Get current next field without any marking value. + ObjectMonitor* next = (ObjectMonitor*) + ((intptr_t)OrderAccess::load_acquire(&om->_next_om) & ~0x1); + if (Atomic::cmpxchg(mark_om_ptr(next), &om->_next_om, next) != next) { + return false; // Could not mark the next field or it was already marked. + } + *next_p = next; + return true; +} + +// Loop until we mark the next field in an ObjectMonitor. The unmarked +// next field is returned. 
+static ObjectMonitor* mark_next_loop(ObjectMonitor* om) { + ObjectMonitor* next; + while (true) { + if (mark_next(om, &next)) { + // Marked om's next field so return the unmarked value. + return next; + } + } +} + +// Set the next field in an ObjectMonitor to the specified value. +// The caller of set_next() must be the same thread that marked the +// ObjectMonitor. +static void set_next(ObjectMonitor* om, ObjectMonitor* value) { + OrderAccess::release_store(&om->_next_om, value); +} + +// Mark the next field in the list head ObjectMonitor. If marking was +// successful, then the mid and the unmarked next field are returned +// via parameter and true is returned. Otherwise false is returned. +static bool mark_list_head(ObjectMonitor* volatile * list_p, + ObjectMonitor** mid_p, ObjectMonitor** next_p) { + while (true) { + ObjectMonitor* mid = OrderAccess::load_acquire(list_p); + if (mid == NULL) { + return false; // The list is empty so nothing to mark. + } + if (mark_next(mid, next_p)) { + if (OrderAccess::load_acquire(list_p) != mid) { + // The list head changed so we have to retry. + set_next(mid, *next_p); // unmark mid + continue; + } + // We marked next field to guard against races. + *mid_p = mid; + return true; + } + } +} + +// Return the unmarked next field in an ObjectMonitor. Note: the next +// field may or may not have been marked originally. +static ObjectMonitor* unmarked_next(ObjectMonitor* om) { + return (ObjectMonitor*)((intptr_t)OrderAccess::load_acquire(&om->_next_om) & ~0x1); +} + +// Prepend a list of ObjectMonitors to the specified *list_p. 'tail' is +// the last ObjectMonitor in the list and there are 'count' on the list. +// Also updates the specified *count_p. +static void prepend_list_to_common(ObjectMonitor* list, ObjectMonitor* tail, + int count, ObjectMonitor* volatile* list_p, + volatile int* count_p) { + while (true) { + ObjectMonitor* cur = OrderAccess::load_acquire(list_p); + // Prepend list to *list_p. + ObjectMonitor* next = NULL; + if (!mark_next(tail, &next)) { + continue; // failed to mark next field so try it all again + } + set_next(tail, cur); // tail now points to cur (and unmarks tail) + if (cur == NULL) { + // No potential race with takers or other prependers since + // *list_p is empty. + if (Atomic::cmpxchg(list, list_p, cur) == cur) { + // Successfully switched *list_p to the list value. + Atomic::add(count, count_p); + break; + } + // Implied else: try it all again + } else { + // Try to mark next field to guard against races: + if (!mark_next(cur, &next)) { + continue; // failed to mark next field so try it all again + } + // We marked the next field so try to switch *list_p to the list value. + if (Atomic::cmpxchg(list, list_p, cur) != cur) { + // The list head has changed so unmark the next field and try again: + set_next(cur, next); + continue; + } + Atomic::add(count, count_p); + set_next(cur, next); // unmark next field + break; + } + } +} + +// Prepend a newly allocated block of ObjectMonitors to g_block_list and +// g_free_list. Also updates g_om_population and g_om_free_count. +void ObjectSynchronizer::prepend_block_to_lists(PaddedObjectMonitor* new_blk) { + // First we handle g_block_list: + while (true) { + PaddedObjectMonitor* cur = g_block_list; + // Prepend new_blk to g_block_list. The first ObjectMonitor in + // a block is reserved for use as linkage to the next block. + new_blk[0]._next_om = cur; + if (Atomic::cmpxchg(new_blk, &g_block_list, cur) == cur) { + // Successfully switched g_block_list to the new_blk value. 
+ Atomic::add(_BLOCKSIZE - 1, &g_om_population); + break; + } + // Implied else: try it all again + } + + // Second we handle g_free_list: + prepend_list_to_common(new_blk + 1, &new_blk[_BLOCKSIZE - 1], _BLOCKSIZE - 1, + &g_free_list, &g_om_free_count); +} + +// Prepend a list of ObjectMonitors to g_free_list. 'tail' is the last +// ObjectMonitor in the list and there are 'count' on the list. Also +// updates g_om_free_count. +static void prepend_list_to_g_free_list(ObjectMonitor* list, + ObjectMonitor* tail, int count) { + prepend_list_to_common(list, tail, count, &g_free_list, &g_om_free_count); +} + +// Prepend a list of ObjectMonitors to g_wait_list. 'tail' is the last +// ObjectMonitor in the list and there are 'count' on the list. Also +// updates g_om_wait_count. +static void prepend_list_to_g_wait_list(ObjectMonitor* list, + ObjectMonitor* tail, int count) { + assert(HandshakeAfterDeflateIdleMonitors, "sanity check"); + prepend_list_to_common(list, tail, count, &g_wait_list, &g_om_wait_count); +} + +// Prepend a list of ObjectMonitors to g_om_in_use_list. 'tail' is the last +// ObjectMonitor in the list and there are 'count' on the list. Also +// updates g_om_in_use_count. +static void prepend_list_to_g_om_in_use_list(ObjectMonitor* list, + ObjectMonitor* tail, int count) { + prepend_list_to_common(list, tail, count, &g_om_in_use_list, &g_om_in_use_count); +} + +// Prepend an ObjectMonitor to the specified list. Also updates +// the specified counter. +static void prepend_to_common(ObjectMonitor* m, ObjectMonitor* volatile * list_p, + int volatile * count_p) { + while (true) { + (void)mark_next_loop(m); // mark m so we can safely update its next field + ObjectMonitor* cur = NULL; + ObjectMonitor* next = NULL; + // Mark the list head to guard against A-B-A race: + if (mark_list_head(list_p, &cur, &next)) { + // List head is now marked so we can safely switch it. + set_next(m, cur); // m now points to cur (and unmarks m) + OrderAccess::release_store(list_p, m); // Switch list head to unmarked m. + set_next(cur, next); // Unmark the previous list head. + break; + } + // The list is empty so try to set the list head. + assert(cur == NULL, "cur must be NULL: cur=" INTPTR_FORMAT, p2i(cur)); + set_next(m, cur); // m now points to NULL (and unmarks m) + if (Atomic::cmpxchg(m, list_p, cur) == cur) { + // List head is now unmarked m. + break; + } + // Implied else: try it all again + } + Atomic::inc(count_p); +} + +// Prepend an ObjectMonitor to a per-thread om_free_list. +// Also updates the per-thread om_free_count. +static void prepend_to_om_free_list(Thread* self, ObjectMonitor* m) { + prepend_to_common(m, &self->om_free_list, &self->om_free_count); +} + +// Prepend an ObjectMonitor to a per-thread om_in_use_list. +// Also updates the per-thread om_in_use_count. +static void prepend_to_om_in_use_list(Thread* self, ObjectMonitor* m) { + prepend_to_common(m, &self->om_in_use_list, &self->om_in_use_count); +} + +// Take an ObjectMonitor from the start of the specified list. Also +// decrements the specified counter. Returns NULL if none are available. +static ObjectMonitor* take_from_start_of_common(ObjectMonitor* volatile * list_p, + int volatile * count_p) { + ObjectMonitor* next = NULL; + ObjectMonitor* take = NULL; + // Mark the list head to guard against A-B-A race: + if (!mark_list_head(list_p, &take, &next)) { + return NULL; // None are available.
+ } + // Switch marked list head to next (which unmarks the list head, but + // leaves take marked): + OrderAccess::release_store(list_p, next); + Atomic::dec(count_p); + // Unmark take, but leave the next value for any lagging list + // walkers. It will get cleaned up when take is prepended to + // the in-use list: + set_next(take, next); + return take; +} + +// Take an ObjectMonitor from the start of the global free-list. Also +// updates g_om_free_count. Returns NULL if none are available. +static ObjectMonitor* take_from_start_of_g_free_list() { + return take_from_start_of_common(&g_free_list, &g_om_free_count); +} + +// Take an ObjectMonitor from the start of a per-thread free-list. +// Also updates om_free_count. Returns NULL if none are available. +static ObjectMonitor* take_from_start_of_om_free_list(Thread* self) { + return take_from_start_of_common(&self->om_free_list, &self->om_free_count); +} + + // =====================> Quick functions // The quick_* forms are special fast-path variants used to improve @@ -211,39 +460,59 @@ assert(((JavaThread *) self)->thread_state() == _thread_in_Java, "invariant"); NoSafepointVerifier nsv; if (obj == NULL) return false; // Need to throw NPE - const markWord mark = obj->mark(); - if (mark.has_monitor()) { - ObjectMonitor* const m = mark.monitor(); - assert(m->object() == obj, "invariant"); - Thread* const owner = (Thread *) m->_owner; - - // Lock contention and Transactional Lock Elision (TLE) diagnostics - // and observability - // Case: light contention possibly amenable to TLE - // Case: TLE inimical operations such as nested/recursive synchronization + while (true) { + const markWord mark = obj->mark(); - if (owner == self) { - m->_recursions++; - return true; - } + if (mark.has_monitor()) { + ObjectMonitorHandle omh; + if (!omh.save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + ObjectMonitor* const m = omh.om_ptr(); + assert(m->object() == obj, "invariant"); + Thread* const owner = (Thread *) m->_owner; + + // Lock contention and Transactional Lock Elision (TLE) diagnostics + // and observability + // Case: light contention possibly amenable to TLE + // Case: TLE inimical operations such as nested/recursive synchronization + + if (owner == self) { + m->_recursions++; + return true; + } - // This Java Monitor is inflated so obj's header will never be - // displaced to this thread's BasicLock. Make the displaced header - // non-NULL so this BasicLock is not seen as recursive nor as - // being locked. We do this unconditionally so that this thread's - // BasicLock cannot be mis-interpreted by any stack walkers. For - // performance reasons, stack walkers generally first check for - // Biased Locking in the object's header, the second check is for - // stack-locking in the object's header, the third check is for - // recursive stack-locking in the displaced header in the BasicLock, - // and last are the inflated Java Monitor (ObjectMonitor) checks. - lock->set_displaced_header(markWord::unused_mark()); + // This Java Monitor is inflated so obj's header will never be + // displaced to this thread's BasicLock. Make the displaced header + // non-NULL so this BasicLock is not seen as recursive nor as + // being locked. We do this unconditionally so that this thread's + // BasicLock cannot be mis-interpreted by any stack walkers. 
For + // performance reasons, stack walkers generally first check for + // Biased Locking in the object's header, the second check is for + // stack-locking in the object's header, the third check is for + // recursive stack-locking in the displaced header in the BasicLock, + // and last are the inflated Java Monitor (ObjectMonitor) checks. + lock->set_displaced_header(markWord::unused_mark()); + + if (owner == NULL && m->try_set_owner_from(self, NULL) == NULL) { + assert(m->_recursions == 0, "invariant"); + return true; + } - if (owner == NULL && Atomic::replace_if_null(self, &(m->_owner))) { - assert(m->_recursions == 0, "invariant"); - return true; + if (AsyncDeflateIdleMonitors && + m->try_set_owner_from(self, DEFLATER_MARKER) == DEFLATER_MARKER) { + // The deflation protocol finished the first part (setting owner), + // but it failed the second part (making ref_count negative) and + // bailed. Or the ObjectMonitor was async deflated and reused. + // Acquired the monitor. + assert(m->_recursions == 0, "invariant"); + return true; + } } + break; } // Note that we could inflate in quick_enter. @@ -295,7 +564,9 @@ // must be non-zero to avoid looking like a re-entrant lock, // and must not look locked either. lock->set_displaced_header(markWord::unused_mark()); - inflate(THREAD, obj(), inflate_cause_monitor_enter)->enter(THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_monitor_enter); + omh.om_ptr()->enter(THREAD); } void ObjectSynchronizer::exit(oop object, BasicLock* lock, TRAPS) { @@ -344,7 +615,9 @@ } // We have to take the slow-path of possible inflation and then exit. - inflate(THREAD, object, inflate_cause_vm_internal)->exit(true, THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, object, inflate_cause_vm_internal); + omh.om_ptr()->exit(true, THREAD); } // ----------------------------------------------------------------------------- @@ -365,9 +638,10 @@ assert(!obj->mark().has_bias_pattern(), "biases should be revoked by now"); } - ObjectMonitor* monitor = inflate(THREAD, obj(), inflate_cause_vm_internal); - - return monitor->complete_exit(THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_vm_internal); + intptr_t ret_code = omh.om_ptr()->complete_exit(THREAD); + return ret_code; } // NOTE: must use heavy weight monitor to handle complete_exit/reenter() @@ -377,9 +651,9 @@ assert(!obj->mark().has_bias_pattern(), "biases should be revoked by now"); } - ObjectMonitor* monitor = inflate(THREAD, obj(), inflate_cause_vm_internal); - - monitor->reenter(recursion, THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_vm_internal); + omh.om_ptr()->reenter(recursion, THREAD); } // ----------------------------------------------------------------------------- // JNI locks on java objects @@ -391,7 +665,9 @@ assert(!obj->mark().has_bias_pattern(), "biases should be revoked by now"); } THREAD->set_current_pending_monitor_is_from_java(false); - inflate(THREAD, obj(), inflate_cause_jni_enter)->enter(THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_jni_enter); + omh.om_ptr()->enter(THREAD); THREAD->set_current_pending_monitor_is_from_java(true); } @@ -404,7 +680,9 @@ } assert(!obj->mark().has_bias_pattern(), "biases should be revoked by now"); - ObjectMonitor* monitor = inflate(THREAD, obj, inflate_cause_jni_exit); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj, inflate_cause_jni_exit); + ObjectMonitor* monitor = omh.om_ptr(); // If this thread has locked the 
object, exit the monitor. We // intentionally do not use CHECK here because we must exit the // monitor even if an exception is pending. @@ -445,7 +723,9 @@ if (millis < 0) { THROW_MSG_0(vmSymbols::java_lang_IllegalArgumentException(), "timeout value is negative"); } - ObjectMonitor* monitor = inflate(THREAD, obj(), inflate_cause_wait); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_wait); + ObjectMonitor* monitor = omh.om_ptr(); DTRACE_MONITOR_WAIT_PROBE(monitor, obj(), THREAD, millis); monitor->wait(millis, true, THREAD); @@ -454,7 +734,8 @@ // that's fixed we can uncomment the following line, remove the call // and change this function back into a "void" func. // DTRACE_MONITOR_PROBE(waited, monitor, obj(), THREAD); - return dtrace_waited_probe(monitor, obj, THREAD); + int ret_code = dtrace_waited_probe(monitor, obj, THREAD); + return ret_code; } void ObjectSynchronizer::wait_uninterruptibly(Handle obj, jlong millis, TRAPS) { @@ -465,7 +746,9 @@ if (millis < 0) { THROW_MSG(vmSymbols::java_lang_IllegalArgumentException(), "timeout value is negative"); } - inflate(THREAD, obj(), inflate_cause_wait)->wait(millis, false, THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_wait); + omh.om_ptr()->wait(millis, false, THREAD); } void ObjectSynchronizer::notify(Handle obj, TRAPS) { @@ -478,7 +761,9 @@ if (mark.has_locker() && THREAD->is_lock_owned((address)mark.locker())) { return; } - inflate(THREAD, obj(), inflate_cause_notify)->notify(THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_notify); + omh.om_ptr()->notify(THREAD); } // NOTE: see comment of notify() @@ -492,7 +777,9 @@ if (mark.has_locker() && THREAD->is_lock_owned((address)mark.locker())) { return; } - inflate(THREAD, obj(), inflate_cause_notify)->notifyAll(THREAD); + ObjectMonitorHandle omh; + inflate(&omh, THREAD, obj(), inflate_cause_notify); + omh.om_ptr()->notifyAll(THREAD); } // ----------------------------------------------------------------------------- @@ -517,15 +804,15 @@ // performed by the CPU(s) or platform. struct SharedGlobals { - char _pad_prefix[DEFAULT_CACHE_LINE_SIZE]; + char _pad_prefix[OM_CACHE_LINE_SIZE]; // These are highly shared mostly-read variables. // To avoid false-sharing they need to be the sole occupants of a cache line. 
volatile int stw_random; volatile int stw_cycle; - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int) * 2); + DEFINE_PAD_MINUS_SIZE(1, OM_CACHE_LINE_SIZE, sizeof(volatile int) * 2); // Hot RW variable -- Sequester to avoid false-sharing volatile int hc_sequence; - DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int)); + DEFINE_PAD_MINUS_SIZE(2, OM_CACHE_LINE_SIZE, sizeof(volatile int)); }; static SharedGlobals GVars; @@ -686,79 +973,93 @@ assert(Universe::verify_in_progress() || DumpSharedSpaces || ((JavaThread *)self)->thread_state() != _thread_blocked, "invariant"); - ObjectMonitor* monitor = NULL; - markWord temp, test; - intptr_t hash; - markWord mark = read_stable_mark(obj); + while (true) { + ObjectMonitor* monitor = NULL; + markWord temp, test; + intptr_t hash; + markWord mark = read_stable_mark(obj); - // object should remain ineligible for biased locking - assert(!mark.has_bias_pattern(), "invariant"); + // object should remain ineligible for biased locking + assert(!mark.has_bias_pattern(), "invariant"); - if (mark.is_neutral()) { - hash = mark.hash(); // this is a normal header - if (hash != 0) { // if it has hash, just return it - return hash; - } - hash = get_next_hash(self, obj); // allocate a new hash code - temp = mark.copy_set_hash(hash); // merge the hash code into header - // use (machine word version) atomic operation to install the hash - test = obj->cas_set_mark(temp, mark); - if (test == mark) { - return hash; - } - // If atomic operation failed, we must inflate the header - // into heavy weight monitor. We could add more code here - // for fast path, but it does not worth the complexity. - } else if (mark.has_monitor()) { - monitor = mark.monitor(); - temp = monitor->header(); - assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); - hash = temp.hash(); - if (hash != 0) { - return hash; - } - // Skip to the following code to reduce code size - } else if (self->is_lock_owned((address)mark.locker())) { - temp = mark.displaced_mark_helper(); // this is a lightweight monitor owned - assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); - hash = temp.hash(); // by current thread, check if the displaced - if (hash != 0) { // header contains hash code - return hash; - } - // WARNING: - // The displaced header in the BasicLock on a thread's stack - // is strictly immutable. It CANNOT be changed in ANY cases. - // So we have to inflate the stack lock into an ObjectMonitor - // even if the current thread owns the lock. The BasicLock on - // a thread's stack can be asynchronously read by other threads - // during an inflate() call so any change to that stack memory - // may not propagate to other threads correctly. - } - - // Inflate the monitor to set hash code - monitor = inflate(self, obj, inflate_cause_hash_code); - // Load displaced header and check it has hash code - mark = monitor->header(); - assert(mark.is_neutral(), "invariant: header=" INTPTR_FORMAT, mark.value()); - hash = mark.hash(); - if (hash == 0) { - hash = get_next_hash(self, obj); - temp = mark.copy_set_hash(hash); // merge hash code into header - assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); - uintptr_t v = Atomic::cmpxchg(temp.value(), (volatile uintptr_t*)monitor->header_addr(), mark.value()); - test = markWord(v); - if (test != mark) { - // The only non-deflation update to the ObjectMonitor's - // header/dmw field is to merge in the hash code. 
If someone - // adds a new usage of the header/dmw field, please update - // this code. - hash = test.hash(); - assert(test.is_neutral(), "invariant: header=" INTPTR_FORMAT, test.value()); - assert(hash != 0, "Trivial unexpected object/monitor header usage."); + if (mark.is_neutral()) { + hash = mark.hash(); // this is a normal header + if (hash != 0) { // if it has hash, just return it + return hash; + } + hash = get_next_hash(self, obj); // allocate a new hash code + temp = mark.copy_set_hash(hash); // merge the hash code into header + // use (machine word version) atomic operation to install the hash + test = obj->cas_set_mark(temp, mark); + if (test == mark) { + return hash; + } + // If atomic operation failed, we must inflate the header + // into heavy weight monitor. We could add more code here + // for fast path, but it is not worth the complexity. + } else if (mark.has_monitor()) { + ObjectMonitorHandle omh; + if (!omh.save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + monitor = omh.om_ptr(); + temp = monitor->header(); + assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); + hash = temp.hash(); + if (hash != 0) { + return hash; + } + // Skip to the following code to reduce code size + } else if (self->is_lock_owned((address)mark.locker())) { + temp = mark.displaced_mark_helper(); // this is a lightweight monitor owned + assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); + hash = temp.hash(); // by current thread, check if the displaced + if (hash != 0) { // header contains hash code + return hash; + } + // WARNING: + // The displaced header in the BasicLock on a thread's stack + // is strictly immutable. It CANNOT be changed in ANY cases. + // So we have to inflate the stack lock into an ObjectMonitor + // even if the current thread owns the lock. The BasicLock on + // a thread's stack can be asynchronously read by other threads + // during an inflate() call so any change to that stack memory + // may not propagate to other threads correctly. + } + + // Inflate the monitor to set hash code + ObjectMonitorHandle omh; + inflate(&omh, self, obj, inflate_cause_hash_code); + monitor = omh.om_ptr(); + // Load displaced header and check it has hash code + mark = monitor->header(); + assert(mark.is_neutral(), "invariant: header=" INTPTR_FORMAT, mark.value()); + hash = mark.hash(); + if (hash == 0) { + hash = get_next_hash(self, obj); + temp = mark.copy_set_hash(hash); // merge hash code into header + assert(temp.is_neutral(), "invariant: header=" INTPTR_FORMAT, temp.value()); + uintptr_t v = Atomic::cmpxchg(temp.value(), (volatile uintptr_t*)monitor->header_addr(), mark.value()); + test = markWord(v); + if (test != mark) { + // The only non-deflation update to the ObjectMonitor's + // header/dmw field is to merge in the hash code. If someone + // adds a new usage of the header/dmw field, please update + // this code. + // ObjectMonitor::install_displaced_markword_in_object() + // does mark the header/dmw field as part of async deflation, + // but that protocol cannot happen now due to the + // ObjectMonitorHandle above. + hash = test.hash(); + assert(test.is_neutral(), "invariant: header=" INTPTR_FORMAT, test.value()); + assert(hash != 0, "Trivial unexpected object/monitor header usage."); + } } + // We finally get the hash + return hash; } - // We finally get the hash - return hash; } // Deprecated -- use FastHashCode() instead.
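The save_om_ptr() retry idiom above recurs at every call site in this patch that reads an inflated monitor: re-read the mark word, try to bump the ObjectMonitor's ref_count through an ObjectMonitorHandle, and retry if async deflation won the race. A minimal sketch of the pattern follows; read_monitor_header() is a hypothetical caller standing in for FastHashCode() and the other sites, while oop, markWord and ObjectMonitorHandle are the types the patch itself uses, so the sketch only compiles inside HotSpot.

// Sketch only -- the shape of the save_om_ptr() retry idiom, not patch code.
static uintptr_t read_monitor_header(oop obj) {
  while (true) {
    const markWord mark = obj->mark();
    if (!mark.has_monitor()) {
      return 0;  // Nothing inflated to read from.
    }
    ObjectMonitorHandle omh;
    if (!omh.save_om_ptr(obj, mark)) {
      // save_om_ptr() could not bump ref_count: async deflation got there
      // first and the mark word changed, so re-read the mark and retry.
      assert(AsyncDeflateIdleMonitors, "sanity check");
      continue;
    }
    // ref_count is now positive, so this ObjectMonitor cannot finish
    // async deflation while omh is live.
    return omh.om_ptr()->header().value();
  }
}

The handle is what makes the early returns in the real call sites safe: it is expected to drop the ref_count again when it goes out of scope, so every return path releases the monitor automatically.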
@@ -778,20 +1079,28 @@ assert(thread == JavaThread::current(), "Can only be called on current thread"); oop obj = h_obj(); - markWord mark = read_stable_mark(obj); + while (true) { + markWord mark = read_stable_mark(obj); - // Uncontended case, header points to stack - if (mark.has_locker()) { - return thread->is_lock_owned((address)mark.locker()); - } - // Contended case, header points to ObjectMonitor (tagged pointer) - if (mark.has_monitor()) { - ObjectMonitor* monitor = mark.monitor(); - return monitor->is_entered(thread) != 0; + // Uncontended case, header points to stack + if (mark.has_locker()) { + return thread->is_lock_owned((address)mark.locker()); + } + // Contended case, header points to ObjectMonitor (tagged pointer) + if (mark.has_monitor()) { + ObjectMonitorHandle omh; + if (!omh.save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + bool ret_code = omh.om_ptr()->is_entered(thread) != 0; + return ret_code; + } + // Unlocked case, header in place + assert(mark.is_neutral(), "sanity check"); + return false; } - // Unlocked case, header in place - assert(mark.is_neutral(), "sanity check"); - return false; } // Be aware of this method could revoke bias of the lock object. @@ -817,27 +1126,37 @@ assert(self == JavaThread::current(), "Can only be called on current thread"); oop obj = h_obj(); - markWord mark = read_stable_mark(obj); - // CASE: stack-locked. Mark points to a BasicLock on the owner's stack. - if (mark.has_locker()) { - return self->is_lock_owned((address)mark.locker()) ? - owner_self : owner_other; - } + while (true) { + markWord mark = read_stable_mark(obj); - // CASE: inflated. Mark (tagged pointer) points to an ObjectMonitor. - // The Object:ObjectMonitor relationship is stable as long as we're - // not at a safepoint. - if (mark.has_monitor()) { - void* owner = mark.monitor()->_owner; - if (owner == NULL) return owner_none; - return (owner == self || - self->is_lock_owned((address)owner)) ? owner_self : owner_other; - } + // CASE: stack-locked. Mark points to a BasicLock on the owner's stack. + if (mark.has_locker()) { + return self->is_lock_owned((address)mark.locker()) ? + owner_self : owner_other; + } + + // CASE: inflated. Mark (tagged pointer) points to an ObjectMonitor. + // The Object:ObjectMonitor relationship is stable as long as we're + // not at a safepoint and AsyncDeflateIdleMonitors is false. + if (mark.has_monitor()) { + ObjectMonitorHandle omh; + if (!omh.save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + ObjectMonitor* monitor = omh.om_ptr(); + void* owner = monitor->_owner; + if (owner == NULL) return owner_none; + return (owner == self || + self->is_lock_owned((address)owner)) ? 
owner_self : owner_other; + } - // CASE: neutral - assert(mark.is_neutral(), "sanity check"); - return owner_none; // it's unlocked + // CASE: neutral + assert(mark.is_neutral(), "sanity check"); + return owner_none; // it's unlocked + } } // FIXME: jvmti should call this @@ -852,33 +1171,41 @@ } oop obj = h_obj(); - address owner = NULL; - markWord mark = read_stable_mark(obj); + while (true) { + address owner = NULL; + markWord mark = read_stable_mark(obj); - // Uncontended case, header points to stack - if (mark.has_locker()) { - owner = (address) mark.locker(); - } + // Uncontended case, header points to stack + if (mark.has_locker()) { + owner = (address) mark.locker(); + } - // Contended case, header points to ObjectMonitor (tagged pointer) - else if (mark.has_monitor()) { - ObjectMonitor* monitor = mark.monitor(); - assert(monitor != NULL, "monitor should be non-null"); - owner = (address) monitor->owner(); - } + // Contended case, header points to ObjectMonitor (tagged pointer) + else if (mark.has_monitor()) { + ObjectMonitorHandle omh; + if (!omh.save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + ObjectMonitor* monitor = omh.om_ptr(); + assert(monitor != NULL, "monitor should be non-null"); + owner = (address) monitor->owner(); + } - if (owner != NULL) { - // owning_thread_from_monitor_owner() may also return NULL here - return Threads::owning_thread_from_monitor_owner(t_list, owner); - } + if (owner != NULL) { + // owning_thread_from_monitor_owner() may also return NULL here + return Threads::owning_thread_from_monitor_owner(t_list, owner); + } - // Unlocked case, header in place - // Cannot have assertion since this object may have been - // locked by another thread when reaching here. - // assert(mark.is_neutral(), "sanity check"); + // Unlocked case, header in place + // Cannot have assertion since this object may have been + // locked by another thread when reaching here. + // assert(mark.is_neutral(), "sanity check"); - return NULL; + return NULL; + } } // Visitors ... @@ -889,32 +1216,101 @@ assert(block->object() == CHAINMARKER, "must be a block header"); for (int i = _BLOCKSIZE - 1; i > 0; i--) { ObjectMonitor* mid = (ObjectMonitor *)(block + i); - oop object = (oop)mid->object(); - if (object != NULL) { - // Only process with closure if the object is set. + if (mid->is_active()) { + ObjectMonitorHandle omh(mid); + + if (mid->object() == NULL || + (AsyncDeflateIdleMonitors && mid->ref_count() < 0)) { + // Only process with closure if the object is set. + // For async deflation, race here if monitor is not owned! + // The above ref_count bump (in the ObjectMonitorHandle ctor) + // will cause subsequent async deflation to skip it. + // However, previous or concurrent async deflation is a race + // so skip this ObjectMonitor if it is being async deflated. + continue; + } closure->do_monitor(mid); } } - block = (PaddedObjectMonitor*)block->_next_om; + // unmarked_next() is not needed with g_block_list (no next field marking).
+ block = (PaddedObjectMonitor*)OrderAccess::load_acquire(&block->_next_om); } } static bool monitors_used_above_threshold() { - if (g_om_population == 0) { + if (OrderAccess::load_acquire(&g_om_population) == 0) { return false; } - int monitors_used = g_om_population - g_om_free_count; - int monitor_usage = (monitors_used * 100LL) / g_om_population; - return monitor_usage > MonitorUsedDeflationThreshold; + if (MonitorUsedDeflationThreshold > 0) { + int monitors_used = OrderAccess::load_acquire(&g_om_population) - + OrderAccess::load_acquire(&g_om_free_count); + if (HandshakeAfterDeflateIdleMonitors) { + monitors_used -= OrderAccess::load_acquire(&g_om_wait_count); + } + int monitor_usage = (monitors_used * 100LL) / + OrderAccess::load_acquire(&g_om_population); + return monitor_usage > MonitorUsedDeflationThreshold; + } + return false; } -bool ObjectSynchronizer::is_cleanup_needed() { - if (MonitorUsedDeflationThreshold > 0) { - return monitors_used_above_threshold(); +// Returns true if MonitorBound is set (> 0) and if the specified +// cnt is > MonitorBound. Otherwise returns false. +static bool is_MonitorBound_exceeded(const int cnt) { + const int mx = MonitorBound; + return mx > 0 && cnt > mx; +} + +bool ObjectSynchronizer::is_async_deflation_needed() { + if (!AsyncDeflateIdleMonitors) { + return false; + } + if (is_async_deflation_requested()) { + // Async deflation request. + return true; + } + if (AsyncDeflationInterval > 0 && + time_since_last_async_deflation_ms() > AsyncDeflationInterval && + monitors_used_above_threshold()) { + // It's been longer than our specified deflate interval and there + // are too many monitors in use. We don't deflate more frequently + // than AsyncDeflationInterval (unless is_async_deflation_requested) + // in order to not swamp the ServiceThread. + _last_async_deflation_time_ns = os::javaTimeNanos(); + return true; + } + int monitors_used = OrderAccess::load_acquire(&g_om_population) - + OrderAccess::load_acquire(&g_om_free_count); + if (HandshakeAfterDeflateIdleMonitors) { + monitors_used -= OrderAccess::load_acquire(&g_om_wait_count); + } + if (is_MonitorBound_exceeded(monitors_used)) { + // Not enough ObjectMonitors on the global free list. + return true; + } + return false; +} + +bool ObjectSynchronizer::is_safepoint_deflation_needed() { + if (!AsyncDeflateIdleMonitors) { + if (monitors_used_above_threshold()) { + // Too many monitors in use. + return true; + } + return false; + } + if (is_special_deflation_requested()) { + // For AsyncDeflateIdleMonitors only do a safepoint deflation + // if there is a special deflation request. + return true; } return false; } +jlong ObjectSynchronizer::time_since_last_async_deflation_ms() { + return (os::javaTimeNanos() - _last_async_deflation_time_ns) / (NANOUNITS / MILLIUNITS); +} + void ObjectSynchronizer::oops_do(OopClosure* f) { // We only scan the global used list here (for moribund threads), and // the thread-local monitors in Thread::oops_do(). 
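The percentage test in monitors_used_above_threshold() above is easy to misread, so here is a self-contained sketch of the same arithmetic; the counts are assumed sample values, not the live g_om_* counters, and 90 stands in for MonitorUsedDeflationThreshold.

// Sketch: the deflation-threshold arithmetic with assumed sample counts.
#include <stdio.h>

int main() {
  int population = 1024;  // stand-in for g_om_population (all extant monitors)
  int free_count = 128;   // stand-in for g_om_free_count (global free list)
  int wait_count = 64;    // stand-in for g_om_wait_count (awaiting handshake)
  int threshold  = 90;    // stand-in for MonitorUsedDeflationThreshold (%)

  int used = population - free_count;
  // With HandshakeAfterDeflateIdleMonitors, monitors on g_wait_list are
  // already deflated, so they are subtracted from the in-use figure:
  used -= wait_count;

  long long usage = (used * 100LL) / population;  // (1024-128-64)*100/1024 = 81
  printf("usage=%lld%%, deflation %s\n", usage,
         usage > threshold ? "needed" : "not needed");
  return 0;
}

With these sample counts the usage works out to 81%, below the 90% threshold, so the threshold policy alone would trigger neither the safepoint path nor the async path.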
@@ -923,18 +1319,20 @@ void ObjectSynchronizer::global_used_oops_do(OopClosure* f) { assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); - list_oops_do(g_om_in_use_list, f); + list_oops_do(OrderAccess::load_acquire(&g_om_in_use_list), OrderAccess::load_acquire(&g_om_in_use_count), f); } void ObjectSynchronizer::thread_local_used_oops_do(Thread* thread, OopClosure* f) { assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); - list_oops_do(thread->om_in_use_list, f); + list_oops_do(OrderAccess::load_acquire(&thread->om_in_use_list), OrderAccess::load_acquire(&thread->om_in_use_count), f); } -void ObjectSynchronizer::list_oops_do(ObjectMonitor* list, OopClosure* f) { +void ObjectSynchronizer::list_oops_do(ObjectMonitor* list, int count, OopClosure* f) { assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); - ObjectMonitor* mid; - for (mid = list; mid != NULL; mid = mid->_next_om) { + // The oops_do() phase does not overlap with monitor deflation + // so no need to update the ObjectMonitor's ref_count for this + // ObjectMonitor* use. + for (ObjectMonitor* mid = list; mid != NULL; mid = unmarked_next(mid)) { if (mid->object() != NULL) { f->do_oop((oop*)mid->object_addr()); } @@ -950,9 +1348,6 @@ // STW-time -- disassociates idle monitors from objects. Such // scavenged monitors are returned to the g_free_list. // -// The global list is protected by gListLock. All the critical sections -// are short and operate in constant-time. -// // ObjectMonitors reside in type-stable memory (TSM) and are immortal. // // Lifecycle: @@ -966,6 +1361,7 @@ // // If MonitorBound is not set (<= 0), MonitorBound checks are disabled. // +// When safepoint deflation is being used (!AsyncDeflateIdleMonitors): // The monitor pool is grow-only. We scavenge at STW safepoint-time, but the // the rate of scavenging is driven primarily by GC. As such, we can find // an inordinate number of monitors in circulation. @@ -980,13 +1376,26 @@ // // The current implementation uses asynchronous VM operations. // -// If MonitorBound is set, the boundry applies to +// When safepoint deflation is being used and MonitorBound is set, the +// boundary applies to // (g_om_population - g_om_free_count) // i.e., if there are not enough ObjectMonitors on the global free list, // then a safepoint deflation is induced. Picking a good MonitorBound value // is non-trivial. +// +// When async deflation is being used: +// The monitor pool is still grow-only. Async deflation is requested +// by a safepoint's cleanup phase or by the ServiceThread at periodic +// intervals when is_async_deflation_needed() returns true. In +// addition to other policies that are checked, if there are not +// enough ObjectMonitors on the global free list, then +// is_async_deflation_needed() will return true. The ServiceThread +// calls deflate_global_idle_monitors_using_JT() and also calls +// deflate_per_thread_idle_monitors_using_JT() as needed. static void InduceScavenge(Thread* self, const char * Whence) { + assert(!AsyncDeflateIdleMonitors, "is not used by async deflation"); + // Induce STW safepoint to trim monitors // Ultimately, this results in a call to deflate_idle_monitors() in the near future.
// More precisely, trigger an asynchronous STW safepoint as the number @@ -1002,31 +1411,30 @@ } } -ObjectMonitor* ObjectSynchronizer::om_alloc(Thread* self) { +ObjectMonitor* ObjectSynchronizer::om_alloc(Thread* self, + const InflateCause cause) { // A large MAXPRIVATE value reduces both list lock contention // and list coherency traffic, but also tends to increase the // number of ObjectMonitors in circulation as well as the STW // scavenge costs. As usual, we lean toward time in space-time // tradeoffs. const int MAXPRIVATE = 1024; + stringStream ss; for (;;) { ObjectMonitor* m; // 1: try to allocate from the thread's local om_free_list. // Threads will attempt to allocate first from their local list, then - // from the global list, and only after those attempts fail will the thread - // attempt to instantiate new monitors. Thread-local free lists take - // heat off the gListLock and improve allocation latency, as well as reducing - // coherency traffic on the shared global list. - m = self->om_free_list; + // from the global list, and only after those attempts fail will the + // thread attempt to instantiate new monitors. Thread-local free lists + // improve allocation latency, as well as reducing coherency traffic + // on the shared global list. + m = take_from_start_of_om_free_list(self); if (m != NULL) { - self->om_free_list = m->_next_om; - self->om_free_count--; guarantee(m->object() == NULL, "invariant"); - m->_next_om = self->om_in_use_list; - self->om_in_use_list = m; - self->om_in_use_count++; + m->set_allocation_state(ObjectMonitor::New); + prepend_to_om_in_use_list(self, m); return m; } @@ -1035,25 +1443,45 @@ // If the muxTry() fails then drop immediately into case 3. // If we're using thread-local free lists then try // to reprovision the caller's free list. - if (g_free_list != NULL) { + if (OrderAccess::load_acquire(&g_free_list) != NULL) { // Reprovision the thread's om_free_list. // Use bulk transfers to reduce the allocation rate and heat // on various locks. - Thread::muxAcquire(&gListLock, "om_alloc(1)"); - for (int i = self->om_free_provision; --i >= 0 && g_free_list != NULL;) { - g_om_free_count--; - ObjectMonitor* take = g_free_list; - g_free_list = take->_next_om; + for (int i = self->om_free_provision; --i >= 0;) { + ObjectMonitor* take = take_from_start_of_g_free_list(); + if (take == NULL) { + break; // No more are available. + } guarantee(take->object() == NULL, "invariant"); + if (AsyncDeflateIdleMonitors) { + // We allowed 3 field values to linger during async deflation. + // We clear header and restore ref_count here, but we leave + // owner == DEFLATER_MARKER so the simple C2 ObjectMonitor + // enter optimization can no longer race with async deflation + // and reuse. + take->set_header(markWord::zero()); + if (take->ref_count() < 0) { + // Add back max_jint to restore the ref_count field to its + // proper value. + Atomic::add(max_jint, &take->_ref_count); + + assert(take->ref_count() >= 0, "must not be negative: ref_count=%d", + take->ref_count()); + } + } take->Recycle(); + // Since we're taking from the global free-list, take must be Free. + // om_release() also sets the allocation state to Free because it + // is called from other code paths. 
+ assert(take->is_free(), "invariant"); om_release(self, take, false); } - Thread::muxRelease(&gListLock); self->om_free_provision += 1 + (self->om_free_provision/2); if (self->om_free_provision > MAXPRIVATE) self->om_free_provision = MAXPRIVATE; - const int mx = MonitorBound; - if (mx > 0 && (g_om_population-g_om_free_count) > mx) { + if (!AsyncDeflateIdleMonitors && + is_MonitorBound_exceeded(OrderAccess::load_acquire(&g_om_population) - + OrderAccess::load_acquire(&g_om_free_count))) { // Not enough ObjectMonitors on the global free list. // We can't safely induce a STW safepoint from om_alloc() as our thread // state may not be appropriate for such activities and callers may hold @@ -1074,9 +1502,9 @@ assert(_BLOCKSIZE > 1, "invariant"); size_t neededsize = sizeof(PaddedObjectMonitor) * _BLOCKSIZE; PaddedObjectMonitor* temp; - size_t aligned_size = neededsize + (DEFAULT_CACHE_LINE_SIZE - 1); + size_t aligned_size = neededsize + (OM_CACHE_LINE_SIZE - 1); void* real_malloc_addr = NEW_C_HEAP_ARRAY(char, aligned_size, mtInternal); - temp = (PaddedObjectMonitor*)align_up(real_malloc_addr, DEFAULT_CACHE_LINE_SIZE); + temp = (PaddedObjectMonitor*)align_up(real_malloc_addr, OM_CACHE_LINE_SIZE); (void)memset((void *) temp, 0, neededsize); // Format the block. @@ -1088,11 +1516,12 @@ // look like: class Block { Block * next; int N; ObjectMonitor Body [N] ; } for (int i = 1; i < _BLOCKSIZE; i++) { - temp[i]._next_om = (ObjectMonitor *)&temp[i+1]; + OrderAccess::release_store(&temp[i]._next_om, (ObjectMonitor*)&temp[i+1]); + assert(temp[i].is_free(), "invariant"); } // terminate the last monitor as the end of list - temp[_BLOCKSIZE - 1]._next_om = NULL; + OrderAccess::release_store(&temp[_BLOCKSIZE - 1]._next_om, (ObjectMonitor*)NULL); // Element [0] is reserved for global list linkage temp[0].set_object(CHAINMARKER); @@ -1101,24 +1530,7 @@ // block in hand. This avoids some lock traffic and redundant // list activity. - // Acquire the gListLock to manipulate g_block_list and g_free_list. - // An Oyama-Taura-Yonezawa scheme might be more efficient. - Thread::muxAcquire(&gListLock, "om_alloc(2)"); - g_om_population += _BLOCKSIZE-1; - g_om_free_count += _BLOCKSIZE-1; - - // Add the new block to the list of extant blocks (g_block_list). - // The very first ObjectMonitor in a block is reserved and dedicated. - // It serves as blocklist "next" linkage. - temp[0]._next_om = g_block_list; - // There are lock-free uses of g_block_list so make sure that - // the previous stores happen before we update g_block_list. - OrderAccess::release_store(&g_block_list, temp); - - // Add the new string of ObjectMonitors to the global free list - temp[_BLOCKSIZE - 1]._next_om = g_free_list; - g_free_list = temp + 1; - Thread::muxRelease(&gListLock); + prepend_block_to_lists(temp); } } @@ -1131,8 +1543,8 @@ // // Key constraint: all ObjectMonitors on a thread's free list and the global // free list must have their object field set to null. This prevents the -// scavenger -- deflate_monitor_list() -- from reclaiming them while we -// are trying to release them. +// scavenger -- deflate_monitor_list() or deflate_monitor_list_using_JT() +// -- from reclaiming them while we are trying to release them. 
void ObjectSynchronizer::om_release(Thread* self, ObjectMonitor* m, bool from_per_thread_alloc) { guarantee(m->object() == NULL, "invariant"); stringStream ss; guarantee((m->is_busy() | m->_recursions) == 0, "freeing in-use monitor: " - "%s, recursions=" INTPTR_FORMAT, m->is_busy_to_string(&ss), + "%s, recursions=" INTX_FORMAT, m->is_busy_to_string(&ss), m->_recursions); + m->set_allocation_state(ObjectMonitor::Free); // _next_om is used for both per-thread in-use and free lists so // we have to remove 'm' from the in-use list first (as needed). if (from_per_thread_alloc) { // Need to remove 'm' from om_in_use_list. + // We use the more complicated mark-cur_mid_in_use-and-mid-as-we-go + // protocol because async deflation can do list deletions in parallel. ObjectMonitor* cur_mid_in_use = NULL; + ObjectMonitor* mid = NULL; + ObjectMonitor* next = NULL; bool extracted = false; - for (ObjectMonitor* mid = self->om_in_use_list; mid != NULL; cur_mid_in_use = mid, mid = mid->_next_om) { + + if (!mark_list_head(&self->om_in_use_list, &mid, &next)) { + fatal("thread=" INTPTR_FORMAT " in-use list must not be empty.", p2i(self)); + } + while (true) { if (m == mid) { - // extract from per-thread in-use list - if (mid == self->om_in_use_list) { - self->om_in_use_list = mid->_next_om; - } else if (cur_mid_in_use != NULL) { - cur_mid_in_use->_next_om = mid->_next_om; // maintain the current thread in-use list + // We found 'm' on the per-thread in-use list so try to extract it. + if (cur_mid_in_use == NULL) { + // mid is the list head and it is marked. Switch the list head + // to next which unmarks the list head, but leaves mid marked: + OrderAccess::release_store(&self->om_in_use_list, next); + } else { + // mid and cur_mid_in_use are marked. Switch cur_mid_in_use's + // next field to next which unmarks cur_mid_in_use, but leaves + // mid marked: + OrderAccess::release_store(&cur_mid_in_use->_next_om, next); } extracted = true; - self->om_in_use_count--; + Atomic::dec(&self->om_in_use_count); + // Unmark mid, but leave the next value for any lagging list + // walkers. It will get cleaned up when mid is prepended to + // the thread's free list: + set_next(mid, next); break; } + if (cur_mid_in_use != NULL) { + set_next(cur_mid_in_use, mid); // unmark cur_mid_in_use + } + // The next cur_mid_in_use keeps mid's marked next field so + // that it is stable for a possible next field change. It + // cannot be deflated while it is marked. + cur_mid_in_use = mid; + mid = next; + if (mid == NULL) { + // Reached end of the list and didn't find m so: + fatal("must find m=" INTPTR_FORMAT " on om_in_use_list=" INTPTR_FORMAT, + p2i(m), p2i(self->om_in_use_list)); + } + // Mark mid's next field so we can possibly extract it: + next = mark_next_loop(mid); } - assert(extracted, "Should have extracted from in-use list"); } - m->_next_om = self->om_free_list; - self->om_free_list = m; - self->om_free_count++; + prepend_to_om_free_list(self, m); + guarantee(m->is_free(), "invariant"); } // Return ObjectMonitors on a moribund thread's free and in-use @@ -1181,62 +1624,108 @@ // scanned by a GC safepoint, either via Thread::oops_do() (before // om_flush() is called) or via ObjectSynchronizer::oops_do() (after // om_flush() is called). +// +// With AsyncDeflateIdleMonitors, deflate_global_idle_monitors_using_JT() +// and deflate_per_thread_idle_monitors_using_JT() (in another thread) can +// run at the same time as om_flush() so we have to follow a careful +// protocol to prevent list corruption.
void ObjectSynchronizer::om_flush(Thread* self) { - ObjectMonitor* free_list = self->om_free_list; - ObjectMonitor* free_tail = NULL; + // This function can race with an async deflater thread. Since + // deflation has to process the per-thread in-use list before + // prepending the deflated ObjectMonitors to the global free list, + // we process the per-thread lists in the same order to prevent + // ordering races. + int in_use_count = 0; + ObjectMonitor* in_use_list = NULL; + ObjectMonitor* in_use_tail = NULL; + ObjectMonitor* next = NULL; + + // An async deflation thread checks to see if the target thread + // is exiting, but if it has made it past that check before we + // started exiting, then it is racing to get to the in-use list. + if (mark_list_head(&self->om_in_use_list, &in_use_list, &next)) { + // At this point, we have marked the in-use list head so an + // async deflation thread cannot come in after us. If an async + // deflation thread is ahead of us, then we'll detect that and + // wait for it to finish its work. + // + // The thread is going away, however the ObjectMonitors on the + // om_in_use_list may still be in-use by other threads. Link + // them to in_use_tail, which will be linked into the global + // in-use list g_om_in_use_list below. + // + // Account for the in-use list head before the loop since it is + // already marked (by this thread): + in_use_tail = in_use_list; + in_use_count++; + for (ObjectMonitor* cur_om = unmarked_next(in_use_list); cur_om != NULL;) { + if (is_next_marked(cur_om)) { + // This next field is marked so there must be an async deflater + // thread ahead of us so we'll give it a chance to finish. + while (is_next_marked(cur_om)) { + os::naked_short_sleep(1); + } + // Refetch the possibly changed next field and try again. + cur_om = unmarked_next(in_use_tail); + continue; + } + if (!cur_om->is_active()) { + // cur_om was deflated and the allocation state was changed + // to Free while it was marked. We happened to see it just + // after it was unmarked (and added to the free list). + // Refetch the possibly changed next field and try again. + cur_om = unmarked_next(in_use_tail); + continue; + } + in_use_tail = cur_om; + in_use_count++; + cur_om = unmarked_next(cur_om); + } + guarantee(in_use_tail != NULL, "invariant"); + int l_om_in_use_count = OrderAccess::load_acquire(&self->om_in_use_count); + ADIM_guarantee(l_om_in_use_count == in_use_count, "in-use counts don't " + "match: l_om_in_use_count=%d, in_use_count=%d", + l_om_in_use_count, in_use_count); + // Clear the in-use count before unmarking the in-use list head + // to avoid races: + OrderAccess::release_store(&self->om_in_use_count, 0); + // Clear the in-use list head (which also unmarks it): + OrderAccess::release_store(&self->om_in_use_list, (ObjectMonitor*)NULL); + // Unmark the disconnected list head: + set_next(in_use_list, next); + } + int free_count = 0; + ObjectMonitor* free_list = OrderAccess::load_acquire(&self->om_free_list); + ObjectMonitor* free_tail = NULL; if (free_list != NULL) { - ObjectMonitor* s; // The thread is going away. Set 'free_tail' to the last per-thread free - // monitor which will be linked to g_free_list below under the gListLock. + // monitor which will be linked to g_free_list below. 
stringStream ss; - for (s = free_list; s != NULL; s = s->_next_om) { + for (ObjectMonitor* s = free_list; s != NULL; s = unmarked_next(s)) { free_count++; free_tail = s; guarantee(s->object() == NULL, "invariant"); guarantee(!s->is_busy(), "must be !is_busy: %s", s->is_busy_to_string(&ss)); } guarantee(free_tail != NULL, "invariant"); - assert(self->om_free_count == free_count, "free-count off"); - self->om_free_list = NULL; - self->om_free_count = 0; - } - - ObjectMonitor* in_use_list = self->om_in_use_list; - ObjectMonitor* in_use_tail = NULL; - int in_use_count = 0; - if (in_use_list != NULL) { - // The thread is going away, however the ObjectMonitors on the - // om_in_use_list may still be in-use by other threads. Link - // them to in_use_tail, which will be linked into the global - // in-use list g_om_in_use_list below, under the gListLock. - ObjectMonitor *cur_om; - for (cur_om = in_use_list; cur_om != NULL; cur_om = cur_om->_next_om) { - in_use_tail = cur_om; - in_use_count++; - } - guarantee(in_use_tail != NULL, "invariant"); - assert(self->om_in_use_count == in_use_count, "in-use count off"); - self->om_in_use_list = NULL; - self->om_in_use_count = 0; + int l_om_free_count = OrderAccess::load_acquire(&self->om_free_count); + ADIM_guarantee(l_om_free_count == free_count, "free counts don't match: " + "l_om_free_count=%d, free_count=%d", l_om_free_count, + free_count); + OrderAccess::release_store(&self->om_free_list, (ObjectMonitor*)NULL); + OrderAccess::release_store(&self->om_free_count, 0); } - Thread::muxAcquire(&gListLock, "om_flush"); if (free_tail != NULL) { - free_tail->_next_om = g_free_list; - g_free_list = free_list; - g_om_free_count += free_count; + prepend_list_to_g_free_list(free_list, free_tail, free_count); } if (in_use_tail != NULL) { - in_use_tail->_next_om = g_om_in_use_list; - g_om_in_use_list = in_use_list; - g_om_in_use_count += in_use_count; + prepend_list_to_g_om_in_use_list(in_use_list, in_use_tail, in_use_count); } - Thread::muxRelease(&gListLock); - LogStreamHandle(Debug, monitorinflation) lsh_debug; LogStreamHandle(Info, monitorinflation) lsh_info; LogStream* ls = NULL; @@ -1265,19 +1754,28 @@ } // Fast path code shared by multiple functions -void ObjectSynchronizer::inflate_helper(oop obj) { - markWord mark = obj->mark(); - if (mark.has_monitor()) { - assert(ObjectSynchronizer::verify_objmon_isinpool(mark.monitor()), "monitor is invalid"); - assert(mark.monitor()->header().is_neutral(), "monitor must record a good object header"); +void ObjectSynchronizer::inflate_helper(ObjectMonitorHandle* omh_p, oop obj) { + while (true) { + markWord mark = obj->mark(); + if (mark.has_monitor()) { + if (!omh_p->save_om_ptr(obj, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + ObjectMonitor* monitor = omh_p->om_ptr(); + assert(ObjectSynchronizer::verify_objmon_isinpool(monitor), "monitor is invalid"); + markWord dmw = monitor->header(); + assert(dmw.is_neutral(), "sanity check: header=" INTPTR_FORMAT, dmw.value()); + return; + } + inflate(omh_p, Thread::current(), obj, inflate_cause_vm_internal); return; } - inflate(Thread::current(), obj, inflate_cause_vm_internal); } -ObjectMonitor* ObjectSynchronizer::inflate(Thread* self, - oop object, - const InflateCause cause) { +void ObjectSynchronizer::inflate(ObjectMonitorHandle* omh_p, Thread* self, + oop object, const InflateCause cause) { // Inflate mutates the heap ... // Relaxing assertion for bug 6320749. 
assert(Universe::verify_in_progress() || @@ -1298,12 +1796,17 @@ // CASE: inflated if (mark.has_monitor()) { - ObjectMonitor* inf = mark.monitor(); + if (!omh_p->save_om_ptr(object, mark)) { + // Lost a race with async deflation so try again. + assert(AsyncDeflateIdleMonitors, "sanity check"); + continue; + } + ObjectMonitor* inf = omh_p->om_ptr(); markWord dmw = inf->header(); assert(dmw.is_neutral(), "invariant: header=" INTPTR_FORMAT, dmw.value()); assert(inf->object() == object, "invariant"); assert(ObjectSynchronizer::verify_objmon_isinpool(inf), "monitor is invalid"); - return inf; + return; } // CASE: inflation in progress - inflating over a stack-lock. @@ -1339,7 +1842,7 @@ LogStreamHandle(Trace, monitorinflation) lsh; if (mark.has_locker()) { - ObjectMonitor* m = om_alloc(self); + ObjectMonitor* m = om_alloc(self, cause); // Optimistically prepare the objectmonitor - anticipate successful CAS // We do this before the CAS in order to minimize the length of time // in which INFLATING appears in the mark. @@ -1349,6 +1852,7 @@ markWord cmp = object->cas_set_mark(markWord::INFLATING(), mark); if (cmp != mark) { + // om_release() will reset the allocation state from New to Free. om_release(self, m, true); continue; // Interference -- just retry } @@ -1386,7 +1890,7 @@ markWord dmw = mark.displaced_mark_helper(); // Catch if the object's header is not neutral (not locked and // not marked is what we care about here). - assert(dmw.is_neutral(), "invariant: header=" INTPTR_FORMAT, dmw.value()); + ADIM_guarantee(dmw.is_neutral(), "invariant: header=" INTPTR_FORMAT, dmw.value()); // Setup monitor fields to proper values -- prepare the monitor m->set_header(dmw); @@ -1396,15 +1900,26 @@ // Note that a thread can inflate an object // that it has stack-locked -- as might happen in wait() -- directly // with CAS. That is, we can avoid the xchg-NULL .... ST idiom. - m->set_owner(mark.locker()); + if (AsyncDeflateIdleMonitors) { + m->set_owner_from(mark.locker(), NULL, DEFLATER_MARKER); + } else { + m->set_owner_from(mark.locker(), NULL); + } m->set_object(object); // TODO-FIXME: assert BasicLock->dhw != 0. + omh_p->set_om_ptr(m); + // Must preserve store ordering. The monitor state must // be stable at the time of publishing the monitor address. guarantee(object->mark() == markWord::INFLATING(), "invariant"); object->release_set_mark(markWord::encode(m)); + // Once ObjectMonitor is configured and the object is associated + // with the ObjectMonitor, it is safe to allow async deflation: + assert(m->is_new(), "freshly allocated monitor must be new"); + m->set_allocation_state(ObjectMonitor::Old); + // Hopefully the performance counters are allocated on distinct cache lines // to avoid false sharing on MP systems ... OM_PERFDATA_OP(Inflations, inc()); @@ -1417,7 +1932,8 @@ if (event.should_commit()) { post_monitor_inflate_event(&event, object, cause); } - return m; + ADIM_guarantee(!m->is_free(), "inflated monitor to be returned cannot be free"); + return; } // CASE: neutral @@ -1431,19 +1947,26 @@ // Catch if the object's header is not neutral (not locked and // not marked is what we care about here). 
- assert(mark.is_neutral(), "invariant: header=" INTPTR_FORMAT, mark.value()); - ObjectMonitor* m = om_alloc(self); + ADIM_guarantee(mark.is_neutral(), "invariant: header=" INTPTR_FORMAT, mark.value()); + ObjectMonitor* m = om_alloc(self, cause); // prepare m for installation - set monitor to initial state m->Recycle(); m->set_header(mark); + // If we leave _owner == DEFLATER_MARKER here, then the simple C2 + // ObjectMonitor enter optimization can no longer race with async + // deflation and reuse. m->set_object(object); m->_Responsible = NULL; m->_SpinDuration = ObjectMonitor::Knob_SpinLimit; // consider: keep metastats by type/class + omh_p->set_om_ptr(m); + if (object->cas_set_mark(markWord::encode(m), mark) != mark) { m->set_header(markWord::zero()); m->set_object(NULL); m->Recycle(); + omh_p->set_om_ptr(NULL); + // om_release() will reset the allocation state from New to Free. om_release(self, m, true); m = NULL; continue; @@ -1452,6 +1975,11 @@ // live-lock -- "Inflated" is an absorbing state. } + // Once the ObjectMonitor is configured and object is associated + // with the ObjectMonitor, it is safe to allow async deflation: + assert(m->is_new(), "freshly allocated monitor must be new"); + m->set_allocation_state(ObjectMonitor::Old); + // Hopefully the performance counters are allocated on distinct // cache lines to avoid false sharing on MP systems ... OM_PERFDATA_OP(Inflations, inc()); @@ -1464,13 +1992,15 @@ if (event.should_commit()) { post_monitor_inflate_event(&event, object, cause); } - return m; + ADIM_guarantee(!m->is_free(), "inflated monitor to be returned cannot be free"); + return; } } // We maintain a list of in-use monitors for each thread. // +// For safepoint based deflation: // deflate_thread_local_monitors() scans a single thread's in-use list, while // deflate_idle_monitors() scans only a global list of in-use monitors which // is populated only as a thread dies (see om_flush()). @@ -1489,6 +2019,31 @@ // typically drives the scavenge rate. Large heaps can mean infrequent GC, // which in turn can mean large(r) numbers of ObjectMonitors in circulation. // This is an unfortunate aspect of this design. +// +// For async deflation: +// If a special deflation request is made, then the safepoint based +// deflation mechanism is used. Otherwise, an async deflation request +// is registered with the ServiceThread and it is notified. + +void ObjectSynchronizer::do_safepoint_work(DeflateMonitorCounters* counters) { + assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); + + // The per-thread in-use lists are handled in + // ParallelSPCleanupThreadClosure::do_thread(). + + if (!AsyncDeflateIdleMonitors || is_special_deflation_requested()) { + // Use the older mechanism for the global in-use list or if a + // special deflation has been requested before the safepoint.
+ ObjectSynchronizer::deflate_idle_monitors(counters); + return; + } + + log_debug(monitorinflation)("requesting async deflation of idle monitors."); + // Request deflation of idle monitors by the ServiceThread: + set_is_async_deflation_requested(true); + MonitorLocker ml(Service_lock, Mutex::_no_safepoint_check_flag); + ml.notify_all(); +} // Deflate a single monitor if not in-use // Return true if deflated, false if in-use @@ -1507,7 +2062,9 @@ const markWord dmw = mid->header(); guarantee(dmw.is_neutral(), "invariant: header=" INTPTR_FORMAT, dmw.value()); - if (mid->is_busy()) { + if (mid->is_busy() || mid->ref_count() != 0) { + // Easy checks are first - the ObjectMonitor is busy or ObjectMonitor* + // is in use so no deflation. deflated = false; } else { // Deflate the monitor if it is no longer being used @@ -1523,21 +2080,34 @@ // Restore the header back to obj obj->release_set_mark(dmw); + if (AsyncDeflateIdleMonitors) { + // clear() expects the owner field to be NULL and we won't race + // with the simple C2 ObjectMonitor enter optimization since + // we're at a safepoint. DEFLATER_MARKER is the only non-NULL + // value we should see here. + mid->try_set_owner_from(NULL, DEFLATER_MARKER); + } mid->clear(); assert(mid->object() == NULL, "invariant: object=" INTPTR_FORMAT, p2i(mid->object())); + assert(mid->is_free(), "invariant"); // Move the deflated ObjectMonitor to the working free list - // defined by free_head_p and free_tail_p. + // defined by free_head_p and free_tail_p. No races on this list + // so no need for load_acquire() or store_release(). if (*free_head_p == NULL) *free_head_p = mid; if (*free_tail_p != NULL) { // We append to the list so the caller can use mid->_next_om // to fix the linkages in its context. ObjectMonitor* prevtail = *free_tail_p; // Should have been cleaned up by the caller: - assert(prevtail->_next_om == NULL, "cleaned up deflated?"); - prevtail->_next_om = mid; + // Note: Should not have to mark prevtail here since we're at a + // safepoint and ObjectMonitors on the local free list should + // not be accessed in parallel. + assert(prevtail->_next_om == NULL, "must be NULL: _next_om=" + INTPTR_FORMAT, p2i(prevtail->_next_om)); + set_next(prevtail, mid); } *free_tail_p = mid; // At this point, mid->_next_om still refers to its current @@ -1549,9 +2119,150 @@ return deflated; } -// Walk a given monitor list, and deflate idle monitors -// The given list could be a per-thread list or a global list -// Caller acquires gListLock as needed. +// Deflate the specified ObjectMonitor if not in-use using a JavaThread. +// Returns true if it was deflated and false otherwise. +// +// The async deflation protocol sets owner to DEFLATER_MARKER and +// makes ref_count negative as signals to contending threads that +// an async deflation is in progress. There are a number of checks +// as part of the protocol to make sure that the calling thread has +// not lost the race to a contending thread or to a thread that just +// wants to use the ObjectMonitor*. +// +// The ObjectMonitor has been successfully async deflated when: +// (owner == DEFLATER_MARKER && ref_count < 0) +// Contending threads or ObjectMonitor* using threads that see those +// values know to retry their operation. 
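// The three-part protocol above can be read as three CAS steps. What follows
// is a minimal, self-contained sketch of that dance using std::atomic in
// place of HotSpot's Atomic and OrderAccess wrappers; the SimpleMonitor type
// and DEFLATER sentinel are illustrative assumptions, not HotSpot code. The
// real deflate_monitor_using_JT() follows below.

#include <atomic>
#include <climits>

struct SimpleMonitor {
  std::atomic<void*> owner{nullptr};
  std::atomic<int>   ref_count{0};
  std::atomic<int>   contentions{0};
  std::atomic<int>   waiters{0};
};

static void* const DEFLATER = reinterpret_cast<void*>(-1);

// Returns true if the monitor was async deflated, false if a racer won.
static bool try_async_deflate(SimpleMonitor* m) {
  // Part 1: swing owner from NULL to the sentinel so any contending
  // thread is forced through the slow path.
  void* exp = nullptr;
  if (!m->owner.compare_exchange_strong(exp, DEFLATER)) {
    return false;  // Owned by somebody, so busy.
  }
  if (m->contentions.load() != 0 || m->waiters.load() != 0) {
    exp = DEFLATER;  // A racer entered or waited; undo part 1.
    m->owner.compare_exchange_strong(exp, nullptr);
    return false;
  }
  // Part 2: force ref_count negative so ObjectMonitor* users retry.
  int zero = 0;
  if (!m->ref_count.compare_exchange_strong(zero, -INT_MAX)) {
    exp = DEFLATER;  // An ObjectMonitor* is in use; undo part 1.
    m->owner.compare_exchange_strong(exp, nullptr);
    return false;
  }
  // Part 3: owner must still be the sentinel; this re-check also closes
  // the ref_count ABA window described above.
  if (m->owner.load() != DEFLATER) {
    m->ref_count.fetch_add(INT_MAX);  // Lost the race; restore ref_count.
    return false;
  }
  return true;  // owner == DEFLATER && ref_count < 0: deflated.
}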
+// +bool ObjectSynchronizer::deflate_monitor_using_JT(ObjectMonitor* mid, + ObjectMonitor** free_head_p, + ObjectMonitor** free_tail_p) { + assert(AsyncDeflateIdleMonitors, "sanity check"); + assert(Thread::current()->is_Java_thread(), "precondition"); + // A newly allocated ObjectMonitor should not be seen here so we + // avoid an endless inflate/deflate cycle. + assert(mid->is_old(), "must be old: allocation_state=%d", + (int) mid->allocation_state()); + + if (mid->is_busy() || mid->ref_count() != 0) { + // Easy checks are first - the ObjectMonitor is busy or ObjectMonitor* + // is in use so no deflation. + return false; + } + + if (mid->try_set_owner_from(DEFLATER_MARKER, NULL) == NULL) { + // ObjectMonitor is not owned by another thread. Our setting + // owner to DEFLATER_MARKER forces any contending thread through + // the slow path. This is just the first part of the async + // deflation dance. + + if (mid->_contentions != 0 || mid->_waiters != 0) { + // Another thread has raced to enter the ObjectMonitor after + // mid->is_busy() above or has already entered and waited on + // it which makes it busy so no deflation. Restore owner to + // NULL if it is still DEFLATER_MARKER. + mid->try_set_owner_from(NULL, DEFLATER_MARKER); + return false; + } + + if (Atomic::cmpxchg(-max_jint, &mid->_ref_count, (jint)0) == 0) { + // Make ref_count negative to force any contending threads or + // ObjectMonitor* using threads to retry. This is the second + // part of the async deflation dance. + + if (mid->owner_is_DEFLATER_MARKER()) { + // If owner is still DEFLATER_MARKER, then we have successfully + // signaled any contending threads to retry. If it is not, then we + // have lost the race to an entering thread and the ObjectMonitor + // is now busy. This is the third and final part of the async + // deflation dance. + // Note: This owner check solves the ABA problem with ref_count + // where another thread acquired the ObjectMonitor, finished + // using it and restored the ref_count to zero. + + // Sanity checks for the races: + guarantee(mid->_contentions == 0, "must be 0: contentions=%d", + mid->_contentions); + guarantee(mid->_waiters == 0, "must be 0: waiters=%d", mid->_waiters); + guarantee(mid->_cxq == NULL, "must be no contending threads: cxq=" + INTPTR_FORMAT, p2i(mid->_cxq)); + guarantee(mid->_EntryList == NULL, + "must be no entering threads: EntryList=" INTPTR_FORMAT, + p2i(mid->_EntryList)); + + const oop obj = (oop) mid->object(); + if (log_is_enabled(Trace, monitorinflation)) { + ResourceMark rm; + log_trace(monitorinflation)("deflate_monitor_using_JT: " + "object=" INTPTR_FORMAT ", mark=" + INTPTR_FORMAT ", type='%s'", + p2i(obj), obj->mark().value(), + obj->klass()->external_name()); + } + + // Install the old mark word if nobody else has already done it. + mid->install_displaced_markword_in_object(obj); + mid->clear_using_JT(); + + assert(mid->object() == NULL, "must be NULL: object=" INTPTR_FORMAT, + p2i(mid->object())); + assert(mid->is_free(), "must be free: allocation_state=%d", + (int) mid->allocation_state()); + + // Move the deflated ObjectMonitor to the working free list + // defined by free_head_p and free_tail_p. No races on this list + // so no need for load_acquire() or store_release(). + if (*free_head_p == NULL) { + // First one on the list. + *free_head_p = mid; + } + if (*free_tail_p != NULL) { + // We append to the list so the caller can use mid->_next_om + // to fix the linkages in its context. 
+ ObjectMonitor* prevtail = *free_tail_p; + // Should have been cleaned up by the caller: + ObjectMonitor* next = mark_next_loop(prevtail); + assert(unmarked_next(prevtail) == NULL, "must be NULL: _next_om=" + INTPTR_FORMAT, p2i(unmarked_next(prevtail))); + set_next(prevtail, mid); // prevtail now points to mid (and is unmarked) + } + *free_tail_p = mid; + + // At this point, mid->_next_om still refers to its current + // value and another ObjectMonitor's _next_om field still + // refers to this ObjectMonitor. Those linkages have to be + // cleaned up by the caller who has the complete context. + + // We leave owner == DEFLATER_MARKER and ref_count < 0 + // to force any racing threads to retry. + return true; // Success, ObjectMonitor has been deflated. + } + + // The owner was changed from DEFLATER_MARKER so we lost the + // race since the ObjectMonitor is now busy. + + // Add back max_jint to restore the ref_count field to its + // proper value (which may not be what we saw above): + Atomic::add(max_jint, &mid->_ref_count); + + assert(mid->ref_count() >= 0, "must not be negative: ref_count=%d", + mid->ref_count()); + return false; + } + + // The ref_count was no longer 0 so we lost the race since the + // ObjectMonitor is now busy or the ObjectMonitor* is now in use. + // Restore owner to NULL if it is still DEFLATER_MARKER: + mid->try_set_owner_from(NULL, DEFLATER_MARKER); + } + + // The owner field is no longer NULL so we lost the race since the + // ObjectMonitor is now busy. + return false; +} + +// Walk a given monitor list, and deflate idle monitors. +// The given list could be a per-thread list or a global list. // // In the case of parallel processing of thread local monitor lists, // work is done by Threads::parallel_threads_do() which ensures that @@ -1562,47 +2273,219 @@ // See also ParallelSPCleanupTask and // SafepointSynchronize::do_cleanup_tasks() in safepoint.cpp and // Threads::parallel_java_threads_do() in thread.cpp. -int ObjectSynchronizer::deflate_monitor_list(ObjectMonitor** list_p, +int ObjectSynchronizer::deflate_monitor_list(ObjectMonitor* volatile * list_p, + int volatile * count_p, ObjectMonitor** free_head_p, ObjectMonitor** free_tail_p) { - ObjectMonitor* mid; - ObjectMonitor* next; ObjectMonitor* cur_mid_in_use = NULL; + ObjectMonitor* mid = NULL; + ObjectMonitor* next = NULL; int deflated_count = 0; - for (mid = *list_p; mid != NULL;) { + // We use the simpler mark-mid-as-we-go protocol since there are no + // parallel list deletions because we are at a safepoint. + if (!mark_list_head(list_p, &mid, &next)) { + return 0; // The list is empty so nothing to deflate. + } + + while (true) { oop obj = (oop) mid->object(); if (obj != NULL && deflate_monitor(mid, obj, free_head_p, free_tail_p)) { // Deflation succeeded and already updated free_head_p and // free_tail_p as needed. Finish the move to the local free list // by unlinking mid from the global or per-thread in-use list. - if (mid == *list_p) { - *list_p = mid->_next_om; - } else if (cur_mid_in_use != NULL) { - cur_mid_in_use->_next_om = mid->_next_om; // maintain the current thread in-use list + if (cur_mid_in_use == NULL) { + // mid is the list head and it is marked. Switch the list head + // to next which unmarks the list head, but leaves mid marked: + OrderAccess::release_store(list_p, next); + } else { + // mid is marked.
Switch cur_mid_in_use's next field to next + // which is safe because we have no parallel list deletions, + // but we leave mid marked: + OrderAccess::release_store(&cur_mid_in_use->_next_om, next); } - next = mid->_next_om; - mid->_next_om = NULL; // This mid is current tail in the free_head_p list + // At this point mid is disconnected from the in-use list so + // its marked next field no longer has any effects. + deflated_count++; + Atomic::dec(count_p); + // mid is current tail in the free_head_p list so NULL terminate it + // (which also unmarks it): + set_next(mid, NULL); + + // All the list management is done so move on to the next one: mid = next; + } else { + set_next(mid, next); // unmark next field + + // All the list management is done so move on to the next one: + cur_mid_in_use = mid; + mid = next; + } + if (mid == NULL) { + break; // Reached end of the list so nothing more to deflate. + } + // Mark mid's next field so we can possibly deflate it: + next = mark_next_loop(mid); + } + return deflated_count; +} + +// Walk a given ObjectMonitor list and deflate idle ObjectMonitors using +// a JavaThread. Returns the number of deflated ObjectMonitors. The given +// list could be a per-thread in-use list or the global in-use list. +// If a safepoint has started, then we save state via saved_mid_in_use_p +// and return to the caller to honor the safepoint. +// +int ObjectSynchronizer::deflate_monitor_list_using_JT(ObjectMonitor* volatile * list_p, + int volatile * count_p, + ObjectMonitor** free_head_p, + ObjectMonitor** free_tail_p, + ObjectMonitor** saved_mid_in_use_p) { + assert(AsyncDeflateIdleMonitors, "sanity check"); + assert(Thread::current()->is_Java_thread(), "precondition"); + + ObjectMonitor* cur_mid_in_use = NULL; + ObjectMonitor* mid = NULL; + ObjectMonitor* next = NULL; + ObjectMonitor* next_next = NULL; + int deflated_count = 0; + + // We use the more complicated mark-cur_mid_in_use-and-mid-as-we-go + // protocol because om_release() can do list deletions in parallel. + // We also mark-next-next-as-we-go to prevent an om_flush() that is + // behind this thread from passing us. + if (*saved_mid_in_use_p == NULL) { + // No saved state so start at the beginning. + // Mark the list head's next field so we can possibly deflate it: + if (!mark_list_head(list_p, &mid, &next)) { + return 0; // The list is empty so nothing to deflate. + } + } else { + // We're restarting after a safepoint so restore the necessary state + // before we resume. + cur_mid_in_use = *saved_mid_in_use_p; + // Mark cur_mid_in_use's next field so we can possibly update its + // next field to extract a deflated ObjectMonitor. + mid = mark_next_loop(cur_mid_in_use); + if (mid == NULL) { + set_next(cur_mid_in_use, NULL); // unmark next field + *saved_mid_in_use_p = NULL; + return 0; // The remainder is empty so nothing more to deflate. + } + // Mark mid's next field so we can possibly deflate it: + next = mark_next_loop(mid); + } + + while (true) { + // The current mid's next field is marked at this point. If we have + // a cur_mid_in_use, then its next field is also marked at this point. + + if (next != NULL) { + // We mark next's next field so that an om_flush() + // thread that is behind us cannot pass us when we + // unmark the current mid's next field. + next_next = mark_next_loop(next); + } + + // Only try to deflate if there is an associated Java object and if + // mid is old (is not newly allocated and is not newly freed). 
+ if (mid->object() != NULL && mid->is_old() && + deflate_monitor_using_JT(mid, free_head_p, free_tail_p)) { + // Deflation succeeded and already updated free_head_p and + // free_tail_p as needed. Finish the move to the local free list + // by unlinking mid from the global or per-thread in-use list. + if (cur_mid_in_use == NULL) { + // mid is the list head and it is marked. Switch the list head + // to next which is also marked (if not NULL) and also leave + // mid marked: + OrderAccess::release_store(list_p, next); + } else { + ObjectMonitor* marked_next = mark_om_ptr(next); + // mid and cur_mid_in_use are marked. Switch cur_mid_in_use's + // next field to marked_next and also leave mid marked: + OrderAccess::release_store(&cur_mid_in_use->_next_om, marked_next); + } + // At this point mid is disconnected from the in-use list so + // its marked next field no longer has any effects. deflated_count++; + Atomic::dec(count_p); + // mid is current tail in the free_head_p list so NULL terminate it + // (which also unmarks it): + set_next(mid, NULL); + + // All the list management is done so move on to the next one: + mid = next; // mid keeps non-NULL next's marked next field + next = next_next; } else { + // mid is considered in-use if it does not have an associated + // Java object or mid is not old or deflation did not succeed. + // A mid->is_new() node can be seen here when it is freshly + // returned by om_alloc() (and skips the deflation code path). + // A mid->is_old() node can be seen here when deflation failed. + // A mid->is_free() node can be seen here when a fresh node from + // om_alloc() is released by om_release() due to losing the race + // in inflate(). + + // All the list management is done so move on to the next one: + if (cur_mid_in_use != NULL) { + set_next(cur_mid_in_use, mid); // unmark cur_mid_in_use + } + // The next cur_mid_in_use keeps mid's marked next field so + // that it is stable for a possible next field change. It + // cannot be modified by om_release() while it is marked. cur_mid_in_use = mid; - mid = mid->_next_om; + mid = next; // mid keeps non-NULL next's marked next field + next = next_next; + + if (SafepointSynchronize::is_synchronizing() && + cur_mid_in_use != OrderAccess::load_acquire(list_p) && + cur_mid_in_use->is_old()) { + // If a safepoint has started and cur_mid_in_use is not the list + // head and is old, then it is safe to use as saved state. Return + // to the caller before blocking. + *saved_mid_in_use_p = cur_mid_in_use; + set_next(cur_mid_in_use, mid); // unmark cur_mid_in_use + if (mid != NULL) { + set_next(mid, next); // unmark mid + } + return deflated_count; + } } + if (mid == NULL) { + if (cur_mid_in_use != NULL) { + set_next(cur_mid_in_use, mid); // unmark cur_mid_in_use + } + break; // Reached end of the list so nothing more to deflate. + } + + // The current mid's next field is marked at this point. If we have + // a cur_mid_in_use, then its next field is also marked at this point. } + // We finished the list without a safepoint starting so there's + // no need to save state.
+ *saved_mid_in_use_p = NULL; return deflated_count; } void ObjectSynchronizer::prepare_deflate_idle_monitors(DeflateMonitorCounters* counters) { - counters->n_in_use = 0; // currently associated with objects - counters->n_in_circulation = 0; // extant - counters->n_scavenged = 0; // reclaimed (global and per-thread) - counters->per_thread_scavenged = 0; // per-thread scavenge total - counters->per_thread_times = 0.0; // per-thread scavenge times + OrderAccess::release_store(&counters->n_in_use, 0); // currently associated with objects + OrderAccess::release_store(&counters->n_in_circulation, 0); // extant + OrderAccess::release_store(&counters->n_scavenged, 0); // reclaimed (global and per-thread) + OrderAccess::release_store(&counters->per_thread_scavenged, 0); // per-thread scavenge total + counters->per_thread_times = 0.0; // per-thread scavenge times } void ObjectSynchronizer::deflate_idle_monitors(DeflateMonitorCounters* counters) { assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); + + if (AsyncDeflateIdleMonitors) { + // Nothing to do when global idle ObjectMonitors are deflated using + // a JavaThread unless a special deflation has been requested. + if (!is_special_deflation_requested()) { + return; + } + } + bool deflated = false; ObjectMonitor* free_head_p = NULL; // Local SLL of scavenged monitors @@ -1613,33 +2496,28 @@ timer.start(); } - // Prevent om_flush from changing mids in Thread dtor's during deflation - // And in case the vm thread is acquiring a lock during a safepoint - // See e.g. 6320749 - Thread::muxAcquire(&gListLock, "deflate_idle_monitors"); - // Note: the thread-local monitors lists get deflated in // a separate pass. See deflate_thread_local_monitors(). // For moribund threads, scan g_om_in_use_list int deflated_count = 0; - if (g_om_in_use_list) { - counters->n_in_circulation += g_om_in_use_count; - deflated_count = deflate_monitor_list((ObjectMonitor **)&g_om_in_use_list, &free_head_p, &free_tail_p); - g_om_in_use_count -= deflated_count; - counters->n_scavenged += deflated_count; - counters->n_in_use += g_om_in_use_count; + if (OrderAccess::load_acquire(&g_om_in_use_list) != NULL) { + // Update n_in_circulation before g_om_in_use_count is updated by deflation. + Atomic::add(OrderAccess::load_acquire(&g_om_in_use_count), &counters->n_in_circulation); + + deflated_count = deflate_monitor_list(&g_om_in_use_list, &g_om_in_use_count, &free_head_p, &free_tail_p); + Atomic::add(OrderAccess::load_acquire(&g_om_in_use_count), &counters->n_in_use); } if (free_head_p != NULL) { // Move the deflated ObjectMonitors back to the global free list. - guarantee(free_tail_p != NULL && counters->n_scavenged > 0, "invariant"); - assert(free_tail_p->_next_om == NULL, "invariant"); - // constant-time list splice - prepend scavenged segment to g_free_list - free_tail_p->_next_om = g_free_list; - g_free_list = free_head_p; + // No races on the working free list so no need for load_acquire(). 
+ guarantee(free_tail_p != NULL && deflated_count > 0, "invariant"); + assert(free_tail_p->_next_om == NULL, "must be NULL: _next_om=" + INTPTR_FORMAT, p2i(free_tail_p->_next_om)); + prepend_list_to_g_free_list(free_head_p, free_tail_p, deflated_count); + Atomic::add(deflated_count, &counters->n_scavenged); } - Thread::muxRelease(&gListLock); timer.stop(); LogStreamHandle(Debug, monitorinflation) lsh_debug; @@ -1655,39 +2533,228 @@ } } +class HandshakeForDeflation : public ThreadClosure { + public: + void do_thread(Thread* thread) { + log_trace(monitorinflation)("HandshakeForDeflation::do_thread: thread=" + INTPTR_FORMAT, p2i(thread)); + } +}; + +void ObjectSynchronizer::deflate_idle_monitors_using_JT() { + assert(AsyncDeflateIdleMonitors, "sanity check"); + + // Deflate any global idle monitors. + deflate_global_idle_monitors_using_JT(); + + int count = 0; + for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) { + if (jt->om_in_use_count > 0 && !jt->is_exiting()) { + // This JavaThread is using ObjectMonitors so deflate any that + // are idle unless this JavaThread is exiting; do not race with + // ObjectSynchronizer::om_flush(). + deflate_per_thread_idle_monitors_using_JT(jt); + count++; + } + } + if (count > 0) { + log_debug(monitorinflation)("did async deflation of idle monitors for %d thread(s).", count); + } + // The ServiceThread's async deflation request has been processed. + set_is_async_deflation_requested(false); + + if (HandshakeAfterDeflateIdleMonitors && g_om_wait_count > 0) { + // There are deflated ObjectMonitors waiting for a handshake + // (or a safepoint) for safety. + + // g_wait_list and g_om_wait_count are only updated by the calling + // thread so no need for load_acquire() or release_store(). + ObjectMonitor* list = g_wait_list; + ADIM_guarantee(list != NULL, "g_wait_list must not be NULL"); + int count = g_om_wait_count; + g_wait_list = NULL; + g_om_wait_count = 0; + + // Find the tail for prepend_list_to_common(). + int l_count = 0; + ObjectMonitor* tail = NULL; + for (ObjectMonitor* n = list; n != NULL; n = unmarked_next(n)) { + tail = n; + l_count++; + } + ADIM_guarantee(count == l_count, "count=%d != l_count=%d", count, l_count); + + // Will execute a safepoint if !ThreadLocalHandshakes: + HandshakeForDeflation hfd_tc; + Handshake::execute(&hfd_tc); + + prepend_list_to_common(list, tail, count, &g_free_list, &g_om_free_count); + + log_info(monitorinflation)("moved %d idle monitors from global waiting list to global free list", count); + } +} + +// Deflate global idle ObjectMonitors using a JavaThread. +// +void ObjectSynchronizer::deflate_global_idle_monitors_using_JT() { + assert(AsyncDeflateIdleMonitors, "sanity check"); + assert(Thread::current()->is_Java_thread(), "precondition"); + JavaThread* self = JavaThread::current(); + + deflate_common_idle_monitors_using_JT(true /* is_global */, self); +} + +// Deflate the specified JavaThread's idle ObjectMonitors using a JavaThread. +// +void ObjectSynchronizer::deflate_per_thread_idle_monitors_using_JT(JavaThread* target) { + assert(AsyncDeflateIdleMonitors, "sanity check"); + assert(Thread::current()->is_Java_thread(), "precondition"); + + deflate_common_idle_monitors_using_JT(false /* !is_global */, target); +} + +// Deflate global or per-thread idle ObjectMonitors using a JavaThread. 
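// The function below alternates deflation with safepoint cooperation: walk a
// chunk of the list, and if the walker parked a cursor because a safepoint
// started, block and then resume from that cursor. A hedged, self-contained
// sketch of that shape; ResumeCursor, walk_chunk() and yield_to_safepoint()
// are illustrative stand-ins, not HotSpot APIs (HotSpot blocks via
// ThreadBlockInVM).

struct ResumeCursor {
  void* saved_mid_in_use = nullptr;  // NULL means the walk ran to completion.
};

static int walk_chunk(ResumeCursor* c) {
  // Stub: a real walker deflates until the list is exhausted or a safepoint
  // starts, recording its position in c->saved_mid_in_use when interrupted.
  c->saved_mid_in_use = nullptr;
  return 0;
}

static void yield_to_safepoint() {
  // Stub: cooperate with the safepoint protocol before resuming.
}

static int deflate_with_safepoint_pauses() {
  ResumeCursor cursor;
  int deflated = 0;
  do {
    deflated += walk_chunk(&cursor);
    if (cursor.saved_mid_in_use != nullptr) {
      yield_to_safepoint();  // Honor the safepoint, then loop to resume.
    }
  } while (cursor.saved_mid_in_use != nullptr);
  return deflated;
}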
+// +void ObjectSynchronizer::deflate_common_idle_monitors_using_JT(bool is_global, JavaThread* target) { + JavaThread* self = JavaThread::current(); + + int deflated_count = 0; + ObjectMonitor* free_head_p = NULL; // Local SLL of scavenged ObjectMonitors + ObjectMonitor* free_tail_p = NULL; + ObjectMonitor* saved_mid_in_use_p = NULL; + elapsedTimer timer; + + if (log_is_enabled(Info, monitorinflation)) { + timer.start(); + } + + if (is_global) { + OM_PERFDATA_OP(MonExtant, set_value(OrderAccess::load_acquire(&g_om_in_use_count))); + } else { + OM_PERFDATA_OP(MonExtant, inc(OrderAccess::load_acquire(&target->om_in_use_count))); + } + + do { + int local_deflated_count; + if (is_global) { + local_deflated_count = deflate_monitor_list_using_JT(&g_om_in_use_list, &g_om_in_use_count, &free_head_p, &free_tail_p, &saved_mid_in_use_p); + } else { + local_deflated_count = deflate_monitor_list_using_JT(&target->om_in_use_list, &target->om_in_use_count, &free_head_p, &free_tail_p, &saved_mid_in_use_p); + } + deflated_count += local_deflated_count; + + if (free_head_p != NULL) { + // Move the deflated ObjectMonitors to the global free list. + // No races on the working list so no need for load_acquire(). + guarantee(free_tail_p != NULL && local_deflated_count > 0, "free_tail_p=" INTPTR_FORMAT ", local_deflated_count=%d", p2i(free_tail_p), local_deflated_count); + // Note: The target thread can be doing an om_alloc() that + // is trying to prepend an ObjectMonitor on its in-use list + // at the same time that we have deflated the current in-use + // list head and put it on the local free list. prepend_list_to_common() + // will detect the race and retry, which avoids list corruption, + // but the next field in free_tail_p can flicker to marked + // and then unmarked while prepend_list_to_common() is sorting it + // all out. + assert(unmarked_next(free_tail_p) == NULL, "must be NULL: _next_om=" + INTPTR_FORMAT, p2i(unmarked_next(free_tail_p))); + + if (HandshakeAfterDeflateIdleMonitors) { + prepend_list_to_g_wait_list(free_head_p, free_tail_p, local_deflated_count); + } else { + prepend_list_to_g_free_list(free_head_p, free_tail_p, local_deflated_count); + } + + OM_PERFDATA_OP(Deflations, inc(local_deflated_count)); + } + + if (saved_mid_in_use_p != NULL) { + // deflate_monitor_list_using_JT() detected a safepoint starting. + timer.stop(); + { + if (is_global) { + log_debug(monitorinflation)("pausing deflation of global idle monitors for a safepoint."); + } else { + log_debug(monitorinflation)("jt=" INTPTR_FORMAT ": pausing deflation of per-thread idle monitors for a safepoint.", p2i(target)); + } + assert(SafepointSynchronize::is_synchronizing(), "sanity check"); + ThreadBlockInVM blocker(self); + } + // Prepare for another loop after the safepoint.
+ free_head_p = NULL; + free_tail_p = NULL; + if (log_is_enabled(Info, monitorinflation)) { + timer.start(); + } + } + } while (saved_mid_in_use_p != NULL); + timer.stop(); + + LogStreamHandle(Debug, monitorinflation) lsh_debug; + LogStreamHandle(Info, monitorinflation) lsh_info; + LogStream* ls = NULL; + if (log_is_enabled(Debug, monitorinflation)) { + ls = &lsh_debug; + } else if (deflated_count != 0 && log_is_enabled(Info, monitorinflation)) { + ls = &lsh_info; + } + if (ls != NULL) { + if (is_global) { + ls->print_cr("async-deflating global idle monitors, %3.7f secs, %d monitors", timer.seconds(), deflated_count); + } else { + ls->print_cr("jt=" INTPTR_FORMAT ": async-deflating per-thread idle monitors, %3.7f secs, %d monitors", p2i(target), timer.seconds(), deflated_count); + } + } +} + void ObjectSynchronizer::finish_deflate_idle_monitors(DeflateMonitorCounters* counters) { // Report the cumulative time for deflating each thread's idle // monitors. Note: if the work is split among more than one // worker thread, then the reported time will likely be more // than a beginning to end measurement of the phase. - log_info(safepoint, cleanup)("deflating per-thread idle monitors, %3.7f secs, monitors=%d", counters->per_thread_times, counters->per_thread_scavenged); + // Note: AsyncDeflateIdleMonitors only deflates per-thread idle + // monitors at a safepoint when a special deflation has been requested. + log_info(safepoint, cleanup)("deflating per-thread idle monitors, %3.7f secs, monitors=%d", + counters->per_thread_times, + OrderAccess::load_acquire(&counters->per_thread_scavenged)); + + bool needs_special_deflation = is_special_deflation_requested(); + if (!AsyncDeflateIdleMonitors || needs_special_deflation) { + // AsyncDeflateIdleMonitors does not use these counters unless + // there is a special deflation request. - g_om_free_count += counters->n_scavenged; + OM_PERFDATA_OP(Deflations, inc(counters->n_scavenged)); + OM_PERFDATA_OP(MonExtant, set_value(counters->n_in_circulation)); + } if (log_is_enabled(Debug, monitorinflation)) { // exit_globals()'s call to audit_and_print_stats() is done // at the Info level. ObjectSynchronizer::audit_and_print_stats(false /* on_exit */); } else if (log_is_enabled(Info, monitorinflation)) { - Thread::muxAcquire(&gListLock, "finish_deflate_idle_monitors"); log_info(monitorinflation)("g_om_population=%d, g_om_in_use_count=%d, " - "g_om_free_count=%d", g_om_population, - g_om_in_use_count, g_om_free_count); - Thread::muxRelease(&gListLock); + "g_om_free_count=%d, g_om_wait_count=%d", + OrderAccess::load_acquire(&g_om_population), + OrderAccess::load_acquire(&g_om_in_use_count), + OrderAccess::load_acquire(&g_om_free_count), + OrderAccess::load_acquire(&g_om_wait_count)); } ForceMonitorScavenge = 0; // Reset - - OM_PERFDATA_OP(Deflations, inc(counters->n_scavenged)); - OM_PERFDATA_OP(MonExtant, set_value(counters->n_in_circulation)); - GVars.stw_random = os::random(); GVars.stw_cycle++; + if (needs_special_deflation) { + set_is_special_deflation_requested(false); // special deflation is done + } } void ObjectSynchronizer::deflate_thread_local_monitors(Thread* thread, DeflateMonitorCounters* counters) { assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint"); + if (AsyncDeflateIdleMonitors && !is_special_deflation_requested()) { + // Nothing to do if a special deflation has NOT been requested. 
+ return; + } + ObjectMonitor* free_head_p = NULL; // Local SLL of scavenged monitors ObjectMonitor* free_tail_p = NULL; elapsedTimer timer; @@ -1697,25 +2764,21 @@ timer.start(); } - int deflated_count = deflate_monitor_list(thread->om_in_use_list_addr(), &free_head_p, &free_tail_p); + // Update n_in_circulation before om_in_use_count is updated by deflation. + Atomic::add(OrderAccess::load_acquire(&thread->om_in_use_count), &counters->n_in_circulation); - Thread::muxAcquire(&gListLock, "deflate_thread_local_monitors"); - - // Adjust counters - counters->n_in_circulation += thread->om_in_use_count; - thread->om_in_use_count -= deflated_count; - counters->n_scavenged += deflated_count; - counters->n_in_use += thread->om_in_use_count; - counters->per_thread_scavenged += deflated_count; + int deflated_count = deflate_monitor_list(&thread->om_in_use_list, &thread->om_in_use_count, &free_head_p, &free_tail_p); + Atomic::add(OrderAccess::load_acquire(&thread->om_in_use_count), &counters->n_in_use); if (free_head_p != NULL) { // Move the deflated ObjectMonitors back to the global free list. + // No races on the working list so no need for load_acquire(). guarantee(free_tail_p != NULL && deflated_count > 0, "invariant"); - assert(free_tail_p->_next_om == NULL, "invariant"); - - // constant-time list splice - prepend scavenged segment to g_free_list - free_tail_p->_next_om = g_free_list; - g_free_list = free_head_p; + assert(free_tail_p->_next_om == NULL, "must be NULL: _next_om=" + INTPTR_FORMAT, p2i(free_tail_p->_next_om)); + prepend_list_to_g_free_list(free_head_p, free_tail_p, deflated_count); + Atomic::add(deflated_count, &counters->n_scavenged); + Atomic::add(deflated_count, &counters->per_thread_scavenged); } timer.stop(); @@ -1724,8 +2787,6 @@ // should be cheap. counters->per_thread_times += timer.seconds(); - Thread::muxRelease(&gListLock); - LogStreamHandle(Debug, monitorinflation) lsh_debug; LogStreamHandle(Info, monitorinflation) lsh_info; LogStream* ls = NULL; @@ -1776,9 +2837,7 @@ assert(THREAD == JavaThread::current(), "must be current Java thread"); NoSafepointVerifier nsv; ReleaseJavaMonitorsClosure rjmc(THREAD); - Thread::muxAcquire(&gListLock, "release_monitors_owned_by_thread"); ObjectSynchronizer::monitors_iterate(&rjmc); - Thread::muxRelease(&gListLock); THREAD->clear_pending_exception(); } @@ -1832,11 +2891,6 @@ } assert(ls != NULL, "sanity check"); - if (!on_exit) { - // Not at VM exit so grab the global list lock. - Thread::muxAcquire(&gListLock, "audit_and_print_stats"); - } - // Log counts for the global and per-thread monitor lists: int chk_om_population = log_monitor_list_counts(ls); int error_cnt = 0; @@ -1844,14 +2898,19 @@ ls->print_cr("Checking global lists:"); // Check g_om_population: - if (g_om_population == chk_om_population) { + if (OrderAccess::load_acquire(&g_om_population) == chk_om_population) { ls->print_cr("g_om_population=%d equals chk_om_population=%d", - g_om_population, chk_om_population); + OrderAccess::load_acquire(&g_om_population), + chk_om_population); } else { - ls->print_cr("ERROR: g_om_population=%d is not equal to " - "chk_om_population=%d", g_om_population, + // With lock free access to the monitor lists, it is possible for + // log_monitor_list_counts() to return a value that doesn't match + // g_om_population. So far a higher value has been seen in testing + // so something is being double counted by log_monitor_list_counts(). 
+ ls->print_cr("WARNING: g_om_population=%d is not equal to " + "chk_om_population=%d", + OrderAccess::load_acquire(&g_om_population), chk_om_population); - error_cnt++; } // Check g_om_in_use_list and g_om_in_use_count: @@ -1860,8 +2919,9 @@ // Check g_free_list and g_om_free_count: chk_global_free_list_and_count(ls, &error_cnt); - if (!on_exit) { - Thread::muxRelease(&gListLock); + if (HandshakeAfterDeflateIdleMonitors) { + // Check g_wait_list and g_om_wait_count: + chk_global_wait_list_and_count(ls, &error_cnt); } ls->print_cr("Checking per-thread lists:"); @@ -1885,7 +2945,7 @@ // When exiting this log output is at the Info level. When called // at a safepoint, this log output is at the Trace level since // there can be a lot of it. - log_in_use_monitor_details(ls, on_exit); + log_in_use_monitor_details(ls); } ls->flush(); @@ -1914,12 +2974,13 @@ ": free per-thread monitor must have NULL _header " "field: _header=" INTPTR_FORMAT, p2i(jt), p2i(n), n->header().value()); - } else { + *error_cnt_p = *error_cnt_p + 1; + } else if (!AsyncDeflateIdleMonitors) { out->print_cr("ERROR: monitor=" INTPTR_FORMAT ": free global monitor " "must have NULL _header field: _header=" INTPTR_FORMAT, p2i(n), n->header().value()); + *error_cnt_p = *error_cnt_p + 1; } - *error_cnt_p = *error_cnt_p + 1; } if (n->object() != NULL) { if (jt != NULL) { @@ -1940,17 +3001,44 @@ void ObjectSynchronizer::chk_global_free_list_and_count(outputStream * out, int *error_cnt_p) { int chk_om_free_count = 0; - for (ObjectMonitor* n = g_free_list; n != NULL; n = n->_next_om) { + for (ObjectMonitor* n = OrderAccess::load_acquire(&g_free_list); n != NULL; n = unmarked_next(n)) { chk_free_entry(NULL /* jt */, n, out, error_cnt_p); chk_om_free_count++; } - if (g_om_free_count == chk_om_free_count) { + if (OrderAccess::load_acquire(&g_om_free_count) == chk_om_free_count) { out->print_cr("g_om_free_count=%d equals chk_om_free_count=%d", - g_om_free_count, chk_om_free_count); + OrderAccess::load_acquire(&g_om_free_count), + chk_om_free_count); } else { - out->print_cr("ERROR: g_om_free_count=%d is not equal to " - "chk_om_free_count=%d", g_om_free_count, + // With lock free access to g_free_list, it is possible for an + // ObjectMonitor to be prepended to g_free_list after we started + // calculating chk_om_free_count so g_om_free_count may not + // match anymore. + out->print_cr("WARNING: g_om_free_count=%d is not equal to " + "chk_om_free_count=%d", + OrderAccess::load_acquire(&g_om_free_count), chk_om_free_count); + } +} + +// Check the global wait list and count; log the results of the checks. 
+void ObjectSynchronizer::chk_global_wait_list_and_count(outputStream * out, + int *error_cnt_p) { + int chk_om_wait_count = 0; + for (ObjectMonitor* n = OrderAccess::load_acquire(&g_wait_list); n != NULL; n = unmarked_next(n)) { + // Rules for g_wait_list are the same as for g_free_list: + chk_free_entry(NULL /* jt */, n, out, error_cnt_p); + chk_om_wait_count++; + } + if (OrderAccess::load_acquire(&g_om_wait_count) == chk_om_wait_count) { + out->print_cr("g_om_wait_count=%d equals chk_om_wait_count=%d", + OrderAccess::load_acquire(&g_om_wait_count), + chk_om_wait_count); + } else { + out->print_cr("ERROR: g_om_wait_count=%d is not equal to " + "chk_om_wait_count=%d", + OrderAccess::load_acquire(&g_om_wait_count), + chk_om_wait_count); *error_cnt_p = *error_cnt_p + 1; } } @@ -1959,17 +3047,21 @@ void ObjectSynchronizer::chk_global_in_use_list_and_count(outputStream * out, int *error_cnt_p) { int chk_om_in_use_count = 0; - for (ObjectMonitor* n = g_om_in_use_list; n != NULL; n = n->_next_om) { + for (ObjectMonitor* n = OrderAccess::load_acquire(&g_om_in_use_list); n != NULL; n = unmarked_next(n)) { chk_in_use_entry(NULL /* jt */, n, out, error_cnt_p); chk_om_in_use_count++; } - if (g_om_in_use_count == chk_om_in_use_count) { - out->print_cr("g_om_in_use_count=%d equals chk_om_in_use_count=%d", g_om_in_use_count, + if (OrderAccess::load_acquire(&g_om_in_use_count) == chk_om_in_use_count) { + out->print_cr("g_om_in_use_count=%d equals chk_om_in_use_count=%d", + OrderAccess::load_acquire(&g_om_in_use_count), chk_om_in_use_count); } else { - out->print_cr("ERROR: g_om_in_use_count=%d is not equal to chk_om_in_use_count=%d", - g_om_in_use_count, chk_om_in_use_count); - *error_cnt_p = *error_cnt_p + 1; + // With lock free access to the monitor lists, it is possible for + // an exiting JavaThread to put its in-use ObjectMonitors on the + // global in-use list after chk_om_in_use_count is calculated above.
+ out->print_cr("WARNING: g_om_in_use_count=%d is not equal to chk_om_in_use_count=%d", + OrderAccess::load_acquire(&g_om_in_use_count), + chk_om_in_use_count); } } @@ -2037,16 +3129,19 @@ outputStream * out, int *error_cnt_p) { int chk_om_free_count = 0; - for (ObjectMonitor* n = jt->om_free_list; n != NULL; n = n->_next_om) { + for (ObjectMonitor* n = OrderAccess::load_acquire(&jt->om_free_list); n != NULL; n = unmarked_next(n)) { chk_free_entry(jt, n, out, error_cnt_p); chk_om_free_count++; } - if (jt->om_free_count == chk_om_free_count) { + if (OrderAccess::load_acquire(&jt->om_free_count) == chk_om_free_count) { out->print_cr("jt=" INTPTR_FORMAT ": om_free_count=%d equals " - "chk_om_free_count=%d", p2i(jt), jt->om_free_count, chk_om_free_count); + "chk_om_free_count=%d", p2i(jt), + OrderAccess::load_acquire(&jt->om_free_count), + chk_om_free_count); } else { out->print_cr("ERROR: jt=" INTPTR_FORMAT ": om_free_count=%d is not " - "equal to chk_om_free_count=%d", p2i(jt), jt->om_free_count, + "equal to chk_om_free_count=%d", p2i(jt), + OrderAccess::load_acquire(&jt->om_free_count), chk_om_free_count); *error_cnt_p = *error_cnt_p + 1; } @@ -2057,17 +3152,19 @@ outputStream * out, int *error_cnt_p) { int chk_om_in_use_count = 0; - for (ObjectMonitor* n = jt->om_in_use_list; n != NULL; n = n->_next_om) { + for (ObjectMonitor* n = OrderAccess::load_acquire(&jt->om_in_use_list); n != NULL; n = unmarked_next(n)) { chk_in_use_entry(jt, n, out, error_cnt_p); chk_om_in_use_count++; } - if (jt->om_in_use_count == chk_om_in_use_count) { + if (OrderAccess::load_acquire(&jt->om_in_use_count) == chk_om_in_use_count) { out->print_cr("jt=" INTPTR_FORMAT ": om_in_use_count=%d equals " - "chk_om_in_use_count=%d", p2i(jt), jt->om_in_use_count, + "chk_om_in_use_count=%d", p2i(jt), + OrderAccess::load_acquire(&jt->om_in_use_count), chk_om_in_use_count); } else { out->print_cr("ERROR: jt=" INTPTR_FORMAT ": om_in_use_count=%d is not " - "equal to chk_om_in_use_count=%d", p2i(jt), jt->om_in_use_count, + "equal to chk_om_in_use_count=%d", p2i(jt), + OrderAccess::load_acquire(&jt->om_in_use_count), chk_om_in_use_count); *error_cnt_p = *error_cnt_p + 1; } @@ -2076,27 +3173,22 @@ // Log details about ObjectMonitors on the in-use lists. The 'BHL' // flags indicate why the entry is in-use, 'object' and 'object type' // indicate the associated object and its type. -void ObjectSynchronizer::log_in_use_monitor_details(outputStream * out, - bool on_exit) { - if (!on_exit) { - // Not at VM exit so grab the global list lock. 
- Thread::muxAcquire(&gListLock, "log_in_use_monitor_details"); - } - +void ObjectSynchronizer::log_in_use_monitor_details(outputStream * out) { stringStream ss; - if (g_om_in_use_count > 0) { + if (OrderAccess::load_acquire(&g_om_in_use_count) > 0) { out->print_cr("In-use global monitor info:"); out->print_cr("(B -> is_busy, H -> has hash code, L -> lock status)"); - out->print_cr("%18s %s %18s %18s", - "monitor", "BHL", "object", "object type"); - out->print_cr("================== === ================== =================="); - for (ObjectMonitor* n = g_om_in_use_list; n != NULL; n = n->_next_om) { + out->print_cr("%18s %s %7s %18s %18s", + "monitor", "BHL", "ref_cnt", "object", "object type"); + out->print_cr("================== === ======= ================== =================="); + for (ObjectMonitor* n = OrderAccess::load_acquire(&g_om_in_use_list); n != NULL; n = unmarked_next(n)) { const oop obj = (oop) n->object(); const markWord mark = n->header(); ResourceMark rm; - out->print(INTPTR_FORMAT " %d%d%d " INTPTR_FORMAT " %s", p2i(n), - n->is_busy() != 0, mark.hash() != 0, n->owner() != NULL, - p2i(obj), obj->klass()->external_name()); + out->print(INTPTR_FORMAT " %d%d%d %7d " INTPTR_FORMAT " %s", + p2i(n), n->is_busy() != 0, mark.hash() != 0, + n->owner() != NULL, (int)n->ref_count(), p2i(obj), + obj->klass()->external_name()); if (n->is_busy() != 0) { out->print(" (%s)", n->is_busy_to_string(&ss)); ss.reset(); @@ -2105,24 +3197,20 @@ } } - if (!on_exit) { - Thread::muxRelease(&gListLock); - } - out->print_cr("In-use per-thread monitor info:"); out->print_cr("(B -> is_busy, H -> has hash code, L -> lock status)"); - out->print_cr("%18s %18s %s %18s %18s", - "jt", "monitor", "BHL", "object", "object type"); - out->print_cr("================== ================== === ================== =================="); + out->print_cr("%18s %18s %s %7s %18s %18s", + "jt", "monitor", "BHL", "ref_cnt", "object", "object type"); + out->print_cr("================== ================== === ======= ================== =================="); for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) { - for (ObjectMonitor* n = jt->om_in_use_list; n != NULL; n = n->_next_om) { + for (ObjectMonitor* n = OrderAccess::load_acquire(&jt->om_in_use_list); n != NULL; n = unmarked_next(n)) { const oop obj = (oop) n->object(); const markWord mark = n->header(); ResourceMark rm; - out->print(INTPTR_FORMAT " " INTPTR_FORMAT " %d%d%d " INTPTR_FORMAT - " %s", p2i(jt), p2i(n), n->is_busy() != 0, - mark.hash() != 0, n->owner() != NULL, p2i(obj), - obj->klass()->external_name()); + out->print(INTPTR_FORMAT " " INTPTR_FORMAT " %d%d%d %7d " + INTPTR_FORMAT " %s", p2i(jt), p2i(n), n->is_busy() != 0, + mark.hash() != 0, n->owner() != NULL, (int)n->ref_count(), + p2i(obj), obj->klass()->external_name()); if (n->is_busy() != 0) { out->print(" (%s)", n->is_busy_to_string(&ss)); ss.reset(); @@ -2138,12 +3226,19 @@ // the population count. 
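// log_monitor_list_counts() below sums the global in-use, free and (when
// HandshakeAfterDeflateIdleMonitors is set) wait list counts plus every
// thread's in-use and free counts, and the result is later compared against
// g_om_population. A hedged sketch of that accounting identity; the Counts
// struct and its field names are illustrative assumptions, not HotSpot code.

struct Counts {
  int g_in_use;        // global in-use list
  int g_free;          // global free list
  int g_wait;          // global wait list (deflated, awaiting handshake)
  int threads_in_use;  // sum of per-thread in-use counts
  int threads_free;    // sum of per-thread free counts
  bool count_wait;     // HandshakeAfterDeflateIdleMonitors
};

static int expected_population(const Counts& c) {
  int pop = c.g_in_use + c.g_free + c.threads_in_use + c.threads_free;
  if (c.count_wait) {
    pop += c.g_wait;   // Wait-listed monitors are still in circulation.
  }
  return pop;          // Should match g_om_population, modulo races.
}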
int ObjectSynchronizer::log_monitor_list_counts(outputStream * out) { int pop_count = 0; - out->print_cr("%18s %10s %10s %10s", - "Global Lists:", "InUse", "Free", "Total"); - out->print_cr("================== ========== ========== =========="); - out->print_cr("%18s %10d %10d %10d", "", - g_om_in_use_count, g_om_free_count, g_om_population); - pop_count += g_om_in_use_count + g_om_free_count; + out->print_cr("%18s %10s %10s %10s %10s", + "Global Lists:", "InUse", "Free", "Wait", "Total"); + out->print_cr("================== ========== ========== ========== =========="); + out->print_cr("%18s %10d %10d %10d %10d", "", + OrderAccess::load_acquire(&g_om_in_use_count), + OrderAccess::load_acquire(&g_om_free_count), + OrderAccess::load_acquire(&g_om_wait_count), + OrderAccess::load_acquire(&g_om_population)); + pop_count += OrderAccess::load_acquire(&g_om_in_use_count) + + OrderAccess::load_acquire(&g_om_free_count); + if (HandshakeAfterDeflateIdleMonitors) { + pop_count += OrderAccess::load_acquire(&g_om_wait_count); + } out->print_cr("%18s %10s %10s %10s", "Per-Thread Lists:", "InUse", "Free", "Provision"); @@ -2151,8 +3246,11 @@ for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) { out->print_cr(INTPTR_FORMAT " %10d %10d %10d", p2i(jt), - jt->om_in_use_count, jt->om_free_count, jt->om_free_provision); - pop_count += jt->om_in_use_count + jt->om_free_count; + OrderAccess::load_acquire(&jt->om_in_use_count), + OrderAccess::load_acquire(&jt->om_free_count), + jt->om_free_provision); + pop_count += OrderAccess::load_acquire(&jt->om_in_use_count) + + OrderAccess::load_acquire(&jt->om_free_count); } return pop_count; } @@ -2174,7 +3272,8 @@ assert((diff % sizeof(PaddedObjectMonitor)) == 0, "must be aligned"); return 1; } - block = (PaddedObjectMonitor*)block->_next_om; + // unmarked_next() is not needed with g_block_list (no next field marking). + block = (PaddedObjectMonitor*)OrderAccess::load_acquire(&block->_next_om); } return 0; }
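// The block walk above decides pool membership by chasing the chain of
// allocation blocks, range-checking the monitor address and verifying that
// the offset is slot-aligned. A hedged, self-contained sketch of that
// containment test on a toy block type; Block, Mon and SLOTS are illustrative
// assumptions, not HotSpot code.

#include <cstdint>

struct Mon { Mon* next = nullptr; };

static const int SLOTS = 128;  // monitors per allocation block

struct Block {
  Mon    slots[SLOTS];  // slot 0 is reserved for block linkage
  Block* next_block = nullptr;
};

static bool in_pool(const Block* head, const Mon* m) {
  for (const Block* b = head; b != nullptr; b = b->next_block) {
    uintptr_t lo = (uintptr_t)&b->slots[1];      // first usable slot
    uintptr_t hi = (uintptr_t)&b->slots[SLOTS];  // one past the last slot
    uintptr_t p  = (uintptr_t)m;
    if (p >= lo && p < hi) {
      // The address must also be slot-aligned within the block.
      return ((p - (uintptr_t)&b->slots[0]) % sizeof(Mon)) == 0;
    }
  }
  return false;
}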