--- old/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2019-11-04 14:57:23.000000000 -0500 +++ new/src/hotspot/cpu/x86/macroAssembler_x86.cpp 2019-11-04 14:57:22.000000000 -0500 @@ -1296,6 +1296,58 @@ #ifdef COMPILER2 +// Increment the ObjectMonitor's ref_count for safety or force a branch +// to 'done' with ICC.ZF=0 to indicate failure/take the slow path. +void MacroAssembler::inc_om_ref_count(Register obj_reg, Register om_reg, Register tmp_reg, Label& done) { + atomic_incl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + + Label LGoSlowPath; + if (AsyncDeflateIdleMonitors) { + // Race here if monitor is not owned! The above ref_count bump + // will cause subsequent async deflation to skip it. However, + // previous or concurrent async deflation is a race. + + // First check: if the owner field == DEFLATER_MARKER: + movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); + // DEFLATER_MARKER == reinterpret_cast(-1) so the compiler + // doesn't like to use the define here: + cmpptr(tmp_reg, -1); + // If marked for async deflation, then take the slow path. This is a + // simpler check than what ObjectMonitorHandle::save_om_ptr() does + // so ObjectMonitor::install_displaced_markword_in_object() doesn't + // have to be implemented in macro assembler. + jccb(Assembler::equal, LGoSlowPath); + + // Second check: if ref_count field <= 0: + movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + cmpptr(tmp_reg, 0); + // If async deflation is in the process of bailing out, but has not + // yet restored the ref_count field, then we take the slow path. We + // want a stable ref_count value for the fast path. + jccb(Assembler::lessEqual, LGoSlowPath); + + // Final check: if object field == obj_reg: + cmpptr(obj_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(object))); + // If the ObjectMonitor has been deflated and recycled, then take + // the slow path. + jccb(Assembler::notEqual, LGoSlowPath); + } + + Label LRetToCaller; + // We leave the ref_count incremented to protect the caller's code + // paths against async deflation. + jmpb(LRetToCaller); + + bind(LGoSlowPath); + lock(); + decrementl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + // Jump to 'done' with ICC.ZF=0 to indicate failure/take the slow path. + orl(tmp_reg, 1); + jmp(done); + + bind(LRetToCaller); +} + #if INCLUDE_RTM_OPT // Update rtm_counters based on abort status @@ -1529,11 +1581,21 @@ assert(UseRTMLocking, "why call this otherwise?"); assert(tmpReg == rax, ""); assert(scrReg == rdx, ""); - Label L_rtm_retry, L_decrement_retry, L_on_abort; + Label L_rtm_retry, L_decrement_retry, L_on_abort, L_local_done; int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); - // Without cast to int32_t a movptr will destroy r10 which is typically obj + // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); + + if (!HandshakeAfterDeflateIdleMonitors) { + // Increment the ObjectMonitor's ref_count for safety or force the + // enter slow path via DONE_LABEL. + // In rtm_inflated_locking(), initially tmpReg contains the object's + // mark word which, in this case, is the (ObjectMonitor* | monitor_value). + // Also this code uses scrReg as its temporary register. + inc_om_ref_count(objReg, tmpReg /* om_reg */, scrReg /* tmp_reg */, DONE_LABEL); + } + movptr(boxReg, tmpReg); // Save ObjectMonitor address if (RTMRetryCount > 0) { @@ -1555,7 +1617,7 @@ movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); movptr(tmpReg, Address(tmpReg, owner_offset)); testptr(tmpReg, tmpReg); - jcc(Assembler::zero, DONE_LABEL); + jcc(Assembler::zero, L_local_done); if (UseRTMXendForLockBusy) { xend(); jmp(L_decrement_retry); @@ -1590,7 +1652,7 @@ if (RTMRetryCount > 0) { // success done else retry - jccb(Assembler::equal, DONE_LABEL) ; + jccb(Assembler::equal, L_local_done); bind(L_decrement_retry); // Spin and retry if lock is busy. rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); @@ -1598,15 +1660,28 @@ else { bind(L_decrement_retry); } + + // rtm_inflated_locking() exit paths come here except for a failed + // inc_om_ref_count() which goes directly to DONE_LABEL. + bind(L_local_done); + if (!HandshakeAfterDeflateIdleMonitors) { + pushf(); // Preserve flags. + // Decrement the ObjectMonitor's ref_count. + lock(); + decrementl(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + popf(); // Restore flags so we have the proper ICC.ZF value. + } + + jmp(DONE_LABEL) ; } #endif // INCLUDE_RTM_OPT -// Fast_Lock and Fast_Unlock used by C2 +// fast_lock and fast_unlock used by C2 // Because the transitions from emitted code to the runtime // monitorenter/exit helper stubs are so slow it's critical that -// we inline both the stack-locking fast-path and the inflated fast path. +// we inline both the stack-locking fast path and the inflated fast path. // // See also: cmpFastLock and cmpFastUnlock. // @@ -1615,7 +1690,7 @@ // option would be to emit TrySlowEnter and TrySlowExit methods // at startup-time. These methods would accept arguments as // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure -// indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply +// indications in the icc.ZFlag. fast_lock and fast_unlock would simply // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. // In practice, however, the # of lock sites is bounded and is usually small. // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer @@ -1634,8 +1709,8 @@ // // TODO: // -// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr). -// This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals. +// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). +// This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. // Given TLAB allocation, Self is usually manifested in a register, so passing it into // the lock operators would typically be faster than reifying Self. // @@ -1661,14 +1736,14 @@ // * use jccb and jmpb instead of jcc and jmp to improve code density. // But beware of excessive branch density on AMD Opterons. // -// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success -// or failure of the fast-path. If the fast-path fails then we pass -// control to the slow-path, typically in C. In Fast_Lock and -// Fast_Unlock we often branch to DONE_LABEL, just to find that C2 +// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success +// or failure of the fast path. If the fast path fails then we pass +// control to the slow path, typically in C. In fast_lock and +// fast_unlock we often branch to DONE_LABEL, just to find that C2 // will emit a conditional branch immediately after the node. // So we have branches to branches and lots of ICC.ZF games. // Instead, it might be better to have C2 pass a "FailureLabel" -// into Fast_Lock and Fast_Unlock. In the case of success, control +// into fast_lock and fast_unlock. In the case of success, control // will drop through the node. ICC.ZF is undefined at exit. // In the case of failure, the node will branch directly to the // FailureLabel @@ -1813,7 +1888,7 @@ movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success - // If the CAS fails we can either retry or pass control to the slow-path. + // If the CAS fails we can either retry or pass control to the slow path. // We use the latter tactic. // Pass the CAS result in the icc.ZFlag into DONE_LABEL // If the CAS was successful ... @@ -1821,17 +1896,35 @@ // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. // Intentional fall-through into DONE_LABEL ... #else // _LP64 - // It's inflated + // It's inflated and we use scrReg for ObjectMonitor* in this section. movq(scrReg, tmpReg); - xorq(tmpReg, tmpReg); - lock(); - cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // Unconditionally set box->_displaced_header = markWord::unused_mark(). - // Without cast to int32_t movptr will destroy r10 which is typically obj. + // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); + + if (!HandshakeAfterDeflateIdleMonitors) { + // Increment the ObjectMonitor's ref_count for safety or force the + // enter slow path via DONE_LABEL. + // In fast_lock(), scrReg contains the object's mark word which, + // in this case, is the (ObjectMonitor* | monitor_value). Also this + // code uses tmpReg as its temporary register. + inc_om_ref_count(objReg, scrReg /* om_reg */, tmpReg /* tmp_reg */, DONE_LABEL); + } + + xorq(tmpReg, tmpReg); + lock(); + cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // Intentional fall-through into DONE_LABEL ... // Propagate ICC.ZF from CAS above into DONE_LABEL. + + if (!HandshakeAfterDeflateIdleMonitors) { + pushf(); // Preserve flags. + // Decrement the ObjectMonitor's ref_count. + lock(); + decrementl(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + popf(); // Restore flags so we have the proper ICC.ZF value. + } #endif // _LP64 #if INCLUDE_RTM_OPT } // use_rtm() @@ -1844,9 +1937,9 @@ bind(DONE_LABEL); // At DONE_LABEL the icc ZFlag is set as follows ... - // Fast_Unlock uses the same protocol. + // fast_unlock uses the same protocol. // ZFlag == 1 -> Success - // ZFlag == 0 -> Failure - force control through the slow-path + // ZFlag == 0 -> Failure - force control through the slow path } // obj: object to unlock @@ -1855,7 +1948,7 @@ // // Some commentary on balanced locking: // -// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. +// fast_lock and fast_unlock are emitted only for provably balanced lock sites. // Methods that don't have provably balanced locking are forced to run in the // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. // The interpreter provides two properties: @@ -1876,7 +1969,7 @@ // should not be unlocked by "normal" java-level locking and vice-versa. The specification // doesn't specify what will occur if a program engages in such mixed-mode locking, however. // Arguably given that the spec legislates the JNI case as undefined our implementation -// could reasonably *avoid* checking owner in Fast_Unlock(). +// could reasonably *avoid* checking owner in fast_unlock(). // In the interest of performance we elide m->Owner==Self check in unlock. // A perfectly viable alternative is to elide the owner check except when // Xcheck:jni is enabled. @@ -1911,7 +2004,7 @@ jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword testptr(tmpReg, markWord::monitor_value); // Inflated? - jccb (Assembler::zero, Stacked); + jcc (Assembler::zero, Stacked); // It's inflated. #if INCLUDE_RTM_OPT @@ -1922,7 +2015,7 @@ testptr(boxReg, boxReg); jccb(Assembler::notZero, L_regular_inflated_unlock); xend(); - jmpb(DONE_LABEL); + jmp(DONE_LABEL); bind(L_regular_inflated_unlock); } #endif @@ -1941,7 +2034,7 @@ // a costly MEMBAR or CAS. See synchronizer.cpp for details on how // we detect and recover from the race that the 1-0 exit admits. // - // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier + // Conceptually fast_unlock() must execute a STST|LDST "release" barrier // before it STs null into _owner, releasing the lock. Updates // to data protected by the critical section must be visible before // we drop the lock (and thus before any other thread could acquire @@ -1984,27 +2077,39 @@ bind (CheckSucc); #else // _LP64 // It's inflated + + if (!HandshakeAfterDeflateIdleMonitors) { + // Increment the ObjectMonitor's ref_count for safety or force the + // exit slow path via DONE_LABEL. + // In fast_unlock(), tmpReg contains the object's mark word which, + // in this case, is the (ObjectMonitor* | monitor_value). Also this + // code uses boxReg as its temporary register. + inc_om_ref_count(objReg, tmpReg /* om_reg */, boxReg /* tmp_reg */, DONE_LABEL); + } + + // Try to avoid passing control into the slow path ... + Label LSuccess, LGoSlowPath; xorptr(boxReg, boxReg); orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); - jccb (Assembler::notZero, DONE_LABEL); + jccb(Assembler::notZero, LGoSlowPath); movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); jccb (Assembler::notZero, CheckSucc); + // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); - jmpb (DONE_LABEL); + jmpb(LSuccess); - // Try to avoid passing control into the slow_path ... - Label LSuccess, LGoSlowPath ; bind (CheckSucc); // The following optional optimization can be elided if necessary - // Effectively: if (succ == null) goto SlowPath + // Effectively: if (succ == null) goto slow path // The code reduces the window for a race, however, // and thus benefits performance. cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); jccb (Assembler::zero, LGoSlowPath); xorptr(boxReg, boxReg); + // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); // Memory barrier/fence @@ -2039,13 +2144,21 @@ // If that didn't work, then another thread grabbed the // lock so we're done (and exit was a success). jccb (Assembler::notEqual, LSuccess); - // Intentional fall-through into slow-path + // Intentional fall-through into slow path bind (LGoSlowPath); + if (!HandshakeAfterDeflateIdleMonitors) { + lock(); + decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + } orl (boxReg, 1); // set ICC.ZF=0 to indicate failure jmpb (DONE_LABEL); bind (LSuccess); + if (!HandshakeAfterDeflateIdleMonitors) { + lock(); + decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count))); + } testl (boxReg, 0); // set ICC.ZF=1 to indicate success jmpb (DONE_LABEL);