--- old/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Mar 9 09:40:57 2015
+++ new/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Mar 9 09:40:57 2015
@@ -1958,6 +1958,11 @@
 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+// Arguably, given that the spec legislates the JNI case as undefined, our implementation
+// could reasonably *avoid* checking the owner in Fast_Unlock().
+// In the interest of performance we elide the m->Owner == Self check in unlock.
+// A perfectly viable alternative is to elide the owner check except when
+// Xcheck:jni is enabled.
 
 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
   assert(boxReg == rax, "");
@@ -1966,24 +1971,6 @@
   if (EmitSync & 4) {
     // Disable - inhibit all inlining.  Force control through the slow-path
     cmpptr (rsp, 0);
-  } else
-  if (EmitSync & 8) {
-    Label DONE_LABEL;
-    if (UseBiasedLocking) {
-      biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-    }
-    // Classic stack-locking code ...
-    // Check whether the displaced header is 0
-    //(=> recursive unlock)
-    movptr(tmpReg, Address(boxReg, 0));
-    testptr(tmpReg, tmpReg);
-    jccb(Assembler::zero, DONE_LABEL);
-    // If not recursive lock, reset the header to displaced header
-    if (os::is_MP()) {
-      lock();
-    }
-    cmpxchgptr(tmpReg, Address(objReg, 0));      // Uses RAX which is box
-    bind(DONE_LABEL);
   } else {
     Label DONE_LABEL, Stacked, CheckSucc;
@@ -2060,9 +2047,9 @@
       // the number of loads below (currently 4) to just 2 or 3.
       // Refer to the comments in synchronizer.cpp.
      // In practice the chain of fetches doesn't seem to impact performance, however.
+      xorptr(boxReg, boxReg);
       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
         // Attempt to reduce branch density - AMD's branch predictor.
-        xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
         orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
         orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
         orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
@@ -2070,7 +2057,6 @@
         movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
         jmpb  (DONE_LABEL);
       } else {
-        xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
         orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
         jccb  (Assembler::notZero, DONE_LABEL);
         movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
@@ -2093,10 +2079,8 @@
       bind (CheckSucc);
 
       // Optional pre-test ... it's safe to elide this
-      if ((EmitSync & 16) == 0) {
-        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
-        jccb  (Assembler::zero, LGoSlowPath);
-      }
+      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
+      jccb(Assembler::zero, LGoSlowPath);
 
       // We have a classic Dekker-style idiom:
       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
@@ -2109,7 +2093,8 @@
       //     In older IA32 processors MFENCE is slower than lock:add or xchg,
       //     particularly if the write-buffer is full, as might be the case
       //     if stores closely precede the fence or fence-equivalent instruction.
-      //     In more modern implementations MFENCE appears faster, however.
+      //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+      //     as the situation has changed with Nehalem and Shanghai.
      // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack.
       //     The $lines underlying the top-of-stack should be in M-state.
       //     The locked add instruction is serializing, of course.
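Aside: the lock:addl idiom discussed above can be rendered as a minimal C++ sketch, assuming GCC-style extended inline asm; the helper name storeload_fence is illustrative and not part of this change.

    #include <atomic>

    // A full StoreLoad barrier implemented as a dummy locked RMW on the
    // top-of-stack, mirroring the lock:addl idiom described above.  The
    // LOCK-prefixed add is serializing and, on the processor generations
    // discussed in the comment, typically cheaper than MFENCE.
    inline void storeload_fence() {
    #if defined(__x86_64__) || defined(__i386__)
      __asm__ volatile("lock; addl $0,0(%%rsp)" : : : "cc", "memory");
    #else
      std::atomic_thread_fence(std::memory_order_seq_cst);  // portable fallback
    #endif
    }

The dummy add targets the top-of-stack because, as the comment notes, those cache lines are almost certainly already in M-state, so the serializing RMW stays core-local.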
@@ -2126,11 +2111,7 @@
 
       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
       if (os::is_MP()) {
-        if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
-          mfence();
-        } else {
-          lock (); addptr(Address(rsp, 0), 0);
-        }
+        lock(); addptr(Address(rsp, 0), 0);
       }
       // Ratify _succ remains non-null
       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
@@ -2179,8 +2160,15 @@
       }
 #else // _LP64
       // It's inflated
-      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
-      xorptr(boxReg, r15_thread);
+      if (EmitSync & 1024) {
+        // Ratify that m->Owner == Self.  We elide this check by default in the
+        // interest of performance; we might consider predicating it on Xcheck:jni
+        // or running on a debug build.
+        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+        xorptr(boxReg, r15_thread);
+      } else {
+        xorptr(boxReg, boxReg);
+      }
       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
       jccb  (Assembler::notZero, DONE_LABEL);
       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
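For orientation, the inflated fast-unlock test emitted above corresponds roughly to the following C++ sketch. Monitor, its fields, and fast_unlock_inflated are illustrative stand-ins for the real ObjectMonitor layout, and check_owner models the EmitSync & 1024 case.

    #include <cstdint>

    // Hypothetical, simplified monitor layout -- a stand-in for ObjectMonitor.
    struct Monitor {
      void*    owner;       // owning thread, or null
      intptr_t recursions;  // recursion count
      void*    EntryList;   // blocked threads
      void*    cxq;         // recently-arrived blocked threads
    };

    // Returns true on fast-path success, false if control must pass to the
    // slow path (or to the CheckSucc succession logic).
    bool fast_unlock_inflated(Monitor* m, void* self, bool check_owner) {
      intptr_t box = check_owner
                       ? ((intptr_t)m->owner ^ (intptr_t)self)  // 0 iff owner == self
                       : 0;                                     // owner check elided
      box |= m->recursions;
      if (box != 0) return false;            // recursive (or foreign) unlock: slow path
      if (((intptr_t)m->cxq | (intptr_t)m->EntryList) == 0) {
        m->owner = nullptr;                  // no waiters: simply drop ownership
        return true;
      }
      return false;                          // waiters exist: CheckSucc / slow path
    }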
@@ -2190,23 +2178,58 @@
       jmpb  (DONE_LABEL);
 
       if ((EmitSync & 65536) == 0) {
+        // Try to avoid passing control into the slow_path ...
         Label LSuccess, LGoSlowPath ;
         bind  (CheckSucc);
+
+        // The following optional optimization can be elided if necessary.
+        // Effectively: if (succ == null) goto SlowPath
+        // The code reduces the window for a race, however,
+        // and thus benefits performance.
         cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
         jccb  (Assembler::zero, LGoSlowPath);
 
-        // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
-        // the explicit ST;MEMBAR combination, but masm doesn't currently support
-        // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
-        // are all faster when the write buffer is populated.
-        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
-        if (os::is_MP()) {
-          lock (); addl (Address(rsp, 0), 0);
+        if ((EmitSync & 16) && os::is_MP()) {
+          orptr(boxReg, boxReg);
+          xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+        } else {
+          movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
+          if (os::is_MP()) {
+            // Memory barrier/fence
+            // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
+            // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
+            // This is faster on Nehalem and AMD Shanghai/Barcelona.
+            // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+            // We might also restructure (ST Owner=0;barrier;LD _Succ) to
+            // (mov box,0; xchgq box, &m->Owner; LD _succ).
+            if (EmitSync & 32) {
+              // An alternative is to use xchgq r15, r15->TSelf, which has bidirectional
+              // barrier semantics but is otherwise a no-op, as r15->TSelf == r15.
+              xchgptr(r15_thread, Address(r15_thread, Thread::TSelf_offset()));
+            } else {
+              lock(); addl(Address(rsp, 0), 0);
+            }
+          }
         }
         cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
         jccb  (Assembler::notZero, LSuccess);
 
-        movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
+        // Rare inopportune interleaving - race.
+        // The successor vanished in the small window above.
+        // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent
+        // successor.  We need to ensure progress and succession.
+        // Try to reacquire the lock.
+        // If that fails, then the new owner is responsible for succession and this
+        // thread needs to take no further action: it can exit via the fast path (success).
+        // If the reacquire succeeds, pass control into the slow path.
+        // As implemented, this latter mode is horrible: we generate more coherence
+        // traffic on the lock *and* artificially extend the critical section by
+        // virtue of passing control into the slow path.
+
+        // box is really RAX -- the following CMPXCHG depends on that binding.
+        // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
+
+        movptr(boxReg, (int32_t)NULL_WORD);                    // box is really RAX
         if (os::is_MP()) { lock(); }
         cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
         jccb  (Assembler::notEqual, LSuccess);
@@ -2231,10 +2254,6 @@
       }
 #endif
       bind(DONE_LABEL);
-      // Avoid branch to branch on AMD processors
-      if (EmitSync & 32768) {
-        nop();
-      }
     }
   }
 #endif // COMPILER2
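The succession protocol described in the final comment block corresponds roughly to this C++ sketch; the names are again stand-ins rather than the real ObjectMonitor fields, and compare_exchange_strong models the emitted CMPXCHG (whose implicit rax binding is noted above).

    #include <atomic>

    // Stand-in monitor fields (illustrative, not the real ObjectMonitor).
    static std::atomic<void*> owner{nullptr};  // owning thread, or null
    static std::atomic<void*> succ{nullptr};   // heir presumptive, possibly stale

    // Returns true if the unlocking thread may exit via the fast path,
    // false if it must enter the slow path to arrange succession.
    static bool release_and_check_succession(void* self) {
      owner.store(nullptr, std::memory_order_release);        // ST Owner = 0
      std::atomic_thread_fence(std::memory_order_seq_cst);    // MEMBAR (the lock:addl)
      if (succ.load(std::memory_order_acquire) != nullptr) {  // LD Succ
        return true;                       // an apparent successor exists: done
      }
      // Rare race: the successor vanished in the window above.
      // Try to reacquire the lock to ensure progress and succession.
      void* expected = nullptr;
      if (!owner.compare_exchange_strong(expected, self)) {   // rax = CAS(M,rax,R)
        return true;   // another thread took the lock; succession is its problem
      }
      return false;    // reacquired: pass control to the slow path to wake a waiter
    }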