< prev index next >
src/hotspot/cpu/x86/macroAssembler_x86.cpp
Print this page
rev 51633 : imported patch 8210381
@@ -1719,16 +1719,11 @@
}
if (counters != NULL) {
atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
}
- if (EmitSync & 1) {
- // set box->dhw = markOopDesc::unused_mark()
- // Force all sync thru slow-path: slow_enter() and slow_exit()
- movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
- cmpptr (rsp, (int32_t)NULL_WORD);
- } else {
+
// Possible cases that we'll encounter in fast_lock
// ------------------------------------------------
// * Inflated
// -- unlocked
// -- Locked
@@ -1813,41 +1808,17 @@
// We'd like to write:
// set box->_displaced_header = markOopDesc::unused_mark(). Any non-0 value suffices.
// This is convenient but results a ST-before-CAS penalty. The following CAS suffers
// additional latency as we have another ST in the store buffer that must drain.
- if (EmitSync & 8192) {
- movptr(Address(boxReg, 0), 3); // results in ST-before-CAS penalty
- get_thread (scrReg);
- movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
- movptr(tmpReg, NULL_WORD); // consider: xor vs mov
- if (os::is_MP()) {
- lock();
- }
- cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- } else
- if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
+ // avoid ST-before-CAS
// register juggle because we need tmpReg for cmpxchgptr below
movptr(scrReg, boxReg);
movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- }
-
- if ((EmitSync & 64) == 0) {
// Optimistic form: consider XORL tmpReg,tmpReg
movptr(tmpReg, NULL_WORD);
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- // Test-And-CAS instead of CAS
- movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // rax, = m->_owner
- testptr(tmpReg, tmpReg); // Locked ?
- jccb (Assembler::notZero, DONE_LABEL);
- }
// Appears unlocked - try to swing _owner from null to non-null.
// Ideally, I'd manifest "Self" with get_thread and then attempt
// to CAS the register containing Self into m->Owner.
// But we don't have enough registers, so instead we can either try to CAS
@@ -1873,47 +1844,10 @@
// Pass the CAS result in the icc.ZFlag into DONE_LABEL
// If the CAS was successful ...
// Self has acquired the lock
// Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
// Intentional fall-through into DONE_LABEL ...
- } else {
- movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())); // results in ST-before-CAS penalty
- movptr(boxReg, tmpReg);
-
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- }
-
- if ((EmitSync & 64) == 0) {
- // Optimistic form
- xorptr (tmpReg, tmpReg);
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // rax, = m->_owner
- testptr(tmpReg, tmpReg); // Locked ?
- jccb (Assembler::notZero, DONE_LABEL);
- }
-
- // Appears unlocked - try to swing _owner from null to non-null.
- // Use either "Self" (in scr) or rsp as thread identity in _owner.
- // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
- get_thread (scrReg);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
-
- // If the CAS fails we can either retry or pass control to the slow-path.
- // We use the latter tactic.
- // Pass the CAS result in the icc.ZFlag into DONE_LABEL
- // If the CAS was successful ...
- // Self has acquired the lock
- // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
- // Intentional fall-through into DONE_LABEL ...
- }
#else // _LP64
// It's inflated
movq(scrReg, tmpReg);
xorq(tmpReg, tmpReg);
@@ -1939,11 +1873,10 @@
// At DONE_LABEL the icc ZFlag is set as follows ...
// Fast_Unlock uses the same protocol.
// ZFlag == 1 -> Success
// ZFlag == 0 -> Failure - force control through the slow-path
- }
}
// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
@@ -1978,14 +1911,10 @@
void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
assert(boxReg == rax, "");
assert_different_registers(objReg, boxReg, tmpReg);
- if (EmitSync & 4) {
- // Disable - inhibit all inlining. Force control through the slow-path
- cmpptr (rsp, 0);
- } else {
Label DONE_LABEL, Stacked, CheckSucc;
// Critically, the biased locking test must have precedence over
// and appear before the (box->dhw == 0) recursive stack-lock test.
if (UseBiasedLocking && !UseOptoBiasInlining) {
@@ -2048,111 +1977,23 @@
// IA32's memory-model is SPO, so STs are ordered with respect to
// each other and there's no need for an explicit barrier (fence).
// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
get_thread (boxReg);
- if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [ebx + Offset(_owner)-2]
- prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- }
// Note that we could employ various encoding schemes to reduce
// the number of loads below (currently 4) to just 2 or 3.
// Refer to the comments in synchronizer.cpp.
// In practice the chain of fetches doesn't seem to impact performance, however.
xorptr(boxReg, boxReg);
- if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
- // Attempt to reduce branch density - AMD's branch predictor.
- orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
- orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
- orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
- jccb (Assembler::notZero, DONE_LABEL);
- movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
- jmpb (DONE_LABEL);
- } else {
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
jccb (Assembler::notZero, DONE_LABEL);
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
jccb (Assembler::notZero, CheckSucc);
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
jmpb (DONE_LABEL);
- }
-
- // The Following code fragment (EmitSync & 65536) improves the performance of
- // contended applications and contended synchronization microbenchmarks.
- // Unfortunately the emission of the code - even though not executed - causes regressions
- // in scimark and jetstream, evidently because of $ effects. Replacing the code
- // with an equal number of never-executed NOPs results in the same regression.
- // We leave it off by default.
-
- if ((EmitSync & 65536) != 0) {
- Label LSuccess, LGoSlowPath ;
-
- bind (CheckSucc);
-
- // Optional pre-test ... it's safe to elide this
- cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
- jccb(Assembler::zero, LGoSlowPath);
-
- // We have a classic Dekker-style idiom:
- // ST m->_owner = 0 ; MEMBAR; LD m->_succ
- // There are a number of ways to implement the barrier:
- // (1) lock:andl &m->_owner, 0
- // is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
- // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
- // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
- // (2) If supported, an explicit MFENCE is appealing.
- // In older IA32 processors MFENCE is slower than lock:add or xchg
- // particularly if the write-buffer is full as might be the case if
- // if stores closely precede the fence or fence-equivalent instruction.
- // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
- // as the situation has changed with Nehalem and Shanghai.
- // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
- // The $lines underlying the top-of-stack should be in M-state.
- // The locked add instruction is serializing, of course.
- // (4) Use xchg, which is serializing
- // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
- // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
- // The integer condition codes will tell us if succ was 0.
- // Since _succ and _owner should reside in the same $line and
- // we just stored into _owner, it's likely that the $line
- // remains in M-state for the lock:orl.
- //
- // We currently use (3), although it's likely that switching to (2)
- // is correct for the future.
-
- movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
- if (os::is_MP()) {
- lock(); addptr(Address(rsp, 0), 0);
- }
- // Ratify _succ remains non-null
- cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
- jccb (Assembler::notZero, LSuccess);
-
- xorptr(boxReg, boxReg); // box is really EAX
- if (os::is_MP()) { lock(); }
- cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- // There's no successor so we tried to regrab the lock with the
- // placeholder value. If that didn't work, then another thread
- // grabbed the lock so we're done (and exit was a success).
- jccb (Assembler::notEqual, LSuccess);
- // Since we're low on registers we installed rsp as a placeholding in _owner.
- // Now install Self over rsp. This is safe as we're transitioning from
- // non-null to non=null
- get_thread (boxReg);
- movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
- // Intentional fall-through into LGoSlowPath ...
-
- bind (LGoSlowPath);
- orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure
- jmpb (DONE_LABEL);
-
- bind (LSuccess);
- xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success
- jmpb (DONE_LABEL);
- }
bind (Stacked);
// It's not inflated and it's not recursively stack-locked and it's not biased.
// It must be stack-locked.
// Try to reset the header to displaced header.
@@ -2168,35 +2009,22 @@
// DONE_LABEL is a hot target - we'd really like to place it at the
// start of cache line by padding with NOPs.
// See the AMD and Intel software optimization manuals for the
// most efficient "long" NOP encodings.
// Unfortunately none of our alignment mechanisms suffice.
- if ((EmitSync & 65536) == 0) {
bind (CheckSucc);
- }
#else // _LP64
// It's inflated
- if (EmitSync & 1024) {
- // Emit code to check that _owner == Self
- // We could fold the _owner test into subsequent code more efficiently
- // than using a stand-alone check, but since _owner checking is off by
- // default we don't bother. We also might consider predicating the
- // _owner==Self check on Xcheck:jni or running on a debug build.
- movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- xorptr(boxReg, r15_thread);
- } else {
xorptr(boxReg, boxReg);
- }
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
jccb (Assembler::notZero, DONE_LABEL);
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
jccb (Assembler::notZero, CheckSucc);
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
jmpb (DONE_LABEL);
- if ((EmitSync & 65536) == 0) {
// Try to avoid passing control into the slow_path ...
Label LSuccess, LGoSlowPath ;
bind (CheckSucc);
// The following optional optimization can be elided if necessary
@@ -2205,13 +2033,10 @@
// and thus benefits performance.
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
jccb (Assembler::zero, LGoSlowPath);
xorptr(boxReg, boxReg);
- if ((EmitSync & 16) && os::is_MP()) {
- xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
- } else {
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
if (os::is_MP()) {
// Memory barrier/fence
// Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
// Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
@@ -2219,11 +2044,10 @@
// See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
// We might also restructure (ST Owner=0;barrier;LD _Succ) to
// (mov box,0; xchgq box, &m->Owner; LD _succ) .
lock(); addl(Address(rsp, 0), 0);
}
- }
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
jccb (Assembler::notZero, LSuccess);
// Rare inopportune interleaving - race.
// The successor vanished in the small window above.
@@ -2252,23 +2076,18 @@
jmpb (DONE_LABEL);
bind (LSuccess);
testl (boxReg, 0); // set ICC.ZF=1 to indicate success
jmpb (DONE_LABEL);
- }
bind (Stacked);
movptr(tmpReg, Address (boxReg, 0)); // re-fetch
if (os::is_MP()) { lock(); }
cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
- if (EmitSync & 65536) {
- bind (CheckSucc);
- }
#endif
bind(DONE_LABEL);
- }
}
#endif // COMPILER2
void MacroAssembler::c2bool(Register x) {
// implements x == 0 ? 0 : 1
< prev index next >