src/cpu/x86/vm/x86_32.ad
*** old/src/cpu/x86/vm/x86_32.ad	Thu Feb 13 18:54:11 2014
--- new/src/cpu/x86/vm/x86_32.ad	Thu Feb 13 18:54:11 2014

*** 2916,3461 ****
--- 2916,2925 ----
    emit_opcode(cbuf,0x83);             // SBB hi,0
    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
    emit_d8    (cbuf,0 );
  %}

  // Because the transitions from emitted code to the runtime
  // monitorenter/exit helper stubs are so slow it's critical that
  // we inline both the stack-locking fast-path and the inflated fast path.
  //
  // See also: cmpFastLock and cmpFastUnlock.
  //
  // What follows is a specialized inline transliteration of the code
  // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
  // another option would be to emit TrySlowEnter and TrySlowExit methods
  // at startup-time.  These methods would accept arguments as
  // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
  // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
  // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
  // In practice, however, the # of lock sites is bounded and is usually small.
  // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  // if the processor uses simple bimodal branch predictors keyed by EIP,
  // since the helper routines would be called from multiple synchronization
  // sites.
  //
  // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
  // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
  // to those specialized methods.  That'd give us a mostly platform-independent
  // implementation that the JITs could optimize and inline at their pleasure.
  // Done correctly, the only time we'd need to cross to native code would be
  // to park() or unpark() threads.  We'd also need a few more unsafe operators
  // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  // (b) explicit barriers or fence operations.
  //
  // TODO:
  //
  // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
  //   This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
  //   Given TLAB allocation, Self is usually manifested in a register, so passing it into
  //   the lock operators would typically be faster than reifying Self.
  //
  // * Ideally I'd define the primitives as:
  //     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
  //     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
  //   Unfortunately ADLC bugs prevent us from expressing the ideal form.
  //   Instead, we're stuck with the rather awkward and brittle register assignments below.
  //   Furthermore the register assignments are overconstrained, possibly resulting in
  //   sub-optimal code near the synchronization site.
  //
  // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
  //   Alternately, use a better sp-proximity test.
  //
  // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
  //   Either one is sufficient to uniquely identify a thread.
  //   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
  //
  // * Intrinsify notify() and notifyAll() for the common cases where the
  //   object is locked by the calling thread but the waitlist is empty.
  //   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
  //
  // * Use jccb and jmpb instead of jcc and jmp to improve code density.
  //   But beware of excessive branch density on AMD Opterons.
  //
  // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
  //   or failure of the fast-path.  If the fast-path fails then we pass
  //   control to the slow-path, typically in C.
  //   In Fast_Lock and Fast_Unlock we often branch to DONE_LABEL, just to find that C2
  //   will emit a conditional branch immediately after the node.
  //   So we have branches to branches and lots of ICC.ZF games.
  //   Instead, it might be better to have C2 pass a "FailureLabel"
  //   into Fast_Lock and Fast_Unlock.  In the case of success, control
  //   will drop through the node.  ICC.ZF is undefined at exit.
  //   In the case of failure, the node will branch directly to the
  //   FailureLabel.

  // obj: object to lock
  // box: on-stack box address (displaced header location) - KILLED
  // rax,: tmp -- KILLED
  // scr: tmp -- KILLED
  enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);
    Register scrReg = as_Register($scr$$reg);

    // Ensure the register assignments are disjoint
    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (objReg != scrReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg != scrReg, "") ;
    guarantee (tmpReg == as_Register(EAX_enc), "") ;

    MacroAssembler masm(&cbuf);

    if (_counters != NULL) {
      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
    }
    if (EmitSync & 1) {
        // set box->dhw = unused_mark (3)
        // Force all sync thru slow-path: slow_enter() and slow_exit()
        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
        masm.cmpptr (rsp, (int32_t)0) ;
    } else
    if (EmitSync & 2) {
        Label DONE_LABEL ;
        if (UseBiasedLocking) {
           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
        }

        masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword
        masm.orptr (tmpReg, 0x1);
        masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
        if (os::is_MP()) { masm.lock(); }
        masm.cmpxchgptr(boxReg, Address(objReg, 0));       // Updates tmpReg
        masm.jcc(Assembler::equal, DONE_LABEL);
        // Recursive locking
        masm.subptr(tmpReg, rsp);
        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
        masm.movptr(Address(boxReg, 0), tmpReg);
        masm.bind(DONE_LABEL) ;
    } else {
      // Possible cases that we'll encounter in fast_lock
      // ------------------------------------------------
      // * Inflated
      //    -- unlocked
      //    -- Locked
      //       = by self
      //       = by other
      // * biased
      //    -- by Self
      //    -- by other
      // * neutral
      // * stack-locked
      //    -- by self
      //       = sp-proximity test hits
      //       = sp-proximity test generates false-negative
      //    -- by other
      //
      Label IsInflated, DONE_LABEL, PopDone ;

      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
      // order to reduce the number of conditional branches in the most common cases.
      // Beware -- there's a subtle invariant that fetch of the markword
      // at [FETCH], below, will never observe a biased encoding (*101b).
      // If this invariant is not held we risk exclusion (safety) failure.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
      }

      masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
      masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
      masm.jccb  (Assembler::notZero, IsInflated) ;

      // Attempt stack-locking ...
      masm.orptr (tmpReg, 0x1);
      masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal, ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jccb (Assembler::equal, DONE_LABEL);

      // Recursive locking
      masm.subptr(tmpReg, rsp);
      masm.andptr(tmpReg, 0xFFFFF003 );
      masm.movptr(Address(boxReg, 0), tmpReg);
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal, ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jmp  (DONE_LABEL) ;

      masm.bind (IsInflated) ;

      // The object is inflated.
      //
      // TODO-FIXME: eliminate the ugly use of manifest constants:
      //   Use markOopDesc::monitor_value instead of "2".
      //   Use markOop::unused_mark() instead of "3".
      // The tmpReg value is an objectMonitor reference ORed with
      // markOopDesc::monitor_value (2).  We can either convert tmpReg to an
      // objectmonitor pointer by masking off the "2" bit or we can just
      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
      // field offsets with "-2" to compensate for and annul the low-order tag bit.
      //
      // I use the latter as it avoids AGI stalls.
      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
      //
      // #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)

      // boxReg refers to the on-stack BasicLock in the current frame.
      // We'd like to write:
      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
      // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
      // additional latency as we have another ST in the store buffer that must drain.

      if (EmitSync & 8192) {
         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
         masm.get_thread (scrReg) ;
         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
         masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
      } else
      if ((EmitSync & 128) == 0) {                       // avoid ST-before-CAS
         masm.movptr(scrReg, boxReg) ;
         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form: consider XORL tmpReg,tmpReg
           masm.movptr(tmpReg, NULL_WORD) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           // Test-And-CAS instead of CAS
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Ideally, I'd manifest "Self" with get_thread and then attempt
         // to CAS the register containing Self into m->Owner.
         // But we don't have enough registers, so instead we can either try to CAS
         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
         // we later store "Self" into m->Owner.  Transiently storing a stack address
         // (rsp or the address of the box) into m->owner is harmless.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.movptr(Address(scrReg, 0), 3) ;            // box->_displaced_header = 3
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.get_thread (scrReg) ;                      // beware: clobbers ICCs
         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
         masm.xorptr(boxReg, boxReg) ;                   // set icc.ZFlag = 1 to indicate success

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      } else {
         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
         masm.movptr(boxReg, tmpReg) ;

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form
           masm.xorptr  (tmpReg, tmpReg) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Use either "Self" (in scr) or rsp as thread identity in _owner.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         masm.get_thread (scrReg) ;
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      }

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      masm.bind(DONE_LABEL);

      // Avoid branch-to-branch on AMD processors
      // This appears to be superstition.
      if (EmitSync & 32) masm.nop() ;

      // At DONE_LABEL the icc ZFlag is set as follows ...
      // Fast_Unlock uses the same protocol.
      // ZFlag == 1 -> Success
      // ZFlag == 0 -> Failure - force control through the slow-path
    }
  %}

  // obj: object to unlock
  // box: box address (displaced header location), killed.  Must be EAX.
  // rbx,: killed tmp; cannot be obj nor box.
  //
  // Some commentary on balanced locking:
  //
  // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
  // Methods that don't have provably balanced locking are forced to run in the
  // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
  // The interpreter provides two properties:
  // I1:  At return-time the interpreter automatically and quietly unlocks any
  //      objects acquired in the current activation (frame).  Recall that the
  //      interpreter maintains an on-stack list of locks currently held by
  //      a frame.
  // I2:  If a method attempts to unlock an object that is not held by the
  //      frame the interpreter throws IMSX.
  //
  // Lets say A(), which has provably balanced locking, acquires O and then calls B().
  // B() doesn't have provably balanced locking so it runs in the interpreter.
  // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
  // is still locked by A().
  //
  // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  // doesn't specify what will occur if a program engages in such mixed-mode locking, however.

  enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);

    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg == as_Register(EAX_enc), "") ;

    MacroAssembler masm(&cbuf);

    if (EmitSync & 4) {
      // Disable - inhibit all inlining.  Force control through the slow-path
      masm.cmpptr (rsp, 0) ;
    } else
    if (EmitSync & 8) {
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }
      // classic stack-locking code ...
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      masm.testptr(tmpReg, tmpReg) ;
      masm.jcc   (Assembler::zero, DONE_LABEL) ;
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0));      // Uses EAX which is box
      masm.bind(DONE_LABEL);
    } else {
      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;

      // Critically, the biased locking test must have precedence over
      // and appear before the (box->dhw == 0) recursive stack-lock test.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }

      masm.cmpptr(Address(boxReg, 0), 0) ;              // Examine the displaced header
      masm.movptr(tmpReg, Address(objReg, 0)) ;         // Examine the object's markword
      masm.jccb  (Assembler::zero, DONE_LABEL) ;        // 0 indicates recursive stack-lock

      masm.testptr(tmpReg, 0x02) ;                      // Inflated?
      masm.jccb  (Assembler::zero, Stacked) ;

      masm.bind  (Inflated) ;
      // It's inflated.
      // Despite our balanced locking property we still check that m->_owner == Self
      // as java routines or native JNI code called by this thread might
      // have released the lock.
      // Refer to the comments in synchronizer.cpp for how we might encode extra
      // state in _succ so we can avoid fetching EntryList|cxq.
      //
      // I'd like to add more cases in fast_lock() and fast_unlock() --
      // such as recursive enter and exit -- but we have to be wary of
      // I$ bloat, T$ effects and BP$ effects.
      //
      // If there's no contention try a 1-0 exit.  That is, exit without
      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
      // we detect and recover from the race that the 1-0 exit admits.
      //
      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
      // before it STs null into _owner, releasing the lock.  Updates
      // to data protected by the critical section must be visible before
      // we drop the lock (and thus before any other thread could acquire
      // the lock and observe the fields protected by the lock).
      // IA32's memory-model is SPO, so STs are ordered with respect to
      // each other and there's no need for an explicit barrier (fence).
      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
      masm.get_thread (boxReg) ;
      if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
        // prefetchw [ebx + Offset(_owner)-2]
        masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
      }

      // Note that we could employ various encoding schemes to reduce
      // the number of loads below (currently 4) to just 2 or 3.
      // Refer to the comments in synchronizer.cpp.
      // In practice the chain of fetches doesn't seem to impact performance, however.
      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
         // Attempt to reduce branch density - AMD's branch predictor.
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         masm.jmpb  (DONE_LABEL) ;
      } else {
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, CheckSucc) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         masm.jmpb  (DONE_LABEL) ;
      }

      // The following code fragment (EmitSync & 65536) improves the performance of
      // contended applications and contended synchronization microbenchmarks.
      // Unfortunately the emission of the code - even though not executed - causes regressions
      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
      // with an equal number of never-executed NOPs results in the same regression.
      // We leave it off by default.

      if ((EmitSync & 65536) != 0) {
         Label LSuccess, LGoSlowPath ;

         masm.bind  (CheckSucc) ;

         // Optional pre-test ... it's safe to elide this
         if ((EmitSync & 16) == 0) {
            masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
            masm.jccb  (Assembler::zero, LGoSlowPath) ;
         }

         // We have a classic Dekker-style idiom:
         //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
         // There are a number of ways to implement the barrier:
         // (1) lock:andl &m->_owner, 0
         //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
         //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
         //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
         // (2) If supported, an explicit MFENCE is appealing.
         //     In older IA32 processors MFENCE is slower than lock:add or xchg,
         //     particularly if the write-buffer is full as might be the case
         //     if stores closely precede the fence or fence-equivalent instruction.
         //     In more modern implementations MFENCE appears faster, however.
         // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack.
         //     The $lines underlying the top-of-stack should be in M-state.
         //     The locked add instruction is serializing, of course.
         // (4) Use xchg, which is serializing.
         //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
         // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
         //     The integer condition codes will tell us if succ was 0.
         //     Since _succ and _owner should reside in the same $line and
         //     we just stored into _owner, it's likely that the $line
         //     remains in M-state for the lock:orl.
         //
         // We currently use (3), although it's likely that switching to (2)
         // is correct for the future.

         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         if (os::is_MP()) {
            if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
              masm.mfence();
            } else {
              masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
            }
         }
         // Ratify _succ remains non-null
         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
         masm.jccb  (Assembler::notZero, LSuccess) ;

         masm.xorptr(boxReg, boxReg) ;                   // box is really EAX
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
         masm.jccb  (Assembler::notEqual, LSuccess) ;
         // Since we're low on registers we installed rsp as a placeholder in _owner.
         // Now install Self over rsp.  This is safe as we're transitioning from
         // non-null to non-null.
         masm.get_thread (boxReg) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
         // Intentional fall-through into LGoSlowPath ...

         masm.bind  (LGoSlowPath) ;
         masm.orptr(boxReg, 1) ;                         // set ICC.ZF=0 to indicate failure
         masm.jmpb  (DONE_LABEL) ;

         masm.bind  (LSuccess) ;
         masm.xorptr(boxReg, boxReg) ;                   // set ICC.ZF=1 to indicate success
         masm.jmpb  (DONE_LABEL) ;
      }

      masm.bind (Stacked) ;
      // It's not inflated and it's not recursively stack-locked and it's not biased.
      // It must be stack-locked.
      // Try to reset the header to displaced header.
      // The "box" value on the stack is stable, so we can reload
      // and be assured we observe the same value as above.
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0));       // Uses EAX which is box
      // Intentional fall-thru into DONE_LABEL

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      if ((EmitSync & 65536) == 0) {
         masm.bind (CheckSucc) ;
      }
      masm.bind(DONE_LABEL);

      // Avoid branch to branch on AMD processors
      if (EmitSync & 32768) { masm.nop() ; }
    }
  %}

  enc_class enc_pop_rdx() %{
    emit_opcode(cbuf,0x5A);
  %}

  enc_class enc_rethrow() %{
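Editorial note: the Fast_Lock encoding above implements the protocol its comments describe -- CAS the markword for a stack-lock, fall back to the sp-proximity test for a recursive stack-lock, then CAS the inflated monitor's _owner, reporting the outcome through ICC.ZF. As a reading aid only, here is a minimal C++ sketch of that control flow; every name in it (MarkWord, BasicLock, MonitorStub, fast_lock_sketch, and so on) is a hypothetical stand-in, and none of it is HotSpot code.

// Illustrative sketch only -- not HotSpot code.  All names are hypothetical.
#include <atomic>
#include <cstdint>

typedef intptr_t MarkWord;

struct BasicLock    { MarkWord displaced_header; };      // the on-stack "box"
struct MonitorStub  { std::atomic<intptr_t> owner; };    // stands in for ObjectMonitor
struct ObjectHeader { std::atomic<MarkWord> mark; };     // holds the object's markword

// Returns true on fast-path success; the emitted assembly reports the same
// outcome through ICC.ZF == 1 at DONE_LABEL.
static bool fast_lock_sketch(ObjectHeader* obj, BasicLock* box,
                             intptr_t self, intptr_t sp) {
  MarkWord mark = obj->mark.load(std::memory_order_relaxed);    // [FETCH]
  if ((mark & 0x02) == 0) {                       // neutral or stack-locked
    MarkWord unlocked = mark | 0x1;
    box->displaced_header = unlocked;             // anticipate a successful CAS
    MarkWord expected = unlocked;
    if (obj->mark.compare_exchange_strong(expected, (MarkWord)(intptr_t)box))
      return true;                                // stack-lock acquired
    // 'expected' now holds the markword the CAS found; if it points into our
    // own stack frame this is a recursive stack-lock and the box gets 0.
    MarkWord delta = (expected - sp) & (MarkWord)0xFFFFF003;
    box->displaced_header = delta;
    return delta == 0;
  }
  // Inflated: try to swing _owner from NULL to a thread identity.  The real
  // encoding may first install rsp or the box address and only later store
  // Self; the sketch collapses that into a single CAS.
  MonitorStub* m = (MonitorStub*)(mark - 2);      // strip the monitor tag bit
  intptr_t unowned = 0;
  return m->owner.compare_exchange_strong(unowned, self);
}

A false return corresponds to leaving ZF == 0, i.e. the caller must take the slow path, exactly as the emitted code does.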
*** 13155,13181 ****
--- 12619,12648 ----
    ins_pipe( pipe_jmp );
  %}

  // inlined locking and unlocking
    instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
!   match( Set cr (FastLock object box) );
    effect( TEMP tmp, TEMP scr, USE_KILL box );
+ instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
+   match(Set cr (FastLock object box));
!   effect(TEMP tmp, TEMP scr, USE_KILL box);
    ins_cost(300);
    format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
!   ins_encode( Fast_Lock(object,box,tmp,scr) );
!   ins_pipe( pipe_slow );
!   ins_encode %{
!     __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
+   %}
+   ins_pipe(pipe_slow);
  %}

-   instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
!   match( Set cr (FastUnlock object box) );
!   effect( TEMP tmp, USE_KILL box );
!   effect(TEMP tmp, USE_KILL box);
    ins_cost(300);
    format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
!   ins_encode( Fast_Unlock(object,box,tmp) );
!   ins_pipe( pipe_slow );
!   ins_encode %{
!     __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
+   %}
+   ins_pipe(pipe_slow);
  %}

  // ============================================================================
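Editorial note: the new ins_encode blocks above delegate to `__ fast_lock(...)` and `__ fast_unlock(...)`, i.e. to fast_lock/fast_unlock methods on the macro assembler, where the removed enc_class bodies presumably now live. To complement the locking sketch earlier, here is a minimal C++ sketch of the default unlock path the removed Fast_Unlock encoding emitted: recursive stack-lock, plain stack-lock, and the uncontended 1-0 monitor exit. Biased locking and the EmitSync experiment paths are omitted, and every name is a hypothetical stand-in rather than HotSpot code.

// Illustrative sketch only -- not HotSpot code.  All names are hypothetical.
#include <atomic>
#include <cstdint>

struct MonitorFields {                        // stand-in for the ObjectMonitor fields
  std::atomic<intptr_t> owner;                // that the Fast_Unlock encoding examines
  intptr_t              recursions;
  void*                 EntryList;
  void*                 cxq;
};

// Returns true on fast-path success (ICC.ZF == 1 at DONE_LABEL in the
// emitted code); false means control passes to the slow path.
static bool fast_unlock_sketch(std::atomic<intptr_t>* obj_mark,
                               intptr_t* box_displaced_header,
                               intptr_t box_addr, intptr_t self) {
  if (*box_displaced_header == 0)
    return true;                              // recursive stack-lock: nothing to undo
  intptr_t mark = obj_mark->load(std::memory_order_relaxed);
  if ((mark & 0x02) == 0) {                   // plain stack-lock
    intptr_t expected = box_addr;             // header should still point at our box
    return obj_mark->compare_exchange_strong(expected, *box_displaced_header);
  }
  MonitorFields* m = (MonitorFields*)(mark - 2);   // strip the monitor tag bit
  if (m->owner.load(std::memory_order_relaxed) != self || m->recursions != 0)
    return false;                             // not a simple self-owned monitor
  if (m->EntryList != nullptr || m->cxq != nullptr)
    return false;                             // waiters queued: slow path must wake one
  // Uncontended 1-0 exit: drop the lock with a plain store of NULL into
  // _owner; on IA32 earlier stores in the critical section cannot pass it.
  m->owner.store(0, std::memory_order_release);
  return true;
}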
