src/cpu/x86/vm/macroAssembler_x86.cpp

1941 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1942 // Methods that don't have provably balanced locking are forced to run in the
1943 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1944 // The interpreter provides two properties:
1945 // I1:  At return-time the interpreter automatically and quietly unlocks any
1946 //      objects acquired by the current activation (frame).  Recall that the
1947 //      interpreter maintains an on-stack list of locks currently held by
1948 //      a frame.
1949 // I2:  If a method attempts to unlock an object that is not held by the
1950 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
1951 //
1952 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1953 // B() doesn't have provably balanced locking so it runs in the interpreter.
1954 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1955 // is still locked by A().
1956 //
1957 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1958 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1959 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1960 // doesn't define what happens if a program engages in such mixed-mode locking, however.
1961 
1962 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1963   assert(boxReg == rax, "");
1964   assert_different_registers(objReg, boxReg, tmpReg);
1965 
1966   if (EmitSync & 4) {
1967     // Disable - inhibit all inlining.  Force control through the slow-path
1968     cmpptr (rsp, 0);
1969   } else
1970   if (EmitSync & 8) {
1971     Label DONE_LABEL;
1972     if (UseBiasedLocking) {
1973        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1974     }
1975     // Classic stack-locking code ...
1976     // Check whether the displaced header is 0
1977     // (=> recursive unlock)
1978     movptr(tmpReg, Address(boxReg, 0));
1979     testptr(tmpReg, tmpReg);
1980     jccb(Assembler::zero, DONE_LABEL);
1981     // If not recursive lock, reset the header to displaced header
1982     if (os::is_MP()) {
1983       lock();
1984     }
1985     cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
1986     bind(DONE_LABEL);
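    // A minimal sketch of the matching stack-lock *acquire* path, modeled on
    // fast_lock (illustrative only -- fast_lock itself is not part of this hunk):
    //
    //   movptr(tmpReg, Address(objReg, 0));          // fetch markword
    //   orptr (tmpReg, markOopDesc::unlocked_value); // force the unlocked bits (001)
    //   movptr(Address(boxReg, 0), tmpReg);          // save displaced header in the box
    //   if (os::is_MP()) { lock(); }
    //   cmpxchgptr(boxReg, Address(objReg, 0));      // try to install the box as the mark
    //   jcc(Assembler::equal, DONE_LABEL);           // CAS succeeded -> lock acquired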
1987   } else {
1988     Label DONE_LABEL, Stacked, CheckSucc;
1989 
1990     // Critically, the biased locking test must have precedence over
1991     // and appear before the (box->dhw == 0) recursive stack-lock test.
1992     if (UseBiasedLocking && !UseOptoBiasInlining) {
1993        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1994     }
1995 
1996 #if INCLUDE_RTM_OPT
1997     if (UseRTMForStackLocks && use_rtm) {
1998       assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1999       Label L_regular_unlock;
2000       movptr(tmpReg, Address(objReg, 0));           // fetch markword
2001       andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2002       cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
2003       jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
2004       xend();                                       // otherwise end...
2005       jmp(DONE_LABEL);                              // ... and we're done
2006       bind(L_regular_unlock);


2043     //
2044     // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2045     // before it STs null into _owner, releasing the lock.  Updates
2046     // to data protected by the critical section must be visible before
2047     // we drop the lock (and thus before any other thread could acquire
2048     // the lock and observe the fields protected by the lock).
2049     // IA32's memory-model is TSO (total store order), so STs are ordered with respect to
2050     // each other and there's no need for an explicit barrier (fence).
2051     // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2052 #ifndef _LP64
2053     get_thread (boxReg);
2054     if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2055       // prefetchw [ebx + Offset(_owner)-2]
2056       prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2057     }
2058 
2059     // Note that we could employ various encoding schemes to reduce
2060     // the number of loads below (currently 4) to just 2 or 3.
2061     // Refer to the comments in synchronizer.cpp.
2062     // In practice the chain of fetches doesn't seem to impact performance, however.

2063     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2064        // Attempt to reduce branch density to spare AMD's branch predictor.
2065        xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2066        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2067        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2068        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2069        jccb  (Assembler::notZero, DONE_LABEL);
2070        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2071        jmpb  (DONE_LABEL);
2072     } else {
2073        xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2074        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2075        jccb  (Assembler::notZero, DONE_LABEL);
2076        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2077        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2078        jccb  (Assembler::notZero, CheckSucc);
2079        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2080        jmpb  (DONE_LABEL);
2081     }
2082 
2083     // The following code fragment (EmitSync & 65536) improves the performance of
2084     // contended applications and contended synchronization microbenchmarks.
2085     // Unfortunately the emission of the code - even though not executed - causes regressions
2086     // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2087     // with an equal number of never-executed NOPs results in the same regression.
2088     // We leave it off by default.
2089 
2090     if ((EmitSync & 65536) != 0) {
2091        Label LSuccess, LGoSlowPath ;
2092 
2093        bind  (CheckSucc);
2094 
2095        // Optional pre-test ... it's safe to elide this
2096        if ((EmitSync & 16) == 0) {
2097           cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2098           jccb  (Assembler::zero, LGoSlowPath);
2099        }
2100 
2101        // We have a classic Dekker-style idiom:
2102        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2103        // There are a number of ways to implement the barrier:
2104        // (1) lock:andl &m->_owner, 0
2105        //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2106        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2107        //     Encodes as 81 A3 OFF32 IMM32 or 83 63 OFF8 IMM8
2108        // (2) If supported, an explicit MFENCE is appealing.
2109        //     In older IA32 processors MFENCE is slower than lock:add or xchg
2110        //     particularly if the write-buffer is full as might be the case if
2111        //     stores closely precede the fence or fence-equivalent instruction.
2112        //     In more modern implementations MFENCE appears faster, however.

2113        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2114        //     The $lines underlying the top-of-stack should be in M-state.
2115        //     The locked add instruction is serializing, of course.
2116        // (4) Use xchg, which is serializing
2117        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2118        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2119        //     The integer condition codes will tell us if succ was 0.
2120        //     Since _succ and _owner should reside in the same $line and
2121        //     we just stored into _owner, it's likely that the $line
2122        //     remains in M-state for the lock:orl.
2123        //
2124        // We currently use (3), although it's likely that switching to (2)
2125        // is correct for the future.
2126 
2127        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2128        if (os::is_MP()) {
2129           if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
2130             mfence();
2131           } else {
2132             lock (); addptr(Address(rsp, 0), 0);
2133           }
2134        }
2135        // Ratify _succ remains non-null
2136        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2137        jccb  (Assembler::notZero, LSuccess);
2138 
2139        xorptr(boxReg, boxReg);                  // box is really EAX
2140        if (os::is_MP()) { lock(); }
2141        cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2142        jccb  (Assembler::notEqual, LSuccess);
2143        // Since we're low on registers we installed rsp as a placeholder in _owner.
2144        // Now install Self over rsp.  This is safe as we're transitioning from
2145        // non-null to non-null
2146        get_thread (boxReg);
2147        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2148        // Intentional fall-through into LGoSlowPath ...
2149 
2150        bind  (LGoSlowPath);
2151        orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2152        jmpb  (DONE_LABEL);
2153 
2154        bind  (LSuccess);


2162     // Try to reset the header to displaced header.
2163     // The "box" value on the stack is stable, so we can reload
2164     // and be assured we observe the same value as above.
2165     movptr(tmpReg, Address(boxReg, 0));
2166     if (os::is_MP()) {
2167       lock();
2168     }
2169     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2170     // Intentional fall-through into DONE_LABEL
2171 
2172     // DONE_LABEL is a hot target - we'd really like to place it at the
2173     // start of a cache line by padding with NOPs.
2174     // See the AMD and Intel software optimization manuals for the
2175     // most efficient "long" NOP encodings.
2176     // Unfortunately none of our alignment mechanisms suffice.
2177     if ((EmitSync & 65536) == 0) {
2178        bind (CheckSucc);
2179     }
2180 #else // _LP64
2181     // It's inflated
2182     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2183     xorptr(boxReg, r15_thread);
2184     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2185     jccb  (Assembler::notZero, DONE_LABEL);
2186     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2187     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2188     jccb  (Assembler::notZero, CheckSucc);
2189     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2190     jmpb  (DONE_LABEL);
2191 
2192     if ((EmitSync & 65536) == 0) {

2193       Label LSuccess, LGoSlowPath ;
2194       bind  (CheckSucc);
2195       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2196       jccb  (Assembler::zero, LGoSlowPath);
2197 
2198       // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
2199       // explicit ST;MEMBAR combination, but masm doesn't currently support
2200       // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
2201       // are all faster when the write buffer is populated.
2202       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2203       if (os::is_MP()) {
2204          lock (); addl (Address(rsp, 0), 0);
2205       }


2206       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2207       jccb  (Assembler::notZero, LSuccess);
2208 
2209       movptr (boxReg, (int32_t)NULL_WORD);                   // box is really RAX
2210       if (os::is_MP()) { lock(); }
2211       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2212       jccb  (Assembler::notEqual, LSuccess);
2213       // Intentional fall-through into slow-path
2214 
2215       bind  (LGoSlowPath);
2216       orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2217       jmpb  (DONE_LABEL);
2218 
2219       bind  (LSuccess);
2220       testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2221       jmpb  (DONE_LABEL);
2222     }
2223 
2224     bind  (Stacked);
2225     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2226     if (os::is_MP()) { lock(); }
2227     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2228 
2229     if (EmitSync & 65536) {
2230        bind (CheckSucc);
2231     }
2232 #endif
2233     bind(DONE_LABEL);
2234     // Avoid branch to branch on AMD processors
2235     if (EmitSync & 32768) {
2236        nop();
2237     }
2238   }
2239 }
2240 #endif // COMPILER2
2241 
2242 void MacroAssembler::c2bool(Register x) {
2243   // implements x == 0 ? 0 : 1
2244   // note: must only look at the least-significant byte of x
2245   //       since C-style booleans are stored in one byte
2246   //       only! (was bug)
2247   andl(x, 0xFF);
2248   setb(Assembler::notZero, x);
2249 }
2250 
2251 // Wouldn't be needed if the AddressLiteral version had a new name
2252 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2253   Assembler::call(L, rtype);
2254 }
2255 
2256 void MacroAssembler::call(Register entry) {
2257   Assembler::call(entry);
2258 }

1941 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1942 // Methods that don't have provably balanced locking are forced to run in the
1943 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1944 // The interpreter provides two properties:
1945 // I1:  At return-time the interpreter automatically and quietly unlocks any
1946 //      objects acquired by the current activation (frame).  Recall that the
1947 //      interpreter maintains an on-stack list of locks currently held by
1948 //      a frame.
1949 // I2:  If a method attempts to unlock an object that is not held by the
1950 //      frame, the interpreter throws IMSX (IllegalMonitorStateException).
1951 //
1952 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1953 // B() doesn't have provably balanced locking so it runs in the interpreter.
1954 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1955 // is still locked by A().
1956 //
1957 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1958 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1959 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1960 // doesn't define what happens if a program engages in such mixed-mode locking, however.
1961 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1962 // could reasonably *avoid* checking the owner in Fast_Unlock().
1963 // In the interest of performance we elide the m->Owner==Self check in unlock.
1964 // A perfectly viable alternative is to elide the owner check except when
1965 // Xcheck:jni is enabled.
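// A sketch of what the Xcheck:jni-predicated owner check could look like
// (64-bit flavored; CheckJNICalls is the flag behind -Xcheck:jni; this is an
// illustration, not code emitted below):
//
//   if (CheckJNICalls) {
//     // ratify that the exiting thread really owns the monitor
//     cmpptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
//     jcc(Assembler::notEqual, DONE_LABEL);  // ZF=0 -> slow path, which throws IMSX
//   }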
1966 
1967 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1968   assert(boxReg == rax, "");
1969   assert_different_registers(objReg, boxReg, tmpReg);
1970 
1971   if (EmitSync & 4) {
1972     // Disable - inhibit all inlining.  Force control through the slow-path
1973     cmpptr (rsp, 0);
1974   } else {
1975     Label DONE_LABEL, Stacked, CheckSucc;
1976 
1977     // Critically, the biased locking test must have precedence over
1978     // and appear before the (box->dhw == 0) recursive stack-lock test.
1979     if (UseBiasedLocking && !UseOptoBiasInlining) {
1980        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1981     }
1982 
1983 #if INCLUDE_RTM_OPT
1984     if (UseRTMForStackLocks && use_rtm) {
1985       assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1986       Label L_regular_unlock;
1987       movptr(tmpReg, Address(objReg, 0));           // fetch markword
1988       andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1989       cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1990       jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1991       xend();                                       // otherwise end...
1992       jmp(DONE_LABEL);                              // ... and we're done
1993       bind(L_regular_unlock);
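      // For reference, the RTM stack-lock *entry* is roughly the mirror image
      // (a sketch assuming the Assembler's xbegin/xabort; the real emission
      // lives in rtm_stack_locking, which is not shown in this hunk):
      //
      //   Label L_on_abort;
      //   xbegin(L_on_abort);                     // start transactional region
      //   movptr(tmpReg, Address(objReg, 0));     // re-check markword inside the txn
      //   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place);
      //   cmpptr(tmpReg, markOopDesc::unlocked_value);
      //   jcc(Assembler::equal, DONE_LABEL);      // still 001 -> lock elided
      //   xabort(0);                              // held by someone: abort, fall back
      //   bind(L_on_abort);                       // abort handler -> regular locking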


2030     //
2031     // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2032     // before it STs null into _owner, releasing the lock.  Updates
2033     // to data protected by the critical section must be visible before
2034     // we drop the lock (and thus before any other thread could acquire
2035     // the lock and observe the fields protected by the lock).
2036     // IA32's memory-model is TSO (total store order), so STs are ordered with respect to
2037     // each other and there's no need for an explicit barrier (fence).
2038     // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
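    // In cookbook terms, a conservative port would write the release as
    //   membar(Assembler::Membar_mask_bits(Assembler::LoadStore | Assembler::StoreStore));
    //   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    // On IA32/AMD64 that membar folds away to nothing, which is why no fence is
    // emitted before the owner-clearing store below; only the later Dekker pivot
    // (ST _owner; MEMBAR; LD _succ) pays for a real fence.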
2039 #ifndef _LP64
2040     get_thread (boxReg);
2041     if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2042       // prefetchw [ebx + Offset(_owner)-2]
2043       prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2044     }
2045 
2046     // Note that we could employ various encoding schemes to reduce
2047     // the number of loads below (currently 4) to just 2 or 3.
2048     // Refer to the comments in synchronizer.cpp.
2049     // In practice the chain of fetches doesn't seem to impact performance, however.
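    // The exit tests below encode the invariant: it is safe to simply clear
    // _owner iff _recursions == 0 and (_cxq | _EntryList) == 0, i.e. no
    // recursive hold and no queued threads; otherwise we must either verify
    // that a successor exists (CheckSucc) or take the slow path.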
2050     xorptr(boxReg, boxReg);
2051     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2052        // Attempt to reduce branch density to spare AMD's branch predictor.

2053        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2054        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2055        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2056        jccb  (Assembler::notZero, DONE_LABEL);
2057        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2058        jmpb  (DONE_LABEL);
2059     } else {

2060        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2061        jccb  (Assembler::notZero, DONE_LABEL);
2062        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2063        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2064        jccb  (Assembler::notZero, CheckSucc);
2065        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2066        jmpb  (DONE_LABEL);
2067     }
2068 
2069     // The following code fragment (EmitSync & 65536) improves the performance of
2070     // contended applications and contended synchronization microbenchmarks.
2071     // Unfortunately the emission of the code - even though not executed - causes regressions
2072     // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2073     // with an equal number of never-executed NOPs results in the same regression.
2074     // We leave it off by default.
2075 
2076     if ((EmitSync & 65536) != 0) {
2077        Label LSuccess, LGoSlowPath ;
2078 
2079        bind  (CheckSucc);
2080 
2081        // Optional pre-test ... it's safe to elide this

2082        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2083        jccb(Assembler::zero, LGoSlowPath);

2084 
2085        // We have a classic Dekker-style idiom:
2086        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2087        // There are a number of ways to implement the barrier:
2088        // (1) lock:andl &m->_owner, 0
2089        //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2090        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2091        //     Encodes as 81 A3 OFF32 IMM32 or 83 63 OFF8 IMM8
2092        // (2) If supported, an explicit MFENCE is appealing.
2093        //     In older IA32 processors MFENCE is slower than lock:add or xchg
2094        //     particularly if the write-buffer is full as might be the case if
2095        //     stores closely precede the fence or fence-equivalent instruction.
2096        //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2097        //     as the situation has changed with Nehalem and Shanghai.
2098        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2099        //     The $lines underlying the top-of-stack should be in M-state.
2100        //     The locked add instruction is serializing, of course.
2101        // (4) Use xchg, which is serializing
2102        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2103        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2104        //     The integer condition codes will tell us if succ was 0.
2105        //     Since _succ and _owner should reside in the same $line and
2106        //     we just stored into _owner, it's likely that the $line
2107        //     remains in M-state for the lock:orl.
2108        //
2109        // We currently use (3), although it's likely that switching to (2)
2110        // is correct for the future.
2111 
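       // Option (5) would be emitted roughly as follows (a sketch only; the
       // code below uses option (3) instead):
       //
       //   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
       //   lock(); orl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
       //   jccb(Assembler::notZero, LSuccess);   // ZF now reflects whether _succ was null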
2112        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2113        if (os::is_MP()) {
2114          lock(); addptr(Address(rsp, 0), 0);
2115        }

2116        // Ratify _succ remains non-null
2117        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2118        jccb  (Assembler::notZero, LSuccess);
2119 
2120        xorptr(boxReg, boxReg);                  // box is really EAX
2121        if (os::is_MP()) { lock(); }
2122        cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2123        jccb  (Assembler::notEqual, LSuccess);
2124        // Since we're low on registers we installed rsp as a placeholder in _owner.
2125        // Now install Self over rsp.  This is safe as we're transitioning from
2126        // non-null to non-null
2127        get_thread (boxReg);
2128        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2129        // Intentional fall-through into LGoSlowPath ...
2130 
2131        bind  (LGoSlowPath);
2132        orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2133        jmpb  (DONE_LABEL);
2134 
2135        bind  (LSuccess);


2143     // Try to reset the header to displaced header.
2144     // The "box" value on the stack is stable, so we can reload
2145     // and be assured we observe the same value as above.
2146     movptr(tmpReg, Address(boxReg, 0));
2147     if (os::is_MP()) {
2148       lock();
2149     }
2150     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2151     // Intentional fall-through into DONE_LABEL
2152 
2153     // DONE_LABEL is a hot target - we'd really like to place it at the
2154     // start of a cache line by padding with NOPs.
2155     // See the AMD and Intel software optimization manuals for the
2156     // most efficient "long" NOP encodings.
2157     // Unfortunately none of our alignment mechanisms suffice.
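    // For reference, the recommended multi-byte NOPs from the Intel SDM,
    // which such padding would use if we could emit it here:
    //   66 90                      ; 2-byte
    //   0F 1F 00                   ; 3-byte  NOP DWORD ptr [EAX]
    //   0F 1F 40 00                ; 4-byte  NOP DWORD ptr [EAX + 00H]
    //   0F 1F 84 00 00 00 00 00    ; 8-byte  NOP DWORD ptr [EAX + EAX*1 + 0]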
2158     if ((EmitSync & 65536) == 0) {
2159        bind (CheckSucc);
2160     }
2161 #else // _LP64
2162     // It's inflated
2163     if (EmitSync & 1024) {
2164       // Don't bother to ratify that m->_owner == Self.
2165       // We might consider predicating the m->_owner == Self check on Xcheck:jni
2166       // or running on a debug build.
2167       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2168       xorptr(boxReg, r15_thread);
2169     } else {
2170       xorptr(boxReg, boxReg);
2171     }
2172     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2173     jccb  (Assembler::notZero, DONE_LABEL);
2174     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2175     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2176     jccb  (Assembler::notZero, CheckSucc);
2177     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2178     jmpb  (DONE_LABEL);
2179 
2180     if ((EmitSync & 65536) == 0) {
2181       // Try to avoid passing control into the slow_path ...
2182       Label LSuccess, LGoSlowPath ;
2183       bind  (CheckSucc);
2184 
2185       // The following optional optimization can be elided if necessary
2186       // Effectively: if (succ == null) goto SlowPath
2187       // The code reduces the window for a race, however,
2188       // and thus benefits performance.
2189       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2190       jccb  (Assembler::zero, LGoSlowPath);
2191 
2192       if ((EmitSync & 16) && os::is_MP()) {
2193         xorptr(boxReg, boxReg);   // must be zero so the xchg installs null into _owner
2194         xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2195       } else {
2196         movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2197         if (os::is_MP()) {
2198           // Memory barrier/fence
2199           // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2200           // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2201           // This is faster on Nehalem and AMD Shanghai/Barcelona.
2202           // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2203           // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2204           // (mov box,0; xchgq box, &m->Owner; LD _succ).
2205           if (EmitSync & 32) {
2206             // An alternative is to use xchgq r15, r15->TSelf which has bidirectional
2207             // barrier semantics but is otherwise a no-op, as r15->TSelf == r15.
2208             xchgptr(r15_thread, Address(r15_thread, Thread::TSelf_offset()));
2209           } else {
2210             lock(); addl(Address(rsp, 0), 0);
2211           }
2212         }
2213       }
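      // The xchgq restructuring mentioned above would collapse the store and
      // the fence into one implicitly-LOCKed instruction (a sketch; not what
      // is emitted here):
      //
      //   xorptr(boxReg, boxReg);   // boxReg = 0
      //   xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      //   // xchg both clears _owner and serializes, so the following LD of
      //   // _succ cannot float above the store.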
2214       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2215       jccb  (Assembler::notZero, LSuccess);
2216 
2217       // Rare inopportune interleaving - race.
2218       // The successor vanished in the small window above.
2219       // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2220       // We need to ensure progress and succession.
2221       // Try to reacquire the lock.
2222       // If that fails then the new owner is responsible for succession and this
2223       // thread needs to take no further action and can exit via the fast path (success).
2224       // If the re-acquire succeeds then pass control into the slow path.
2225       // As implemented, this latter mode is horrible because we generate more
2226       // coherence traffic on the lock *and* artificially extend the critical section
2227       // length by virtue of passing control into the slow path.
2228 
2229       // box is really RAX -- the following CMPXCHG depends on that binding
2230       // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2231 
2232       movptr(boxReg, (int32_t)NULL_WORD);     // box is really RAX
2233       if (os::is_MP()) { lock(); }
2234       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2235       jccb  (Assembler::notEqual, LSuccess);
2236       // Intentional fall-through into slow-path
2237 
2238       bind  (LGoSlowPath);
2239       orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2240       jmpb  (DONE_LABEL);
2241 
2242       bind  (LSuccess);
2243       testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2244       jmpb  (DONE_LABEL);
2245     }
2246 
2247     bind  (Stacked);
2248     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2249     if (os::is_MP()) { lock(); }
2250     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2251 
2252     if (EmitSync & 65536) {
2253        bind (CheckSucc);
2254     }
2255 #endif
2256     bind(DONE_LABEL);
2257   }

2258 }
2259 #endif // COMPILER2
2260 
2261 void MacroAssembler::c2bool(Register x) {
2262   // implements x == 0 ? 0 : 1
2263   // note: must only look at the least-significant byte of x
2264   //       since C-style booleans are stored in one byte
2265   //       only! (was bug)
2266   andl(x, 0xFF);
2267   setb(Assembler::notZero, x);
2268 }
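// In C terms, c2bool computes:  x = ((x & 0xFF) != 0) ? 1 : 0;
// andl sets ZF from the low byte and setb(notZero) materializes !ZF into x.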
2269 
2270 // Wouldn't be needed if the AddressLiteral version had a new name
2271 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2272   Assembler::call(L, rtype);
2273 }
2274 
2275 void MacroAssembler::call(Register entry) {
2276   Assembler::call(entry);
2277 }