                                 Register scrReg, Register cx1Reg, Register cx2Reg,
                                 BiasedLockingCounters* counters,
                                 RTMLockingCounters* rtm_counters,
                                 RTMLockingCounters* stack_rtm_counters,
                                 Metadata* method_data,
                                 bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }
  if (EmitSync & 1) {
    // set box->dhw = markOopDesc::unused_mark()
    // Force all sync thru slow-path: slow_enter() and slow_exit()
    movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
    cmpptr (rsp, (int32_t)NULL_WORD);
  } else {
    // Possible cases that we'll encounter in fast_lock
    // (a pseudocode sketch of the dispatch follows this list)
    // ------------------------------------------------
    // * Inflated
    //    -- unlocked
    //    -- Locked
    //       = by self
    //       = by other
    // * biased
    //    -- by Self
    //    -- by other
    // * neutral
    // * stack-locked
    //    -- by self
    //       = sp-proximity test hits
    //       = sp-proximity test generates false-negative
    //    -- by other
    //
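    // A rough pseudocode sketch of how these cases are dispatched (illustrative
    // only -- 'mark' and 'm' are hypothetical names, not registers used below;
    // the emitted code works on the mark word loaded into tmpReg):
    //
    //   mark = obj->mark();
    //   if (mark is biased)            --> biased-locking enter path
    //   else if (mark & monitor_value) --> IsInflated (ObjectMonitor path below)
    //   else                           --> try stack-lock: box->dhw = mark and
    //                                      CAS obj->mark from mark to box, with an
    //                                      sp-proximity test for the recursive case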

    Label IsInflated, DONE_LABEL;

    // ... (code for the non-inflated cases enumerated above is elided here) ...

    bind(IsInflated);
    // The object is inflated. tmpReg contains the ObjectMonitor* tagged with
    // markOopDesc::monitor_value.

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
    } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
    // The object is inflated.

    // boxReg refers to the on-stack BasicLock in the current frame.
    // We'd like to write:
    //   set box->_displaced_header = markOopDesc::unused_mark(). Any non-0 value suffices.
    // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
    // additional latency as we have another ST in the store buffer that must drain.
    if (EmitSync & 8192) {
      movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
      get_thread (scrReg);
      movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
      movptr(tmpReg, NULL_WORD);                // consider: xor vs mov
      if (os::is_MP()) {
        lock();
      }
      cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    } else if ((EmitSync & 128) == 0) {         // avoid ST-before-CAS
      // register juggle because we need tmpReg for cmpxchgptr below
      movptr(scrReg, boxReg);
      movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

      // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
      if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
        // prefetchw [eax + Offset(_owner)-2]
        prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      }

      if ((EmitSync & 64) == 0) {
        // Optimistic form: consider XORL tmpReg,tmpReg
        movptr(tmpReg, NULL_WORD);
      } else {
        // Can suffer RTS->RTO upgrades on shared or cold $ lines
        // Test-And-CAS instead of CAS
        movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
        testptr(tmpReg, tmpReg);                // Locked ?
        jccb  (Assembler::notZero, DONE_LABEL);
      }

      // Appears unlocked - try to swing _owner from null to non-null.
      // Ideally, I'd manifest "Self" with get_thread and then attempt
      // to CAS the register containing Self into m->Owner.
      // But we don't have enough registers, so instead we can either try to CAS
      // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
      // we later store "Self" into m->Owner. Transiently storing a stack address
      // (rsp or the address of the box) into m->owner is harmless.
      // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
      if (os::is_MP()) {
        lock();
      }
      cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
      // If we weren't able to swing _owner from NULL to the BasicLock
      // then take the slow path.
      jccb  (Assembler::notZero, DONE_LABEL);
      // update _owner from BasicLock to thread
      get_thread (scrReg);                      // beware: clobbers ICCs
      movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
      xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

      // If the CAS fails we can either retry or pass control to the slow-path.
      // We use the latter tactic.
      // Pass the CAS result in the icc.ZFlag into DONE_LABEL
      // If the CAS was successful ...
      //   Self has acquired the lock
      //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
      // Intentional fall-through into DONE_LABEL ...
    } else {
      movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));   // results in ST-before-CAS penalty
      movptr(boxReg, tmpReg);

      // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
      if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
        // prefetchw [eax + Offset(_owner)-2]
        prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      }

      if ((EmitSync & 64) == 0) {
        // Optimistic form
        xorptr (tmpReg, tmpReg);
      } else {
        // Can suffer RTS->RTO upgrades on shared or cold $ lines
        movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
        testptr(tmpReg, tmpReg);                // Locked ?
        jccb  (Assembler::notZero, DONE_LABEL);
      }

      // Appears unlocked - try to swing _owner from null to non-null.
      // Use either "Self" (in scr) or rsp as thread identity in _owner.
      // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
      get_thread (scrReg);
      if (os::is_MP()) {
        lock();
      }
      cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

      // If the CAS fails we can either retry or pass control to the slow-path.
      // We use the latter tactic.
      // Pass the CAS result in the icc.ZFlag into DONE_LABEL
      // If the CAS was successful ...
      //   Self has acquired the lock
      //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
      // Intentional fall-through into DONE_LABEL ...
    }
#else // _LP64
    // It's inflated
    movq(scrReg, tmpReg);
    xorq(tmpReg, tmpReg);

    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
    // Without the cast to int32_t movptr would destroy r10 which is typically obj.
    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
    // Intentional fall-through into DONE_LABEL ...
    // Propagate ICC.ZF from the CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
    // DONE_LABEL is a hot target - we'd really like to place it at the
    // start of a cache line by padding with NOPs.
    // See the AMD and Intel software optimization manuals for the
    // most efficient "long" NOP encodings.
    // Unfortunately none of our alignment mechanisms suffice.
    bind(DONE_LABEL);

    // At DONE_LABEL the icc ZFlag is set as follows ...
    // Fast_Unlock uses the same protocol.
    // ZFlag == 1 -> Success
    // ZFlag == 0 -> Failure - force control through the slow-path
  }
}
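
// A caller-side sketch of the ZFlag protocol above (illustrative only -- the
// actual consumer is the code C2 expands around the FastLock node, and
// 'slow_enter_stub' is a hypothetical label, not a symbol in this file):
//   fast_lock(obj, box, tmp, ...);   // leaves ZF per the protocol above
//   jne  slow_enter_stub;            // ZF == 0: take the monitorenter slow path
//   ...                              // ZF == 1: lock acquired, continue inline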

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of the locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't say what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec leaves the JNI case undefined, our implementation
// could reasonably *avoid* checking the owner in Fast_Unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
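//
// For instance (an illustrative mixed-mode pattern, not code from this file):
//   synchronized (o) { ... }      // Java-level: compiled to Fast_Lock/Fast_Unlock
//   env->MonitorEnter(o);         // JNI-level lock of the same object; pairing
//   ...                           // this with a Java-level unlock is exactly the
//                                 // mixed-mode case the spec leaves undefined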

void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  if (EmitSync & 4) {
    // Disable - inhibit all inlining. Force control through the slow-path
    cmpptr (rsp, 0);
  } else {
    Label DONE_LABEL, Stacked, CheckSucc;

    // Critically, the biased locking test must have precedence over
    // and appear before the (box->dhw == 0) recursive stack-lock test.
    if (UseBiasedLocking && !UseOptoBiasInlining) {
      biased_locking_exit(objReg, tmpReg, DONE_LABEL);
    }

#if INCLUDE_RTM_OPT
    if (UseRTMForStackLocks && use_rtm) {
      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
      Label L_regular_unlock;
      movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // fetch markword
      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place);             // look at 3 lock bits
      cmpptr(tmpReg, markOopDesc::unlocked_value);                        // bits = 001 unlocked
      jccb(Assembler::notEqual, L_regular_unlock);                        // if !HLE RegularLock
      xend();                                                             // otherwise end...
      jmp(DONE_LABEL);                                                    // ... and we're done
      bind(L_regular_unlock);
    }
#endif // INCLUDE_RTM_OPT

    // ... (elided) ...
    // ... state in _succ so we can avoid fetching EntryList|cxq.
    //
    // I'd like to add more cases in fast_lock() and fast_unlock() --
    // such as recursive enter and exit -- but we have to be wary of
    // I$ bloat, T$ effects and BP$ effects.
    //
    // If there's no contention try a 1-0 exit. That is, exit without
    // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
    // we detect and recover from the race that the 1-0 exit admits.
    //
    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
    // before it STs null into _owner, releasing the lock. Updates
    // to data protected by the critical section must be visible before
    // we drop the lock (and thus before any other thread could acquire
    // the lock and observe the fields protected by the lock).
    // IA32's memory-model is SPO, so STs are ordered with respect to
    // each other and there's no need for an explicit barrier (fence).
    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
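    //
    // In pseudocode, the 1-0 exit implemented below is roughly (a sketch;
    // 'm' is the ObjectMonitor, 'null' stands for NULL_WORD):
    //   if (m->_recursions != 0)     goto slow path;   // recursive exit
    //   if (m->EntryList | m->cxq)   goto CheckSucc;   // waiters present
    //   m->_owner = null;                              // 1-0 exit: plain ST, no fence/CAS
    //   goto done (ZF == 1);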
#ifndef _LP64
    get_thread (boxReg);
    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
      // prefetchw [ebx + Offset(_owner)-2]
      prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    }

    // Note that we could employ various encoding schemes to reduce
    // the number of loads below (currently 4) to just 2 or 3.
    // Refer to the comments in synchronizer.cpp.
    // In practice the chain of fetches doesn't seem to impact performance, however.
    xorptr(boxReg, boxReg);
    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
      // Attempt to reduce branch density - AMD's branch predictor.
      orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
      orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
      orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
      jccb  (Assembler::notZero, DONE_LABEL);
      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
      jmpb  (DONE_LABEL);
    } else {
      orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
      jccb  (Assembler::notZero, DONE_LABEL);
      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
      orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
      jccb  (Assembler::notZero, CheckSucc);
      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
      jmpb  (DONE_LABEL);
    }

    // The following code fragment (EmitSync & 65536) improves the performance of
    // contended applications and contended synchronization microbenchmarks.
    // Unfortunately the emission of the code - even though not executed - causes regressions
    // in scimark and jetstream, evidently because of $ effects. Replacing the code
    // with an equal number of never-executed NOPs results in the same regression.
    // We leave it off by default.
    if ((EmitSync & 65536) != 0) {
      Label LSuccess, LGoSlowPath ;

      bind  (CheckSucc);

      // Optional pre-test ... it's safe to elide this
      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
      jccb(Assembler::zero, LGoSlowPath);

      // We have a classic Dekker-style idiom:
      //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
      // There are a number of ways to implement the barrier:
      // (1) lock:andl &m->_owner, 0
      //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
      //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
      //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
      // (2) If supported, an explicit MFENCE is appealing.
      //     In older IA32 processors MFENCE is slower than lock:add or xchg
      //     particularly if the write-buffer is full as might be the case if
      //     stores closely precede the fence or fence-equivalent instruction.
      //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
      //     as the situation has changed with Nehalem and Shanghai.
      // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
      //     The $lines underlying the top-of-stack should be in M-state.
      //     The locked add instruction is serializing, of course.
      // (4) Use xchg, which is serializing
      //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
      // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
      //     The integer condition codes will tell us if succ was 0.
      //     Since _succ and _owner should reside in the same $line and
      //     we just stored into _owner, it's likely that the $line
      //     remains in M-state for the lock:orl.
      //
      // We currently use (3), although it's likely that switching to (2)
      // is correct for the future.

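      // For comparison, option (5) would look roughly like this (a sketch in
      // pseudo-assembly, not emitted anywhere in this file):
      //   MOV   [m + _owner], 0       ; plain release store
      //   LOCK: ORL [m + _succ], 0    ; serializing; ZF <- (_succ == 0)
      //   JZ    LGoSlowPath           ; no successor -> slow path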
      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
      if (os::is_MP()) {
        lock(); addptr(Address(rsp, 0), 0);
      }
      // Ratify _succ remains non-null
      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
      jccb  (Assembler::notZero, LSuccess);

      xorptr(boxReg, boxReg);                   // box is really EAX
      if (os::is_MP()) { lock(); }
      cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      // There's no successor so we tried to regrab the lock with the
      // placeholder value. If that didn't work, then another thread
      // grabbed the lock so we're done (and exit was a success).
      jccb  (Assembler::notEqual, LSuccess);
      // Since we're low on registers we installed rsp as a placeholder in _owner.
      // Now install Self over rsp. This is safe as we're transitioning from
      // non-null to non-null
      get_thread (boxReg);
      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
      // Intentional fall-through into LGoSlowPath ...

      bind  (LGoSlowPath);
      orptr(boxReg, 1);                         // set ICC.ZF=0 to indicate failure
      jmpb  (DONE_LABEL);

      bind  (LSuccess);
      xorptr(boxReg, boxReg);                   // set ICC.ZF=1 to indicate success
      jmpb  (DONE_LABEL);
    }

    bind (Stacked);
    // It's not inflated and it's not recursively stack-locked and it's not biased.
    // It must be stack-locked.
    // Try to reset the header to the displaced header.
    // The "box" value on the stack is stable, so we can reload
    // it and be assured we observe the same value as above.
    movptr(tmpReg, Address(boxReg, 0));
    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Uses RAX which is box
    // Intentional fall-through into DONE_LABEL

    // DONE_LABEL is a hot target - we'd really like to place it at the
    // start of a cache line by padding with NOPs.
    // See the AMD and Intel software optimization manuals for the
    // most efficient "long" NOP encodings.
    // Unfortunately none of our alignment mechanisms suffice.
    if ((EmitSync & 65536) == 0) {
      bind (CheckSucc);
    }
#else // _LP64
    // It's inflated
    if (EmitSync & 1024) {
      // Emit code to check that _owner == Self
      // We could fold the _owner test into subsequent code more efficiently
      // than using a stand-alone check, but since _owner checking is off by
      // default we don't bother. We also might consider predicating the
      // _owner==Self check on Xcheck:jni or running on a debug build.
      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      xorptr(boxReg, r15_thread);
    } else {
      xorptr(boxReg, boxReg);
    }
    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jccb  (Assembler::notZero, DONE_LABEL);
    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jccb  (Assembler::notZero, CheckSucc);
    movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
    jmpb  (DONE_LABEL);

    if ((EmitSync & 65536) == 0) {
      // Try to avoid passing control into the slow_path ...
      Label LSuccess, LGoSlowPath ;
      bind  (CheckSucc);

      // The following optional optimization can be elided if necessary
      // Effectively: if (succ == null) goto SlowPath
      // The code reduces the window for a race, however,
      // and thus benefits performance.
      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
      jccb  (Assembler::zero, LGoSlowPath);

      xorptr(boxReg, boxReg);
      if ((EmitSync & 16) && os::is_MP()) {
        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      } else {
        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
        if (os::is_MP()) {
          // Memory barrier/fence
          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
          // This is faster on Nehalem and AMD Shanghai/Barcelona.
          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
          lock(); addl(Address(rsp, 0), 0);
        }
      }
      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
      jccb  (Assembler::notZero, LSuccess);

      // Rare inopportune interleaving - race.
      // The successor vanished in the small window above.
      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
      // We need to ensure progress and succession.
      // Try to reacquire the lock.
      // If that fails then the new owner is responsible for succession and this
      // thread needs to take no further action and can exit via the fast path (success).
      // If the re-acquire succeeds then pass control into the slow path.
      // As implemented, this latter mode is horrible because we generated more
      // coherence traffic on the lock *and* artificially extended the critical section
      // length by virtue of passing control into the slow path.

      // box is really RAX -- the following CMPXCHG depends on that binding
      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
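      // Spelled out, the hardware semantics of LOCK CMPXCHG R,[M] are:
      //   if (*M == rax) { *M = R; ZF = 1; } else { rax = *M; ZF = 0; }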
      if (os::is_MP()) { lock(); }
      cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
      // There's no successor so we tried to regrab the lock.
      // If that didn't work, then another thread grabbed the
      // lock so we're done (and exit was a success).
      jccb  (Assembler::notEqual, LSuccess);
      // Intentional fall-through into slow-path

      bind  (LGoSlowPath);
      orl   (boxReg, 1);                        // set ICC.ZF=0 to indicate failure
      jmpb  (DONE_LABEL);

      bind  (LSuccess);
      testl (boxReg, 0);                        // set ICC.ZF=1 to indicate success
      jmpb  (DONE_LABEL);
    }

    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));        // re-fetch
    if (os::is_MP()) { lock(); }
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Uses RAX which is box

    if (EmitSync & 65536) {
      bind (CheckSucc);
    }
#endif
    bind(DONE_LABEL);
  }
}
#endif // COMPILER2

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
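
// A worked example (illustrative values, not from this file): if x holds
// 0x1200, the andl leaves 0 and setb(notZero) writes 0; if x holds 0x1201,
// the andl leaves 1 and setb writes 1 -- only the low byte decides the result.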

// Wouldn't need if AddressLiteral version had new name
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}

void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}