1941 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. 1942 // Methods that don't have provably balanced locking are forced to run in the 1943 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 1944 // The interpreter provides two properties: 1945 // I1: At return-time the interpreter automatically and quietly unlocks any 1946 // objects acquired by the current activation (frame). Recall that the 1947 // interpreter maintains an on-stack list of locks currently held by 1948 // a frame. 1949 // I2: If a method attempts to unlock an object that is not held by 1950 // the frame the interpreter throws IMSX. 1951 // 1952 // Let's say A(), which has provably balanced locking, acquires O and then calls B(). 1953 // B() doesn't have provably balanced locking so it runs in the interpreter. 1954 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 1955 // is still locked by A(). 1956 // 1957 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 1958 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 1959 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 1960 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 1961 1962 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 1963 assert(boxReg == rax, ""); 1964 assert_different_registers(objReg, boxReg, tmpReg); 1965 1966 if (EmitSync & 4) { 1967 // Disable - inhibit all inlining. Force control through the slow-path 1968 cmpptr (rsp, 0); 1969 } else 1970 if (EmitSync & 8) { 1971 Label DONE_LABEL; 1972 if (UseBiasedLocking) { 1973 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 1974 } 1975 // Classic stack-locking code ... 
1976 // Check whether the displaced header is 0 1977 //(=> recursive unlock) 1978 movptr(tmpReg, Address(boxReg, 0)); 1979 testptr(tmpReg, tmpReg); 1980 jccb(Assembler::zero, DONE_LABEL); 1981 // If not recursive lock, reset the header to displaced header 1982 if (os::is_MP()) { 1983 lock(); 1984 } 1985 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 1986 bind(DONE_LABEL); 1987 } else { 1988 Label DONE_LABEL, Stacked, CheckSucc; 1989 1990 // Critically, the biased locking test must have precedence over 1991 // and appear before the (box->dhw == 0) recursive stack-lock test. 1992 if (UseBiasedLocking && !UseOptoBiasInlining) { 1993 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 1994 } 1995 1996 #if INCLUDE_RTM_OPT 1997 if (UseRTMForStackLocks && use_rtm) { 1998 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 1999 Label L_regular_unlock; 2000 movptr(tmpReg, Address(objReg, 0)); // fetch markword 2001 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2002 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked 2003 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 2004 xend(); // otherwise end... 2005 jmp(DONE_LABEL); // ... and we're done 2006 bind(L_regular_unlock); 2043 // 2044 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier 2045 // before it STs null into _owner, releasing the lock. Updates 2046 // to data protected by the critical section must be visible before 2047 // we drop the lock (and thus before any other thread could acquire 2048 // the lock and observe the fields protected by the lock). 2049 // IA32's memory-model is SPO, so STs are ordered with respect to 2050 // each other and there's no need for an explicit barrier (fence). 2051 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 
2052 #ifndef _LP64 2053 get_thread (boxReg); 2054 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { 2055 // prefetchw [ebx + Offset(_owner)-2] 2056 prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2057 } 2058 2059 // Note that we could employ various encoding schemes to reduce 2060 // the number of loads below (currently 4) to just 2 or 3. 2061 // Refer to the comments in synchronizer.cpp. 2062 // In practice the chain of fetches doesn't seem to impact performance, however. 2063 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { 2064 // Attempt to reduce branch density - AMD's branch predictor. 2065 xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2066 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2067 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2068 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2069 jccb (Assembler::notZero, DONE_LABEL); 2070 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2071 jmpb (DONE_LABEL); 2072 } else { 2073 xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2074 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2075 jccb (Assembler::notZero, DONE_LABEL); 2076 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2077 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2078 jccb (Assembler::notZero, CheckSucc); 2079 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2080 jmpb (DONE_LABEL); 2081 } 2082 2083 // The Following code fragment (EmitSync & 65536) improves the performance of 2084 // contended applications and contended synchronization microbenchmarks. 2085 // Unfortunately the emission of the code - even though not executed - causes regressions 2086 // in scimark and jetstream, evidently because of $ effects. 
Replacing the code 2087 // with an equal number of never-executed NOPs results in the same regression. 2088 // We leave it off by default. 2089 2090 if ((EmitSync & 65536) != 0) { 2091 Label LSuccess, LGoSlowPath ; 2092 2093 bind (CheckSucc); 2094 2095 // Optional pre-test ... it's safe to elide this 2096 if ((EmitSync & 16) == 0) { 2097 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2098 jccb (Assembler::zero, LGoSlowPath); 2099 } 2100 2101 // We have a classic Dekker-style idiom: 2102 // ST m->_owner = 0 ; MEMBAR; LD m->_succ 2103 // There are a number of ways to implement the barrier: 2104 // (1) lock:andl &m->_owner, 0 2105 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form. 2106 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 2107 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 2108 // (2) If supported, an explicit MFENCE is appealing. 2109 // In older IA32 processors MFENCE is slower than lock:add or xchg 2110 // particularly if the write-buffer is full as might be the case 2111 // if stores closely precede the fence or fence-equivalent instruction. 2112 // In more modern implementations MFENCE appears faster, however. 2113 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack 2114 // The $lines underlying the top-of-stack should be in M-state. 2115 // The locked add instruction is serializing, of course. 2116 // (4) Use xchg, which is serializing 2117 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works 2118 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. 2119 // The integer condition codes will tell us if succ was 0. 2120 // Since _succ and _owner should reside in the same $line and 2121 // we just stored into _owner, it's likely that the $line 2122 // remains in M-state for the lock:orl. 2123 // 2124 // We currently use (3), although it's likely that switching to (2) 2125 // is correct for the future. 
2126 2127 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2128 if (os::is_MP()) { 2129 if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 2130 mfence(); 2131 } else { 2132 lock (); addptr(Address(rsp, 0), 0); 2133 } 2134 } 2135 // Ratify _succ remains non-null 2136 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0); 2137 jccb (Assembler::notZero, LSuccess); 2138 2139 xorptr(boxReg, boxReg); // box is really EAX 2140 if (os::is_MP()) { lock(); } 2141 cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2142 jccb (Assembler::notEqual, LSuccess); 2143 // Since we're low on registers we installed rsp as a placeholder in _owner. 2144 // Now install Self over rsp. This is safe as we're transitioning from 2145 // non-null to non-null 2146 get_thread (boxReg); 2147 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg); 2148 // Intentional fall-through into LGoSlowPath ... 2149 2150 bind (LGoSlowPath); 2151 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure 2152 jmpb (DONE_LABEL); 2153 2154 bind (LSuccess); 2162 // Try to reset the header to displaced header. 2163 // The "box" value on the stack is stable, so we can reload 2164 // and be assured we observe the same value as above. 2165 movptr(tmpReg, Address(boxReg, 0)); 2166 if (os::is_MP()) { 2167 lock(); 2168 } 2169 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2170 // Intentional fall-thru into DONE_LABEL 2171 2172 // DONE_LABEL is a hot target - we'd really like to place it at the 2173 // start of cache line by padding with NOPs. 2174 // See the AMD and Intel software optimization manuals for the 2175 // most efficient "long" NOP encodings. 2176 // Unfortunately none of our alignment mechanisms suffice. 
2177 if ((EmitSync & 65536) == 0) { 2178 bind (CheckSucc); 2179 } 2180 #else // _LP64 2181 // It's inflated 2182 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2183 xorptr(boxReg, r15_thread); 2184 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2185 jccb (Assembler::notZero, DONE_LABEL); 2186 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2187 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2188 jccb (Assembler::notZero, CheckSucc); 2189 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2190 jmpb (DONE_LABEL); 2191 2192 if ((EmitSync & 65536) == 0) { 2193 Label LSuccess, LGoSlowPath ; 2194 bind (CheckSucc); 2195 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2196 jccb (Assembler::zero, LGoSlowPath); 2197 2198 // I'd much rather use lock:andl m->_owner, 0 as it's faster than the 2199 // explicit ST;MEMBAR combination, but masm doesn't currently support 2200 // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc 2201 // are all faster when the write buffer is populated. 
2202 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2203 if (os::is_MP()) { 2204 lock (); addl (Address(rsp, 0), 0); 2205 } 2206 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2207 jccb (Assembler::notZero, LSuccess); 2208 2209 movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX 2210 if (os::is_MP()) { lock(); } 2211 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2212 jccb (Assembler::notEqual, LSuccess); 2213 // Intentional fall-through into slow-path 2214 2215 bind (LGoSlowPath); 2216 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 2217 jmpb (DONE_LABEL); 2218 2219 bind (LSuccess); 2220 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 2221 jmpb (DONE_LABEL); 2222 } 2223 2224 bind (Stacked); 2225 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 2226 if (os::is_MP()) { lock(); } 2227 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2228 2229 if (EmitSync & 65536) { 2230 bind (CheckSucc); 2231 } 2232 #endif 2233 bind(DONE_LABEL); 2234 // Avoid branch to branch on AMD processors 2235 if (EmitSync & 32768) { 2236 nop(); 2237 } 2238 } 2239 } 2240 #endif // COMPILER2 2241 2242 void MacroAssembler::c2bool(Register x) { 2243 // Normalizes x in place to a C-style boolean: x = ((x & 0xFF) == 0) ? 0 : 1. 2244 // Note: only the least-significant byte of x may be inspected, 2245 // since C-style booleans are stored in a single byte 2246 // only (this was once a bug). 2247 andl(x, 0xFF); 2248 setb(Assembler::notZero, x); 2249 } 2250 2251 // Forwards to Assembler::call(L, rtype); this wrapper wouldn't be needed if the AddressLiteral version had a new name 2252 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 2253 Assembler::call(L, rtype); 2254 } 2255 // Forwards to Assembler::call(entry). 2256 void MacroAssembler::call(Register entry) { 2257 Assembler::call(entry); 2258 } | 1941 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites. 
1942 // Methods that don't have provably balanced locking are forced to run in the 1943 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 1944 // The interpreter provides two properties: 1945 // I1: At return-time the interpreter automatically and quietly unlocks any 1946 // objects acquired by the current activation (frame). Recall that the 1947 // interpreter maintains an on-stack list of locks currently held by 1948 // a frame. 1949 // I2: If a method attempts to unlock an object that is not held by 1950 // the frame the interpreter throws IMSX. 1951 // 1952 // Let's say A(), which has provably balanced locking, acquires O and then calls B(). 1953 // B() doesn't have provably balanced locking so it runs in the interpreter. 1954 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 1955 // is still locked by A(). 1956 // 1957 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 1958 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 1959 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 1960 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 1961 // Arguably, given that the spec legislates the JNI case as undefined, our implementation 1962 // could reasonably *avoid* checking owner in Fast_Unlock(). 1963 // In the interest of performance we elide the m->Owner==Self check in unlock. 1964 // A perfectly viable alternative is to elide the owner check except when 1965 // Xcheck:jni is enabled. 1966 1967 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 1968 assert(boxReg == rax, ""); 1969 assert_different_registers(objReg, boxReg, tmpReg); 1970 1971 if (EmitSync & 4) { 1972 // Disable - inhibit all inlining. 
Force control through the slow-path 1973 cmpptr (rsp, 0); 1974 } else { 1975 Label DONE_LABEL, Stacked, CheckSucc; 1976 1977 // Critically, the biased locking test must have precedence over 1978 // and appear before the (box->dhw == 0) recursive stack-lock test. 1979 if (UseBiasedLocking && !UseOptoBiasInlining) { 1980 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 1981 } 1982 1983 #if INCLUDE_RTM_OPT 1984 if (UseRTMForStackLocks && use_rtm) { 1985 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 1986 Label L_regular_unlock; 1987 movptr(tmpReg, Address(objReg, 0)); // fetch markword 1988 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 1989 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked 1990 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 1991 xend(); // otherwise end... 1992 jmp(DONE_LABEL); // ... and we're done 1993 bind(L_regular_unlock); 2030 // 2031 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier 2032 // before it STs null into _owner, releasing the lock. Updates 2033 // to data protected by the critical section must be visible before 2034 // we drop the lock (and thus before any other thread could acquire 2035 // the lock and observe the fields protected by the lock). 2036 // IA32's memory-model is SPO, so STs are ordered with respect to 2037 // each other and there's no need for an explicit barrier (fence). 2038 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 2039 #ifndef _LP64 2040 get_thread (boxReg); 2041 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { 2042 // prefetchw [ebx + Offset(_owner)-2] 2043 prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2044 } 2045 2046 // Note that we could employ various encoding schemes to reduce 2047 // the number of loads below (currently 4) to just 2 or 3. 2048 // Refer to the comments in synchronizer.cpp. 
2049 // In practice the chain of fetches doesn't seem to impact performance, however. 2050 xorptr(boxReg, boxReg); 2051 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { 2052 // Attempt to reduce branch density - AMD's branch predictor. 2053 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2054 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2055 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2056 jccb (Assembler::notZero, DONE_LABEL); 2057 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2058 jmpb (DONE_LABEL); 2059 } else { 2060 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2061 jccb (Assembler::notZero, DONE_LABEL); 2062 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2063 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2064 jccb (Assembler::notZero, CheckSucc); 2065 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2066 jmpb (DONE_LABEL); 2067 } 2068 2069 // The Following code fragment (EmitSync & 65536) improves the performance of 2070 // contended applications and contended synchronization microbenchmarks. 2071 // Unfortunately the emission of the code - even though not executed - causes regressions 2072 // in scimark and jetstream, evidently because of $ effects. Replacing the code 2073 // with an equal number of never-executed NOPs results in the same regression. 2074 // We leave it off by default. 2075 2076 if ((EmitSync & 65536) != 0) { 2077 Label LSuccess, LGoSlowPath ; 2078 2079 bind (CheckSucc); 2080 2081 // Optional pre-test ... 
it's safe to elide this 2082 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2083 jccb(Assembler::zero, LGoSlowPath); 2084 2085 // We have a classic Dekker-style idiom: 2086 // ST m->_owner = 0 ; MEMBAR; LD m->_succ 2087 // There are a number of ways to implement the barrier: 2088 // (1) lock:andl &m->_owner, 0 2089 // is fast, but masm doesn't currently support the "ANDL M,IMM32" form. 2090 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 2091 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 2092 // (2) If supported, an explicit MFENCE is appealing. 2093 // In older IA32 processors MFENCE is slower than lock:add or xchg 2094 // particularly if the write-buffer is full as might be the case 2095 // if stores closely precede the fence or fence-equivalent instruction. 2096 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 2097 // as the situation has changed with Nehalem and Shanghai. 2098 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack 2099 // The $lines underlying the top-of-stack should be in M-state. 2100 // The locked add instruction is serializing, of course. 2101 // (4) Use xchg, which is serializing 2102 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works 2103 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. 2104 // The integer condition codes will tell us if succ was 0. 2105 // Since _succ and _owner should reside in the same $line and 2106 // we just stored into _owner, it's likely that the $line 2107 // remains in M-state for the lock:orl. 2108 // 2109 // We currently use (3), although it's likely that switching to (2) 2110 // is correct for the future. 
2111 2112 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2113 if (os::is_MP()) { 2114 lock(); addptr(Address(rsp, 0), 0); 2115 } 2116 // Ratify _succ remains non-null 2117 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0); 2118 jccb (Assembler::notZero, LSuccess); 2119 2120 xorptr(boxReg, boxReg); // box is really EAX 2121 if (os::is_MP()) { lock(); } 2122 cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2123 jccb (Assembler::notEqual, LSuccess); 2124 // Since we're low on registers we installed rsp as a placeholder in _owner. 2125 // Now install Self over rsp. This is safe as we're transitioning from 2126 // non-null to non-null 2127 get_thread (boxReg); 2128 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg); 2129 // Intentional fall-through into LGoSlowPath ... 2130 2131 bind (LGoSlowPath); 2132 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure 2133 jmpb (DONE_LABEL); 2134 2135 bind (LSuccess); 2143 // Try to reset the header to displaced header. 2144 // The "box" value on the stack is stable, so we can reload 2145 // and be assured we observe the same value as above. 2146 movptr(tmpReg, Address(boxReg, 0)); 2147 if (os::is_MP()) { 2148 lock(); 2149 } 2150 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2151 // Intentional fall-thru into DONE_LABEL 2152 2153 // DONE_LABEL is a hot target - we'd really like to place it at the 2154 // start of cache line by padding with NOPs. 2155 // See the AMD and Intel software optimization manuals for the 2156 // most efficient "long" NOP encodings. 2157 // Unfortunately none of our alignment mechanisms suffice. 2158 if ((EmitSync & 65536) == 0) { 2159 bind (CheckSucc); 2160 } 2161 #else // _LP64 2162 // It's inflated 2163 if (EmitSync & 1024) { 2164 // Don't bother to ratify that m_Owner==Self 2165 // We might consider predicating the m_Owner==Self check on Xcheck:jni 2166 // or running on a debug build. // NOTE(review): as written, the owner==Self comparison (movptr owner / xorptr r15_thread) is emitted when (EmitSync & 1024) is set and skipped in the else branch -- the "don't bother" wording seems to describe the else branch; confirm the intended polarity. 
2167 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2168 xorptr(boxReg, r15_thread); 2169 } else { 2170 xorptr(boxReg, boxReg); 2171 } 2172 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2173 jccb (Assembler::notZero, DONE_LABEL); 2174 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2175 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2176 jccb (Assembler::notZero, CheckSucc); 2177 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2178 jmpb (DONE_LABEL); 2179 2180 if ((EmitSync & 65536) == 0) { 2181 // Try to avoid passing control into the slow_path ... 2182 Label LSuccess, LGoSlowPath ; 2183 bind (CheckSucc); 2184 2185 // The following optional optimization can be elided if necessary 2186 // Effectively: if (succ == null) goto SlowPath 2187 // The code reduces the window for a race, however, 2188 // and thus benefits performance. 2189 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2190 jccb (Assembler::zero, LGoSlowPath); 2191 2192 if ((EmitSync & 16) && os::is_MP()) { // NOTE(review): on the path visible here boxReg still holds (cxq|EntryList), which is non-zero at CheckSucc, and orptr(boxReg, boxReg) leaves it unchanged -- so the xchgptr below would store a non-zero value into owner. The restructuring described in the comment further down is "mov box,0; xchgq box, &m->Owner"; presumably xorptr(boxReg, boxReg) was intended -- confirm. 2193 orptr(boxReg, boxReg); 2194 xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2195 } else { 2196 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2197 if (os::is_MP()) { 2198 // Memory barrier/fence 2199 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 2200 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 2201 // This is faster on Nehalem and AMD Shanghai/Barcelona. 2202 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 2203 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 2204 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 
2205 if (EmitSync & 32) { 2206 // An alternative is to use xchgq r15, r15->TSelf which has bidirectional 2207 // barrier semantics but is otherwise a no-op, as r15->TSelf == r15. 2208 xchgptr(r15_thread, Address(r15_thread, Thread::TSelf_offset())); 2209 } else { 2210 lock(); addl(Address(rsp, 0), 0); 2211 } 2212 } 2213 } 2214 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2215 jccb (Assembler::notZero, LSuccess); 2216 2217 // Rare inopportune interleaving - race. 2218 // The successor vanished in the small window above. 2219 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 2220 // We need to ensure progress and succession. 2221 // Try to reacquire the lock. 2222 // If that fails then the new owner is responsible for succession and this 2223 // thread needs to take no further action and can exit via the fast path (success). 2224 // If the re-acquire succeeds then pass control into the slow path. 2225 // As implemented, this latter mode is horrible because we generated more 2226 // coherence traffic on the lock *and* artificially extended the critical section 2227 // length by virtue of passing control into the slow path. 
2228 2229 // box is really RAX -- the following CMPXCHG depends on that binding 2230 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 2231 2232 movptr(boxReg, (int32_t)NULL_WORD); // box is really RAX 2233 if (os::is_MP()) { lock(); } 2234 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2235 jccb (Assembler::notEqual, LSuccess); 2236 // Intentional fall-through into slow-path 2237 2238 bind (LGoSlowPath); 2239 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 2240 jmpb (DONE_LABEL); 2241 2242 bind (LSuccess); 2243 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 2244 jmpb (DONE_LABEL); 2245 } 2246 2247 bind (Stacked); 2248 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 2249 if (os::is_MP()) { lock(); } 2250 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2251 2252 if (EmitSync & 65536) { 2253 bind (CheckSucc); 2254 } 2255 #endif 2256 bind(DONE_LABEL); 2257 } 2258 } 2259 #endif // COMPILER2 2260 2261 void MacroAssembler::c2bool(Register x) { 2262 // Normalizes x in place to a C-style boolean: x = ((x & 0xFF) == 0) ? 0 : 1. 2263 // Note: only the least-significant byte of x may be inspected, 2264 // since C-style booleans are stored in a single byte 2265 // only (this was once a bug). 2266 andl(x, 0xFF); 2267 setb(Assembler::notZero, x); 2268 } 2269 2270 // Forwards to Assembler::call(L, rtype); this wrapper wouldn't be needed if the AddressLiteral version had a new name 2271 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 2272 Assembler::call(L, rtype); 2273 } 2274 // Forwards to Assembler::call(entry). 2275 void MacroAssembler::call(Register entry) { 2276 Assembler::call(entry); 2277 } |