
src/hotspot/cpu/x86/macroAssembler_x86.cpp

rev 51633 : imported patch 8210381


1704                                Register scrReg, Register cx1Reg, Register cx2Reg,
1705                                BiasedLockingCounters* counters,
1706                                RTMLockingCounters* rtm_counters,
1707                                RTMLockingCounters* stack_rtm_counters,
1708                                Metadata* method_data,
1709                                bool use_rtm, bool profile_rtm) {
1710   // Ensure the register assignments are disjoint
1711   assert(tmpReg == rax, "");
1712 
1713   if (use_rtm) {
1714     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1715   } else {
1716     assert(cx1Reg == noreg, "");
1717     assert(cx2Reg == noreg, "");
1718     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1719   }
1720 
1721   if (counters != NULL) {
1722     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1723   }
1724   if (EmitSync & 1) {
1725       // set box->dhw = markOopDesc::unused_mark()
1726       // Force all sync thru slow-path: slow_enter() and slow_exit()
1727       movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1728       cmpptr (rsp, (int32_t)NULL_WORD);
1729   } else {
1730     // Possible cases that we'll encounter in fast_lock
1731     // ------------------------------------------------
1732     // * Inflated
1733     //    -- unlocked
1734     //    -- Locked
1735     //       = by self
1736     //       = by other
1737     // * biased
1738     //    -- by Self
1739     //    -- by other
1740     // * neutral
1741     // * stack-locked
1742     //    -- by self
1743     //       = sp-proximity test hits
1744     //       = sp-proximity test generates false-negative
1745     //    -- by other
1746     //
1747 
1748     Label IsInflated, DONE_LABEL;
1749 


1798     bind(IsInflated);
1799     // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1800 
1801 #if INCLUDE_RTM_OPT
1802     // Use the same RTM locking code in 32- and 64-bit VM.
1803     if (use_rtm) {
1804       rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1805                            rtm_counters, method_data, profile_rtm, DONE_LABEL);
1806     } else {
1807 #endif // INCLUDE_RTM_OPT
1808 
1809 #ifndef _LP64
1810     // The object is inflated.
1811 
1812     // boxReg refers to the on-stack BasicLock in the current frame.
1813     // We'd like to write:
1814     //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1815     // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1816     // additional latency as we have another ST in the store buffer that must drain.
1817 
1818     if (EmitSync & 8192) {
1819        movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1820        get_thread (scrReg);
1821        movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1822        movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1823        if (os::is_MP()) {
1824          lock();
1825        }
1826        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1827     } else
1828     if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1829        // register juggle because we need tmpReg for cmpxchgptr below
1830        movptr(scrReg, boxReg);
1831        movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1832 
1833        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1834        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1835           // prefetchw [eax + Offset(_owner)-2]
1836           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1837        }
1838 
1839        if ((EmitSync & 64) == 0) {
1840          // Optimistic form: consider XORL tmpReg,tmpReg
1841          movptr(tmpReg, NULL_WORD);
1842        } else {
1843          // Can suffer RTS->RTO upgrades on shared or cold $ lines
1844          // Test-And-CAS instead of CAS
1845          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1846          testptr(tmpReg, tmpReg);                   // Locked ?
1847          jccb  (Assembler::notZero, DONE_LABEL);
1848        }
1849 
1850        // Appears unlocked - try to swing _owner from null to non-null.
1851        // Ideally, I'd manifest "Self" with get_thread and then attempt
1852        // to CAS the register containing Self into m->Owner.
1853        // But we don't have enough registers, so instead we can either try to CAS
1854        // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1855        // we later store "Self" into m->Owner.  Transiently storing a stack address
1856        // (rsp or the address of the box) into  m->owner is harmless.
1857        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1858        if (os::is_MP()) {
1859          lock();
1860        }
1861        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1862        movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1863        // If we weren't able to swing _owner from NULL to the BasicLock
1864        // then take the slow path.
1865        jccb  (Assembler::notZero, DONE_LABEL);
1866        // update _owner from BasicLock to thread
1867        get_thread (scrReg);                    // beware: clobbers ICCs
1868        movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1869        xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1870 
1871        // If the CAS fails we can either retry or pass control to the slow-path.
1872        // We use the latter tactic.
1873        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1874        // If the CAS was successful ...
1875        //   Self has acquired the lock
1876        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1877        // Intentional fall-through into DONE_LABEL ...
1878     } else {
1879        movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1880        movptr(boxReg, tmpReg);
1881 
1882        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1883        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1884           // prefetchw [eax + Offset(_owner)-2]
1885           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1886        }
1887 
1888        if ((EmitSync & 64) == 0) {
1889          // Optimistic form
1890          xorptr  (tmpReg, tmpReg);
1891        } else {
1892          // Can suffer RTS->RTO upgrades on shared or cold $ lines
1893          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1894          testptr(tmpReg, tmpReg);                   // Locked ?
1895          jccb  (Assembler::notZero, DONE_LABEL);
1896        }
1897 
1898        // Appears unlocked - try to swing _owner from null to non-null.
1899        // Use either "Self" (in scr) or rsp as thread identity in _owner.
1900        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1901        get_thread (scrReg);
1902        if (os::is_MP()) {
1903          lock();
1904        }
1905        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1906 
1907        // If the CAS fails we can either retry or pass control to the slow-path.
1908        // We use the latter tactic.
1909        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1910        // If the CAS was successful ...
1911        //   Self has acquired the lock
1912        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1913        // Intentional fall-through into DONE_LABEL ...
1914     }
1915 #else // _LP64
1916     // It's inflated
1917     movq(scrReg, tmpReg);
1918     xorq(tmpReg, tmpReg);
1919 
1920     if (os::is_MP()) {
1921       lock();
1922     }
1923     cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1924     // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1925     // Without cast to int32_t movptr will destroy r10 which is typically obj.
1926     movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1927     // Intentional fall-through into DONE_LABEL ...
1928     // Propagate ICC.ZF from CAS above into DONE_LABEL.
1929 #endif // _LP64
1930 #if INCLUDE_RTM_OPT
1931     } // use_rtm()
1932 #endif
1933     // DONE_LABEL is a hot target - we'd really like to place it at the
1934     // start of a cache line by padding with NOPs.
1935     // See the AMD and Intel software optimization manuals for the
1936     // most efficient "long" NOP encodings.
1937     // Unfortunately none of our alignment mechanisms suffice.
1938     bind(DONE_LABEL);
1939 
1940     // At DONE_LABEL the icc ZFlag is set as follows ...
1941     // Fast_Unlock uses the same protocol.
1942     // ZFlag == 1 -> Success
1943     // ZFlag == 0 -> Failure - force control through the slow-path
1944   }
1945 }
1946 
1947 // obj: object to unlock
1948 // box: box address (displaced header location), killed.  Must be EAX.
1949 // tmp: killed, cannot be obj nor box.
1950 //
1951 // Some commentary on balanced locking:
1952 //
1953 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1954 // Methods that don't have provably balanced locking are forced to run in the
1955 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1956 // The interpreter provides two properties:
1957 // I1:  At return-time the interpreter automatically and quietly unlocks any
1958 //      objects acquired in the current activation (frame).  Recall that the
1959 //      interpreter maintains an on-stack list of locks currently held by
1960 //      a frame.
1961 // I2:  If a method attempts to unlock an object that is not held by the
1962 //      frame, the interpreter throws IMSX.
1963 //
1964 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1965 // B() doesn't have provably balanced locking so it runs in the interpreter.
1966 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1967 // is still locked by A().
1968 //
1969 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1970 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1971 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1972 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1973 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1974 // could reasonably *avoid* checking owner in Fast_Unlock().
1975 // In the interest of performance we elide the m->Owner==Self check in unlock.
1976 // A perfectly viable alternative is to elide the owner check except when
1977 // Xcheck:jni is enabled.
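
(Aside: a minimal sketch of the mixed-mode case described above. The class and method names are hypothetical; only the MonitorEnter/MonitorExit JNI calls are real. A native method can take a monitor through JNI and return while still holding it; releasing that acquisition via Java-level locking, or the reverse, is exactly what the JNI spec leaves undefined.)

#include <jni.h>

// Hypothetical native method: takes obj's monitor through JNI and returns
// without the matching env->MonitorExit(obj).  Pairing this acquisition with a
// Java-level unlock (or vice versa) is the undefined mixed-mode case.
extern "C" JNIEXPORT void JNICALL
Java_Example_lockViaJni(JNIEnv* env, jclass, jobject obj) {
  env->MonitorEnter(obj);
}
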
1978 
1979 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1980   assert(boxReg == rax, "");
1981   assert_different_registers(objReg, boxReg, tmpReg);
1982 
1983   if (EmitSync & 4) {
1984     // Disable - inhibit all inlining.  Force control through the slow-path
1985     cmpptr (rsp, 0);
1986   } else {
1987     Label DONE_LABEL, Stacked, CheckSucc;
1988 
1989     // Critically, the biased locking test must have precedence over
1990     // and appear before the (box->dhw == 0) recursive stack-lock test.
1991     if (UseBiasedLocking && !UseOptoBiasInlining) {
1992        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1993     }
1994 
1995 #if INCLUDE_RTM_OPT
1996     if (UseRTMForStackLocks && use_rtm) {
1997       assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1998       Label L_regular_unlock;
1999       movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
2000       andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2001       cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
2002       jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
2003       xend();                                       // otherwise end...
2004       jmp(DONE_LABEL);                              // ... and we're done
2005       bind(L_regular_unlock);
2006     }


2033     // state in _succ so we can avoid fetching EntryList|cxq.
2034     //
2035     // I'd like to add more cases in fast_lock() and fast_unlock() --
2036     // such as recursive enter and exit -- but we have to be wary of
2037     // I$ bloat, T$ effects and BP$ effects.
2038     //
2039     // If there's no contention try a 1-0 exit.  That is, exit without
2040     // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2041     // we detect and recover from the race that the 1-0 exit admits.
2042     //
2043     // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2044     // before it STs null into _owner, releasing the lock.  Updates
2045     // to data protected by the critical section must be visible before
2046     // we drop the lock (and thus before any other thread could acquire
2047     // the lock and observe the fields protected by the lock).
2048     // IA32's memory-model is SPO, so STs are ordered with respect to
2049     // each other and there's no need for an explicit barrier (fence).
2050     // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2051 #ifndef _LP64
2052     get_thread (boxReg);
2053     if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2054       // prefetchw [ebx + Offset(_owner)-2]
2055       prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2056     }
2057 
2058     // Note that we could employ various encoding schemes to reduce
2059     // the number of loads below (currently 4) to just 2 or 3.
2060     // Refer to the comments in synchronizer.cpp.
2061     // In practice the chain of fetches doesn't seem to impact performance, however.
2062     xorptr(boxReg, boxReg);
2063     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2064        // Attempt to reduce branch density - AMD's branch predictor.
2065        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2066        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2067        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2068        jccb  (Assembler::notZero, DONE_LABEL);
2069        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2070        jmpb  (DONE_LABEL);
2071     } else {
2072        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2073        jccb  (Assembler::notZero, DONE_LABEL);
2074        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2075        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2076        jccb  (Assembler::notZero, CheckSucc);
2077        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2078        jmpb  (DONE_LABEL);
2079     }
2080 
2081     // The following code fragment (EmitSync & 65536) improves the performance of
2082     // contended applications and contended synchronization microbenchmarks.
2083     // Unfortunately the emission of the code - even though not executed - causes regressions
2084     // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2085     // with an equal number of never-executed NOPs results in the same regression.
2086     // We leave it off by default.
2087 
2088     if ((EmitSync & 65536) != 0) {
2089        Label LSuccess, LGoSlowPath ;
2090 
2091        bind  (CheckSucc);
2092 
2093        // Optional pre-test ... it's safe to elide this
2094        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2095        jccb(Assembler::zero, LGoSlowPath);
2096 
2097        // We have a classic Dekker-style idiom:
2098        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2099        // There are a number of ways to implement the barrier:
2100        // (1) lock:andl &m->_owner, 0
2101        //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2102        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2103        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2104        // (2) If supported, an explicit MFENCE is appealing.
2105        //     In older IA32 processors MFENCE is slower than lock:add or xchg
2106        //     particularly if the write-buffer is full as might be the case if
2107        //     stores closely precede the fence or fence-equivalent instruction.
2108        //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2109        //     as the situation has changed with Nehalem and Shanghai.
2110        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2111        //     The $lines underlying the top-of-stack should be in M-state.
2112        //     The locked add instruction is serializing, of course.
2113        // (4) Use xchg, which is serializing
2114        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2115        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2116        //     The integer condition codes will tell us if succ was 0.
2117        //     Since _succ and _owner should reside in the same $line and
2118        //     we just stored into _owner, it's likely that the $line
2119        //     remains in M-state for the lock:orl.
2120        //
2121        // We currently use (3), although it's likely that switching to (2)
2122        // is correct for the future.
2123 
2124        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2125        if (os::is_MP()) {
2126          lock(); addptr(Address(rsp, 0), 0);
2127        }
2128        // Ratify _succ remains non-null
2129        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2130        jccb  (Assembler::notZero, LSuccess);
2131 
2132        xorptr(boxReg, boxReg);                  // box is really EAX
2133        if (os::is_MP()) { lock(); }
2134        cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2135        // There's no successor so we tried to regrab the lock with the
2136        // placeholder value. If that didn't work, then another thread
2137        // grabbed the lock so we're done (and exit was a success).
2138        jccb  (Assembler::notEqual, LSuccess);
2139        // Since we're low on registers we installed rsp as a placeholder in _owner.
2140        // Now install Self over rsp.  This is safe as we're transitioning from
2141        // non-null to non-null.
2142        get_thread (boxReg);
2143        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2144        // Intentional fall-through into LGoSlowPath ...
2145 
2146        bind  (LGoSlowPath);
2147        orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2148        jmpb  (DONE_LABEL);
2149 
2150        bind  (LSuccess);
2151        xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2152        jmpb  (DONE_LABEL);
2153     }
2154 
2155     bind (Stacked);
2156     // It's not inflated and it's not recursively stack-locked and it's not biased.
2157     // It must be stack-locked.
2158     // Try to reset the header to the displaced header.
2159     // The "box" value on the stack is stable, so we can reload
2160     // and be assured we observe the same value as above.
2161     movptr(tmpReg, Address(boxReg, 0));
2162     if (os::is_MP()) {
2163       lock();
2164     }
2165     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2166     // Intentional fall-through into DONE_LABEL
2167 
2168     // DONE_LABEL is a hot target - we'd really like to place it at the
2169     // start of a cache line by padding with NOPs.
2170     // See the AMD and Intel software optimization manuals for the
2171     // most efficient "long" NOP encodings.
2172     // Unfortunately none of our alignment mechanisms suffice.
2173     if ((EmitSync & 65536) == 0) {
2174        bind (CheckSucc);
2175     }
2176 #else // _LP64
2177     // It's inflated
2178     if (EmitSync & 1024) {
2179       // Emit code to check that _owner == Self
2180       // We could fold the _owner test into subsequent code more efficiently
2181       // than using a stand-alone check, but since _owner checking is off by
2182       // default we don't bother. We also might consider predicating the
2183       // _owner==Self check on Xcheck:jni or running on a debug build.
2184       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2185       xorptr(boxReg, r15_thread);
2186     } else {
2187       xorptr(boxReg, boxReg);
2188     }
2189     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2190     jccb  (Assembler::notZero, DONE_LABEL);
2191     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2192     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2193     jccb  (Assembler::notZero, CheckSucc);
2194     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2195     jmpb  (DONE_LABEL);
2196 
2197     if ((EmitSync & 65536) == 0) {
2198       // Try to avoid passing control into the slow_path ...
2199       Label LSuccess, LGoSlowPath ;
2200       bind  (CheckSucc);
2201 
2202       // The following optional optimization can be elided if necessary
2203       // Effectively: if (succ == null) goto SlowPath
2204       // The code reduces the window for a race, however,
2205       // and thus benefits performance.
2206       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2207       jccb  (Assembler::zero, LGoSlowPath);
2208 
2209       xorptr(boxReg, boxReg);
2210       if ((EmitSync & 16) && os::is_MP()) {
2211         xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2212       } else {
2213         movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2214         if (os::is_MP()) {
2215           // Memory barrier/fence
2216           // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2217           // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2218           // This is faster on Nehalem and AMD Shanghai/Barcelona.
2219           // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2220           // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2221           // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2222           lock(); addl(Address(rsp, 0), 0);
2223         }
2224       }
2225       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2226       jccb  (Assembler::notZero, LSuccess);
2227 
2228       // Rare inopportune interleaving - race.
2229       // The successor vanished in the small window above.
2230       // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2231       // We need to ensure progress and succession.
2232       // Try to reacquire the lock.
2233       // If that fails then the new owner is responsible for succession and this
2234       // thread needs to take no further action and can exit via the fast path (success).
2235       // If the re-acquire succeeds then pass control into the slow path.
2236       // As implemented, this latter mode is horrible because we generated more
2237       // coherence traffic on the lock *and* artificially extended the critical section
2238       // length by virtue of passing control into the slow path.
2239 
2240       // box is really RAX -- the following CMPXCHG depends on that binding
2241       // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2242       if (os::is_MP()) { lock(); }
2243       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2244       // There's no successor so we tried to regrab the lock.
2245       // If that didn't work, then another thread grabbed the
2246       // lock so we're done (and exit was a success).
2247       jccb  (Assembler::notEqual, LSuccess);
2248       // Intentional fall-through into slow-path
2249 
2250       bind  (LGoSlowPath);
2251       orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2252       jmpb  (DONE_LABEL);
2253 
2254       bind  (LSuccess);
2255       testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2256       jmpb  (DONE_LABEL);
2257     }
2258 
2259     bind  (Stacked);
2260     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2261     if (os::is_MP()) { lock(); }
2262     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2263 
2264     if (EmitSync & 65536) {
2265        bind (CheckSucc);
2266     }
2267 #endif
2268     bind(DONE_LABEL);
2269   }
2270 }
2271 #endif // COMPILER2
2272 
2273 void MacroAssembler::c2bool(Register x) {
2274   // implements x == 0 ? 0 : 1
2275   // note: must only look at least-significant byte of x
2276   //       since C-style booleans are stored in one byte
2277   //       only! (was bug)
2278   andl(x, 0xFF);
2279   setb(Assembler::notZero, x);
2280 }
2281 
2282 // Wouldn't need if AddressLiteral version had new name
2283 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2284   Assembler::call(L, rtype);
2285 }
2286 
2287 void MacroAssembler::call(Register entry) {
2288   Assembler::call(entry);
2289 }




1704                                Register scrReg, Register cx1Reg, Register cx2Reg,
1705                                BiasedLockingCounters* counters,
1706                                RTMLockingCounters* rtm_counters,
1707                                RTMLockingCounters* stack_rtm_counters,
1708                                Metadata* method_data,
1709                                bool use_rtm, bool profile_rtm) {
1710   // Ensure the register assignments are disjoint
1711   assert(tmpReg == rax, "");
1712 
1713   if (use_rtm) {
1714     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1715   } else {
1716     assert(cx1Reg == noreg, "");
1717     assert(cx2Reg == noreg, "");
1718     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1719   }
1720 
1721   if (counters != NULL) {
1722     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1723   }
1724 





1725   // Possible cases that we'll encounter in fast_lock
1726   // ------------------------------------------------
1727   // * Inflated
1728   //    -- unlocked
1729   //    -- Locked
1730   //       = by self
1731   //       = by other
1732   // * biased
1733   //    -- by Self
1734   //    -- by other
1735   // * neutral
1736   // * stack-locked
1737   //    -- by self
1738   //       = sp-proximity test hits
1739   //       = sp-proximity test generates false-negative
1740   //    -- by other
1741   //
1742 
1743   Label IsInflated, DONE_LABEL;
1744 


1793   bind(IsInflated);
1794   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1795 
1796 #if INCLUDE_RTM_OPT
1797   // Use the same RTM locking code in 32- and 64-bit VM.
1798   if (use_rtm) {
1799     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1800                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1801   } else {
1802 #endif // INCLUDE_RTM_OPT
1803 
1804 #ifndef _LP64
1805   // The object is inflated.
1806 
1807   // boxReg refers to the on-stack BasicLock in the current frame.
1808   // We'd like to write:
1809   //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1810   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1811   // additional latency as we have another ST in the store buffer that must drain.
1812 
1813   // avoid ST-before-CAS










1814   // register juggle because we need tmpReg for cmpxchgptr below
1815   movptr(scrReg, boxReg);
1816   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1817 







1818   // Optimistic form: consider XORL tmpReg,tmpReg
1819   movptr(tmpReg, NULL_WORD);







1820 
1821   // Appears unlocked - try to swing _owner from null to non-null.
1822   // Ideally, I'd manifest "Self" with get_thread and then attempt
1823   // to CAS the register containing Self into m->Owner.
1824   // But we don't have enough registers, so instead we can either try to CAS
1825   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1826   // we later store "Self" into m->Owner.  Transiently storing a stack address
1827   // (rsp or the address of the box) into  m->owner is harmless.
1828   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1829   if (os::is_MP()) {
1830     lock();
1831   }
1832   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1833   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1834   // If we weren't able to swing _owner from NULL to the BasicLock
1835   // then take the slow path.
1836   jccb  (Assembler::notZero, DONE_LABEL);
1837   // update _owner from BasicLock to thread
1838   get_thread (scrReg);                    // beware: clobbers ICCs
1839   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1840   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1841 
1842   // If the CAS fails we can either retry or pass control to the slow-path.
1843   // We use the latter tactic.
1844   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1845   // If the CAS was successful ...
1846   //   Self has acquired the lock
1847   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1848   // Intentional fall-through into DONE_LABEL ...





































1849 #else // _LP64
1850   // It's inflated
1851   movq(scrReg, tmpReg);
1852   xorq(tmpReg, tmpReg);
1853 
1854   if (os::is_MP()) {
1855     lock();
1856   }
1857   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1858   // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1859   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1860   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1861   // Intentional fall-through into DONE_LABEL ...
1862   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1863 #endif // _LP64
1864 #if INCLUDE_RTM_OPT
1865   } // use_rtm()
1866 #endif
1867   // DONE_LABEL is a hot target - we'd really like to place it at the
1868   // start of a cache line by padding with NOPs.
1869   // See the AMD and Intel software optimization manuals for the
1870   // most efficient "long" NOP encodings.
1871   // Unfortunately none of our alignment mechanisms suffice.
1872   bind(DONE_LABEL);
1873 
1874   // At DONE_LABEL the icc ZFlag is set as follows ...
1875   // Fast_Unlock uses the same protocol.
1876   // ZFlag == 1 -> Success
1877   // ZFlag == 0 -> Failure - force control through the slow-path

1878 }
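
For orientation: on 64-bit the inflated fast path above reduces to zeroing RAX and a single lock cmpxchg of the current thread into the monitor's _owner field, with ICC.ZF carrying the result into DONE_LABEL. A minimal portable sketch of that step follows; ObjectMonitorStub and its field are illustrative stand-ins, not the real ObjectMonitor layout.

#include <atomic>

// Illustrative stand-in for the monitor; not the real ObjectMonitor layout.
struct ObjectMonitorStub {
  std::atomic<void*> owner;   // NULL while the monitor is unlocked
};

// Mirrors: xorq(tmpReg, tmpReg); lock(); cmpxchgptr(r15_thread, &m->_owner).
// A true return corresponds to what the emitted code signals with ZF == 1.
inline bool inflated_fast_lock(ObjectMonitorStub* m, void* self) {
  void* expected = nullptr;   // RAX comparand
  return m->owner.compare_exchange_strong(expected, self,
                                          std::memory_order_acquire);
}

The unconditional store of markOopDesc::unused_mark() into box->_displaced_header is omitted from the sketch; it only marks the on-stack BasicLock as not holding a displaced header.
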
1879 
1880 // obj: object to unlock
1881 // box: box address (displaced header location), killed.  Must be EAX.
1882 // tmp: killed, cannot be obj nor box.
1883 //
1884 // Some commentary on balanced locking:
1885 //
1886 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1887 // Methods that don't have provably balanced locking are forced to run in the
1888 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1889 // The interpreter provides two properties:
1890 // I1:  At return-time the interpreter automatically and quietly unlocks any
1891 //      objects acquired in the current activation (frame).  Recall that the
1892 //      interpreter maintains an on-stack list of locks currently held by
1893 //      a frame.
1894 // I2:  If a method attempts to unlock an object that is not held by the
1895 //      frame, the interpreter throws IMSX.
1896 //
1897 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1898 // B() doesn't have provably balanced locking so it runs in the interpreter.
1899 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1900 // is still locked by A().
1901 //
1902 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1903 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1904 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1905 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1906 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1907 // could reasonably *avoid* checking owner in Fast_Unlock().
1908 // In the interest of performance we elide the m->Owner==Self check in unlock.
1909 // A perfectly viable alternative is to elide the owner check except when
1910 // Xcheck:jni is enabled.
1911 
1912 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1913   assert(boxReg == rax, "");
1914   assert_different_registers(objReg, boxReg, tmpReg);
1915 




1916   Label DONE_LABEL, Stacked, CheckSucc;
1917 
1918   // Critically, the biased locking test must have precedence over
1919   // and appear before the (box->dhw == 0) recursive stack-lock test.
1920   if (UseBiasedLocking && !UseOptoBiasInlining) {
1921     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1922   }
1923 
1924 #if INCLUDE_RTM_OPT
1925   if (UseRTMForStackLocks && use_rtm) {
1926     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1927     Label L_regular_unlock;
1928     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
1929     andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1930     cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1931     jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1932     xend();                                       // otherwise end...
1933     jmp(DONE_LABEL);                              // ... and we're done
1934     bind(L_regular_unlock);
1935   }


1962   // state in _succ so we can avoid fetching EntryList|cxq.
1963   //
1964   // I'd like to add more cases in fast_lock() and fast_unlock() --
1965   // such as recursive enter and exit -- but we have to be wary of
1966   // I$ bloat, T$ effects and BP$ effects.
1967   //
1968   // If there's no contention try a 1-0 exit.  That is, exit without
1969   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1970   // we detect and recover from the race that the 1-0 exit admits.
1971   //
1972   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1973   // before it STs null into _owner, releasing the lock.  Updates
1974   // to data protected by the critical section must be visible before
1975   // we drop the lock (and thus before any other thread could acquire
1976   // the lock and observe the fields protected by the lock).
1977   // IA32's memory-model is SPO, so STs are ordered with respect to
1978   // each other and there's no need for an explicit barrier (fence).
1979   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1980 #ifndef _LP64
1981   get_thread (boxReg);




1982 
1983   // Note that we could employ various encoding schemes to reduce
1984   // the number of loads below (currently 4) to just 2 or 3.
1985   // Refer to the comments in synchronizer.cpp.
1986   // In practice the chain of fetches doesn't seem to impact performance, however.
1987   xorptr(boxReg, boxReg);









1988   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1989   jccb  (Assembler::notZero, DONE_LABEL);
1990   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1991   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1992   jccb  (Assembler::notZero, CheckSucc);
1993   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1994   jmpb  (DONE_LABEL);











































































1995 
1996   bind (Stacked);
1997   // It's not inflated and it's not recursively stack-locked and it's not biased.
1998   // It must be stack-locked.
1999   // Try to reset the header to the displaced header.
2000   // The "box" value on the stack is stable, so we can reload
2001   // and be assured we observe the same value as above.
2002   movptr(tmpReg, Address(boxReg, 0));
2003   if (os::is_MP()) {
2004     lock();
2005   }
2006   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2007   // Intentional fall-through into DONE_LABEL
2008 
2009   // DONE_LABEL is a hot target - we'd really like to place it at the
2010   // start of a cache line by padding with NOPs.
2011   // See the AMD and Intel software optimization manuals for the
2012   // most efficient "long" NOP encodings.
2013   // Unfortunately none of our alignment mechanisms suffice.

2014   bind (CheckSucc);

2015 #else // _LP64
2016   // It's inflated









2017   xorptr(boxReg, boxReg);

2018   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2019   jccb  (Assembler::notZero, DONE_LABEL);
2020   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2021   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2022   jccb  (Assembler::notZero, CheckSucc);
2023   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2024   jmpb  (DONE_LABEL);
2025 

2026   // Try to avoid passing control into the slow_path ...
2027   Label LSuccess, LGoSlowPath ;
2028   bind  (CheckSucc);
2029 
2030   // The following optional optimization can be elided if necessary
2031   // Effectively: if (succ == null) goto SlowPath
2032   // The code reduces the window for a race, however,
2033   // and thus benefits performance.
2034   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2035   jccb  (Assembler::zero, LGoSlowPath);
2036 
2037   xorptr(boxReg, boxReg);



2038   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2039   if (os::is_MP()) {
2040     // Memory barrier/fence
2041     // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2042     // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2043     // This is faster on Nehalem and AMD Shanghai/Barcelona.
2044     // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2045     // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2046     // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2047     lock(); addl(Address(rsp, 0), 0);
2048   }

2049   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2050   jccb  (Assembler::notZero, LSuccess);
2051 
2052   // Rare inopportune interleaving - race.
2053   // The successor vanished in the small window above.
2054   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2055   // We need to ensure progress and succession.
2056   // Try to reacquire the lock.
2057   // If that fails then the new owner is responsible for succession and this
2058   // thread needs to take no further action and can exit via the fast path (success).
2059   // If the re-acquire succeeds then pass control into the slow path.
2060   // As implemented, this latter mode is horrible because we generated more
2061   // coherence traffic on the lock *and* artificially extended the critical section
2062   // length by virtue of passing control into the slow path.
2063 
2064   // box is really RAX -- the following CMPXCHG depends on that binding
2065   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2066   if (os::is_MP()) { lock(); }
2067   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2068   // There's no successor so we tried to regrab the lock.
2069   // If that didn't work, then another thread grabbed the
2070   // lock so we're done (and exit was a success).
2071   jccb  (Assembler::notEqual, LSuccess);
2072   // Intentional fall-through into slow-path
2073 
2074   bind  (LGoSlowPath);
2075   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2076   jmpb  (DONE_LABEL);
2077 
2078   bind  (LSuccess);
2079   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2080   jmpb  (DONE_LABEL);

2081 
2082   bind  (Stacked);
2083   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2084   if (os::is_MP()) { lock(); }
2085   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2086 



2087 #endif
2088   bind(DONE_LABEL);

2089 }
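
The 1-0 exit implemented above for an inflated monitor -- release _owner without a CAS when uncontended, otherwise fence, re-check _succ, and re-acquire only if the successor vanished -- can be summarized as follows. This is a sketch under assumed names (ObjectMonitorStub and its fields are illustrative); a false return corresponds to ZF == 0, i.e. control passing into the slow path.

#include <atomic>
#include <cstdint>

// Illustrative stand-in; field names mirror the offsets referenced above.
struct ObjectMonitorStub {
  std::atomic<void*>    owner;
  std::atomic<intptr_t> recursions;
  std::atomic<void*>    cxq;
  std::atomic<void*>    entry_list;   // EntryList
  std::atomic<void*>    succ;
};

// True == successful fast exit (ZF == 1); false == take the slow path.
inline bool inflated_fast_unlock(ObjectMonitorStub* m, void* self) {
  if (m->recursions.load() != 0) return false;              // recursive: slow path
  if (m->cxq.load() == nullptr && m->entry_list.load() == nullptr) {
    m->owner.store(nullptr, std::memory_order_release);     // uncontended 1-0 exit
    return true;
  }
  if (m->succ.load() == nullptr) return false;              // no heir presumptive: slow path
  m->owner.store(nullptr, std::memory_order_release);       // ST _owner = NULL
  std::atomic_thread_fence(std::memory_order_seq_cst);      // lock(); addl(Address(rsp, 0), 0)
  if (m->succ.load() != nullptr) return true;                // successor still present: done
  void* expected = nullptr;                                  // successor vanished: try to
  if (!m->owner.compare_exchange_strong(expected, self)) {   // re-acquire (lock cmpxchg)
    return true;                                             // another thread owns it now: done
  }
  return false;                                              // re-acquired: hand off via slow path
}
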
2090 #endif // COMPILER2
2091 
2092 void MacroAssembler::c2bool(Register x) {
2093   // implements x == 0 ? 0 : 1
2094   // note: must only look at least-significant byte of x
2095   //       since C-style booleans are stored in one byte
2096   //       only! (was bug)
2097   andl(x, 0xFF);
2098   setb(Assembler::notZero, x);
2099 }
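
In C terms, the two instructions above compute the following (a sketch of the semantics, not generated code):

// andl(x, 0xFF) followed by setb(notZero, x) leaves 1 in x if the
// least-significant byte of x was non-zero, and 0 otherwise.
static inline int c2bool_semantics(int x) {
  return (x & 0xFF) != 0 ? 1 : 0;
}
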
2100 
2101 // Wouldn't need if AddressLiteral version had new name
2102 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2103   Assembler::call(L, rtype);
2104 }
2105 
2106 void MacroAssembler::call(Register entry) {
2107   Assembler::call(entry);
2108 }

