  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op.
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(temp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
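
// For reference, the test above isolates the low three lock bits of the mark
// word (biased_lock_mask_in_place == 0b111) and compares them against the
// biased pattern (biased_lock_pattern == 0b101). A minimal C-like sketch:
//
//   markWord m = obj->mark();
//   if ((m.value() & markWord::biased_lock_mask_in_place) ==
//       markWord::biased_lock_pattern) {
//     goto done;   // biased unlock is a no-op
//   }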

#ifdef COMPILER2

// Increment the ObjectMonitor's ref_count for safety or force a branch
// to 'done' with ICC.ZF=0 to indicate failure/take the slow path.
void MacroAssembler::inc_om_ref_count(Register obj_reg, Register om_reg, Register tmp_reg, Label& done) {
  atomic_incl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));

  Label LGoSlowPath;
  if (AsyncDeflateIdleMonitors) {
    // Race here if monitor is not owned! The above ref_count bump
    // will cause subsequent async deflation to skip it. However,
    // previous or concurrent async deflation is a race.

    // First check: if the owner field == DEFLATER_MARKER:
    movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    // DEFLATER_MARKER == reinterpret_cast<void*>(-1), so the compiler
    // does not let us use the define here; compare against -1 directly:
    cmpptr(tmp_reg, -1);
    // If marked for async deflation, then take the slow path. This is a
    // simpler check than what ObjectMonitorHandle::save_om_ptr() does,
    // so ObjectMonitor::install_displaced_markword_in_object() doesn't
    // have to be implemented in macro assembler.
    jccb(Assembler::equal, LGoSlowPath);

    // Second check: if ref_count field <= 0:
    movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
    cmpptr(tmp_reg, 0);
    // If async deflation is in the process of bailing out, but has not
    // yet restored the ref_count field, then we take the slow path. We
    // want a stable ref_count value for the fast path.
    jccb(Assembler::lessEqual, LGoSlowPath);

    // Final check: if object field == obj_reg:
    cmpptr(obj_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(object)));
    // If the ObjectMonitor has been deflated and recycled, then take
    // the slow path.
    jccb(Assembler::notEqual, LGoSlowPath);
  }

  Label LRetToCaller;
  // We leave the ref_count incremented to protect the caller's code
  // paths against async deflation.
  jmpb(LRetToCaller);

  bind(LGoSlowPath);
  lock();
  decrementl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
  // Jump to 'done' with ICC.ZF=0 to indicate failure/take the slow path.
  orl(tmp_reg, 1);
  jmp(done);

  bind(LRetToCaller);
}
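
// The protocol emitted above, as a C-like sketch (illustrative only; the
// names mirror the ObjectMonitor fields used above):
//
//   Atomic::inc(&om->ref_count);
//   if (AsyncDeflateIdleMonitors &&
//       (om->owner == DEFLATER_MARKER ||   // marked for async deflation
//        om->ref_count <= 0 ||             // deflater bailing out
//        om->object != obj)) {             // deflated and recycled
//     Atomic::dec(&om->ref_count);
//     goto done;                           // with ICC.ZF == 0 (failure)
//   }
//   // Success: ref_count stays elevated; the caller must decrement it.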

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}
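
// For reference, the abort status tested bit-by-bit above is the EAX value
// Intel TSX reports after an abort:
//   bit 0: abort caused by an XABORT instruction
//   bit 1: the transaction may succeed on a retry
//   bit 2: conflict with a memory access by another logical processor
//   bit 3: an internal buffer overflowed
//   bit 4: a debug breakpoint was hit
//   bit 5: the abort occurred during a nested transaction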

  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                          Register scrReg, Register retry_on_busy_count_Reg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort, L_local_done;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  if (!HandshakeAfterDeflateIdleMonitors) {
    // Increment the ObjectMonitor's ref_count for safety or force the
    // enter slow path via DONE_LABEL.
    // In rtm_inflated_locking(), initially tmpReg contains the object's
    // mark word which, in this case, is the (ObjectMonitor* | monitor_value).
    // Also this code uses scrReg as its temporary register.
    inc_om_ref_count(objReg, tmpReg /* om_reg */, scrReg /* tmp_reg */, DONE_LABEL);
  }

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, L_local_done);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  } else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // On success we are done; otherwise retry.
    jccb(Assembler::equal, L_local_done);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }

  // rtm_inflated_locking() exit paths come here except for a failed
  // inc_om_ref_count() which goes directly to DONE_LABEL.
  bind(L_local_done);
  if (!HandshakeAfterDeflateIdleMonitors) {
    pushf(); // Preserve flags.
    // Decrement the ObjectMonitor's ref_count.
    lock();
    decrementl(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
    popf(); // Restore flags so we have the proper ICC.ZF value.
  }

  jmp(DONE_LABEL);
}
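
// Shape of the fast path above as C-like pseudocode (illustrative only,
// eliding profiling and the ref_count protocol):
//
//   busy = abort = RTMRetryCount;
// retry:
//   if (xbegin() started) {
//     if (om->owner == NULL) goto done;            // lock elided transactionally
//     UseRTMXendForLockBusy ? xend() : xabort(0);  // owner busy: leave the txn
//   }
//   if (abort-- > 0 && abort_status_is_retryable) goto retry;
//   if (om->owner == NULL && CAS(&om->owner, NULL, Self)) goto done;
//   if (busy-- > 0) { spin; goto retry; }
//   // else fall through to the slow path with ICC.ZF == 0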

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This would avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node and ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.

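// A caller-side sketch of the ZF contract (illustrative only; the real
// consumers are C2's cmpFastLock/cmpFastUnlock nodes, not this hypothetical
// stub, and the argument list below is abbreviated):
//
//   fast_lock(obj, box, rax, scr, ...);   // leaves ICC.ZF = 1 on success
//   jcc(Assembler::notZero, slow_path);   // ZF == 0 -> runtime monitorenter
//   // fall through: lock acquired on the fast path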

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                               Register scrReg, Register cx1Reg, Register cx2Reg,
                               BiasedLockingCounters* counters,
                               RTMLockingCounters* rtm_counters,
                               RTMLockingCounters* stack_rtm_counters,
                               Metadata* method_data,
                               bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb(Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread(scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL.
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);

  if (!HandshakeAfterDeflateIdleMonitors) {
    // Increment the ObjectMonitor's ref_count for safety or force the
    // enter slow path via DONE_LABEL.
    // In fast_lock(), scrReg contains the object's mark word which,
    // in this case, is the (ObjectMonitor* | monitor_value). Also this
    // code uses tmpReg as its temporary register.
    inc_om_ref_count(objReg, scrReg /* om_reg */, tmpReg /* tmp_reg */, DONE_LABEL);
  }

  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.

  if (!HandshakeAfterDeflateIdleMonitors) {
    pushf(); // Preserve flags.
    // Decrement the ObjectMonitor's ref_count.
    lock();
    decrementl(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
    popf(); // Restore flags so we have the proper ICC.ZF value.
  }
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired in the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't say what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec leaves the JNI case undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

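// For instance, a synchronized block compiled by javac is always balanced;
// a sketch of the bytecode shape emitted for "synchronized (o) { body(); }",
// where every normal and exceptional path unlocks exactly once:
//
//       aload o; dup; astore tmp; monitorenter
//       ... body ...
//       aload tmp; monitorexit; goto done
//   handler (catches any exception thrown in the body):
//       aload tmp; monitorexit; athrow
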
void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc(Assembler::zero, DONE_LABEL);                                 // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jcc(Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
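  //
  // The 1-0 exit as C-like pseudocode (illustrative only; the field names
  // mirror the ObjectMonitor accesses emitted below):
  //
  //   if (m->_recursions != 0) goto slow_path;   // recursive exit: punt
  //   if ((m->_cxq | m->_EntryList) == 0) {      // no contention
  //     m->_owner = NULL;                        // 1-0 exit: plain release store
  //     goto done;                               // ZF == 1 -> success
  //   }
  //   goto check_succ;                           // contended: maybe wake _succ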
#ifndef _LP64
  get_thread(boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);

  bind(Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to the displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(CheckSucc);
#else // _LP64
  // It's inflated

  if (!HandshakeAfterDeflateIdleMonitors) {
    // Increment the ObjectMonitor's ref_count for safety or force the
    // exit slow path via DONE_LABEL.
    // In fast_unlock(), tmpReg contains the object's mark word which,
    // in this case, is the (ObjectMonitor* | monitor_value). Also this
    // code uses boxReg as its temporary register.
    inc_om_ref_count(objReg, tmpReg /* om_reg */, boxReg /* tmp_reg */, DONE_LABEL);
  }

  // Try to avoid passing control into the slow path ...
  Label LSuccess, LGoSlowPath;
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, LGoSlowPath);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb(Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb(LSuccess);

  bind(CheckSucc);

  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb(Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ).
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb(Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There was no successor, so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb(Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind(LGoSlowPath);
  if (!HandshakeAfterDeflateIdleMonitors) {
    lock();
    decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
  }
  orl(boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);

  bind(LSuccess);
  if (!HandshakeAfterDeflateIdleMonitors) {
    lock();
    decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
  }
  testl(boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb(DONE_LABEL);

  bind(Stacked);
  movptr(tmpReg, Address(boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}
#endif // COMPILER2

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // Note: we must look only at the least-significant byte of x,
  // since C-style booleans are stored in one byte only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
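
// Worked example (hypothetical values): for x == 0x0100 the low byte is
// zero, so the andl sets ZF and setb stores 0; for x == 0x0001 the low
// byte is non-zero and setb stores 1.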