
src/hotspot/cpu/x86/macroAssembler_x86.cpp

rev 56635 : v2.00 -> v2.05 (CR5/v2.05/8-for-jdk13) patches combined into one; merge with 8229212.patch; merge with jdk-14+11; merge with 8230184.patch; merge with 8230876.patch; merge with jdk-14+15; merge with jdk-14+18.
rev 56639 : loosen a couple more counter checks due to races observed in testing; simplify om_release() extraction of mid since list head or cur_mid_in_use is marked; simplify deflate_monitor_list() extraction of mid since there are no parallel deleters due to the safepoint; simplify deflate_monitor_list_using_JT() extraction of mid since list head or cur_mid_in_use is marked; prepend_block_to_lists() - simplify based on David H's comments; does not need load_acquire() or release_store() because of the cmpxchg(); prepend_to_common() - simplify to use mark_next_loop() for m and use mark_list_head() and release_store() for the non-empty list case; add more debugging for "Non-balanced monitor enter/exit" failure mode; fix race in inflate() in the "CASE: neutral" code path; install_displaced_markword_in_object() does not need to clear the header field since that is handled when the ObjectMonitor is moved from the global free list; LSuccess should clear boxReg to set ICC.ZF=1 to avoid depending on existing boxReg contents; update fast_unlock() to detect when object no longer refers to the same ObjectMonitor and take fast path exit instead; clarify fast_lock() code where we detect when object no longer refers to the same ObjectMonitor; add/update comments for movptr() calls where we move a literal into an Address; remove set_owner(); refactor setting of owner field into set_owner_from(2 versions), set_owner_from_BasicLock(), and try_set_owner_from(); the new functions include monitorinflation+owner logging; extract debug code from v2.06 and v2.07 and move to v2.07.debug; change 'jccb' -> 'jcc' and 'jmpb' -> 'jmp' as needed; checkpoint initial version of MacroAssembler::inc_om_ref_count(); update LP64 MacroAssembler::fast_lock() and fast_unlock() to use inc_om_ref_count(); fast_lock() return flag setting logic can use 'testptr(tmpReg, tmpReg)' instead of 'cmpptr(tmpReg, 0)' since that's more efficient; fast_unlock() LSuccess return flag setting logic can use 'testl (boxReg, 0)' instead of 'xorptr(boxReg, boxReg)' since that's more efficient; cleanup "fast-path" vs "fast path" and "slow-path" vs "slow path"; update MacroAssembler::rtm_inflated_locking() to use inc_om_ref_count(); update MacroAssembler::fast_lock() to preserve the flags before decrementing ref_count and restore the flags afterwards; this is more clean than depending on the contents of rax/tmpReg; coleenp CR - refactor async monitor deflation work from ServiceThread::service_thread_entry() to ObjectSynchronizer::deflate_idle_monitors_using_JT(); rehn,eosterlund CR - add support for HandshakeAfterDeflateIdleMonitors for platforms that don't have ObjectMonitor ref_count support implemented in C2 fast_lock() and fast_unlock().
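As background for the owner-field refactor listed above (set_owner_from(), set_owner_from_BasicLock(), try_set_owner_from()), here is a minimal C++ sketch of what a try_set_owner_from()-style helper boils down to; the struct and std::atomic usage are illustrative stand-ins, not the real ObjectMonitor code.

    #include <atomic>

    struct Monitor {                        // illustrative stand-in for ObjectMonitor
      std::atomic<void*> owner{nullptr};    // stand-in for the _owner field

      // Try to change owner from old_value to new_value; return the value that
      // was actually observed so the caller can tell whether the CAS succeeded.
      void* try_set_owner_from(void* old_value, void* new_value) {
        void* observed = old_value;
        owner.compare_exchange_strong(observed, new_value);
        return observed;                    // equals old_value on success
      }
    };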


1279   return null_check_offset;
1280 }
1281 
1282 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1283   assert(UseBiasedLocking, "why call this otherwise?");
1284 
1285   // Check for biased locking unlock case, which is a no-op
1286   // Note: we do not have to check the thread ID for two reasons.
1287   // First, the interpreter checks for IllegalMonitorStateException at
1288   // a higher level. Second, if the bias was revoked while we held the
1289   // lock, the object could not be rebiased toward another thread, so
1290   // the bias bit would be clear.
1291   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1292   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1293   cmpptr(temp_reg, markWord::biased_lock_pattern);
1294   jcc(Assembler::equal, done);
1295 }
1296 
1297 #ifdef COMPILER2
1298 
1299 #if INCLUDE_RTM_OPT
1300 
1301 // Update rtm_counters based on abort status
1302 // input: abort_status
1303 //        rtm_counters (RTMLockingCounters*)
1304 // flags are killed
1305 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1306 
1307   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1308   if (PrintPreciseRTMLockingStatistics) {
1309     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1310       Label check_abort;
1311       testl(abort_status, (1<<i));
1312       jccb(Assembler::equal, check_abort);
1313       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1314       bind(check_abort);
1315     }
1316   }
1317 }
1318 


1512   bind(L_decrement_retry);
1513   if (RTMRetryCount > 0) {
1514     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1515     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1516   }
1517 }
1518 
1519 // Use RTM for inflating locks
1520 // inputs: objReg (object to lock)
1521 //         boxReg (on-stack box address (displaced header location) - KILLED)
1522 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
1523 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1524                                           Register scrReg, Register retry_on_busy_count_Reg,
1525                                           Register retry_on_abort_count_Reg,
1526                                           RTMLockingCounters* rtm_counters,
1527                                           Metadata* method_data, bool profile_rtm,
1528                                           Label& DONE_LABEL) {
1529   assert(UseRTMLocking, "why call this otherwise?");
1530   assert(tmpReg == rax, "");
1531   assert(scrReg == rdx, "");
1532   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1533   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1534 
1535   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1536   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1537   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1538 
1539   if (RTMRetryCount > 0) {
1540     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1541     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1542     bind(L_rtm_retry);
1543   }
1544   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1545     Label L_noincrement;
1546     if (RTMTotalCountIncrRate > 1) {
1547       // tmpReg, scrReg and flags are killed
1548       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1549     }
1550     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1551     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1552     bind(L_noincrement);
1553   }
1554   xbegin(L_on_abort);
1555   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1556   movptr(tmpReg, Address(tmpReg, owner_offset));
1557   testptr(tmpReg, tmpReg);
1558   jcc(Assembler::zero, DONE_LABEL);
1559   if (UseRTMXendForLockBusy) {
1560     xend();
1561     jmp(L_decrement_retry);
1562   }
1563   else {
1564     xabort(0);
1565   }
1566   bind(L_on_abort);
1567   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1568   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1569     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1570   }
1571   if (RTMRetryCount > 0) {
1572     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1573     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1574   }
1575 
1576   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1577   testptr(tmpReg, tmpReg) ;
1578   jccb(Assembler::notZero, L_decrement_retry) ;
1579 
1580   // Appears unlocked - try to swing _owner from null to non-null.
1581   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1582 #ifdef _LP64
1583   Register threadReg = r15_thread;
1584 #else
1585   get_thread(scrReg);
1586   Register threadReg = scrReg;
1587 #endif
1588   lock();
1589   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1590 
1591   if (RTMRetryCount > 0) {
1592     // success done else retry
1593     jccb(Assembler::equal, DONE_LABEL) ;
1594     bind(L_decrement_retry);
1595     // Spin and retry if lock is busy.
1596     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1597   }
1598   else {
1599     bind(L_decrement_retry);
1600   }
1601 }
1602 
1603 #endif //  INCLUDE_RTM_OPT
1604 
1605 // Fast_Lock and Fast_Unlock used by C2
1606 
1607 // Because the transitions from emitted code to the runtime
1608 // monitorenter/exit helper stubs are so slow it's critical that
1609 // we inline both the stack-locking fast-path and the inflated fast path.
1610 //
1611 // See also: cmpFastLock and cmpFastUnlock.
1612 //
1613 // What follows is a specialized inline transliteration of the code
1614 // in enter() and exit(). If we're concerned about I$ bloat another
1615 // option would be to emit TrySlowEnter and TrySlowExit methods
1616 // at startup-time.  These methods would accept arguments as
1617 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1618 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1619 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1620 // In practice, however, the # of lock sites is bounded and is usually small.
1621 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1622 // if the processor uses simple bimodal branch predictors keyed by EIP,
1623 // since the helper routines would be called from multiple synchronization
1624 // sites.
1625 //
1626 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1627 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1628 // to those specialized methods.  That'd give us a mostly platform-independent
1629 // implementation that the JITs could optimize and inline at their pleasure.
1630 // Done correctly, the only time we'd need to cross to native code would be
1631 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1632 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1633 // (b) issue explicit barriers or fence operations.
1634 //
1635 // TODO:
1636 //
1637 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1638 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1639 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1640 //    the lock operators would typically be faster than reifying Self.
1641 //
1642 // *  Ideally I'd define the primitives as:
1643 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1644 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1645 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1646 //    Instead, we're stuck with rather awkward and brittle register assignments below.
1647 //    Furthermore the register assignments are overconstrained, possibly resulting in
1648 //    sub-optimal code near the synchronization site.
1649 //
1650 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1651 //    Alternately, use a better sp-proximity test.
1652 //
1653 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1654 //    Either one is sufficient to uniquely identify a thread.
1655 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1656 //
1657 // *  Intrinsify notify() and notifyAll() for the common cases where the
1658 //    object is locked by the calling thread but the waitlist is empty.
1659 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1660 //
1661 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1662 //    But beware of excessive branch density on AMD Opterons.
1663 //
1664 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1665 //    or failure of the fast-path.  If the fast-path fails then we pass
1666 //    control to the slow-path, typically in C.  In Fast_Lock and
1667 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1668 //    will emit a conditional branch immediately after the node.
1669 //    So we have branches to branches and lots of ICC.ZF games.
1670 //    Instead, it might be better to have C2 pass a "FailureLabel"
1671 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1672 //    will drop through the node.  ICC.ZF is undefined at exit.
1673 //    In the case of failure, the node will branch directly to the
1674 //    FailureLabel
1675 
1676 
1677 // obj: object to lock
1678 // box: on-stack box address (displaced header location) - KILLED
1679 // rax,: tmp -- KILLED
1680 // scr: tmp -- KILLED
1681 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1682                                Register scrReg, Register cx1Reg, Register cx2Reg,
1683                                BiasedLockingCounters* counters,
1684                                RTMLockingCounters* rtm_counters,
1685                                RTMLockingCounters* stack_rtm_counters,
1686                                Metadata* method_data,
1687                                bool use_rtm, bool profile_rtm) {
1688   // Ensure the register assignments are disjoint
1689   assert(tmpReg == rax, "");
1690 
1691   if (use_rtm) {


1796 
1797   // Appears unlocked - try to swing _owner from null to non-null.
1798   // Ideally, I'd manifest "Self" with get_thread and then attempt
1799   // to CAS the register containing Self into m->Owner.
1800   // But we don't have enough registers, so instead we can either try to CAS
1801   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1802   // we later store "Self" into m->Owner.  Transiently storing a stack address
1803   // (rsp or the address of the box) into  m->owner is harmless.
1804   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1805   lock();
1806   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1807   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1808   // If we weren't able to swing _owner from NULL to the BasicLock
1809   // then take the slow path.
1810   jccb  (Assembler::notZero, DONE_LABEL);
1811   // update _owner from BasicLock to thread
1812   get_thread (scrReg);                    // beware: clobbers ICCs
1813   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1814   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1815 
1816   // If the CAS fails we can either retry or pass control to the slow-path.
1817   // We use the latter tactic.
1818   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1819   // If the CAS was successful ...
1820   //   Self has acquired the lock
1821   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1822   // Intentional fall-through into DONE_LABEL ...
1823 #else // _LP64
1824   // It's inflated
1825   movq(scrReg, tmpReg);
1826   xorq(tmpReg, tmpReg);
1827 
1828   lock();
1829   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1830   // Unconditionally set box->_displaced_header = markWord::unused_mark().
1831   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1832   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1833   // The following code to verify that the object field still refers
1834   // to the object we are trying to lock is not needed with safepoint
1835   // based deflation. It is also not needed with async deflation when
1836   // the DEFLATER_MARKER is allowed to linger in the owner field in an
1837   // async deflated ObjectMonitor until replaced by the next owner value.
1838   // We keep this code as a sanity check against bugs in other parts
1839   // of the async deflation mechanism.
1840   //
1841   // If we weren't able to swing _owner from NULL to r15_thread
1842   // then take the slow path.
1843   jccb(Assembler::notZero, DONE_LABEL);
1844   // r15_thread is now the owner so verify that the ObjectMonitor
1845   // still refers to the same object.
1846   cmpptr(objReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(object)));
1847   // The ObjectMonitor still refers to the same object so
1848   // r15_thread's ownership is valid.
1849   jccb(Assembler::zero, DONE_LABEL);
1850   // The ObjectMonitor does not refer to the same object so
1851   // drop ownership.
1852   movptr(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1853   // Intentional fall-through into DONE_LABEL ...
1854   // Propagate ICC.ZF from cmpptr() above into DONE_LABEL.
1855 #endif // _LP64
1856 #if INCLUDE_RTM_OPT
1857   } // use_rtm()
1858 #endif
1859   // DONE_LABEL is a hot target - we'd really like to place it at the
1860   // start of cache line by padding with NOPs.
1861   // See the AMD and Intel software optimization manuals for the
1862   // most efficient "long" NOP encodings.
1863   // Unfortunately none of our alignment mechanisms suffice.
1864   bind(DONE_LABEL);
1865 
1866   // At DONE_LABEL the icc ZFlag is set as follows ...
1867   // Fast_Unlock uses the same protocol.
1868   // ZFlag == 1 -> Success
1869   // ZFlag == 0 -> Failure - force control through the slow-path
1870 }
1871 
1872 // obj: object to unlock
1873 // box: box address (displaced header location), killed.  Must be EAX.
1874 // tmp: killed, cannot be obj nor box.
1875 //
1876 // Some commentary on balanced locking:
1877 //
1878 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1879 // Methods that don't have provably balanced locking are forced to run in the
1880 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1881 // The interpreter provides two properties:
1882 // I1:  At return-time the interpreter automatically and quietly unlocks any
1883 //      objects acquired by the current activation (frame).  Recall that the
1884 //      interpreter maintains an on-stack list of locks currently held by
1885 //      a frame.
1886 // I2:  If a method attempts to unlock an object that is not held by the
1887 //      frame, the interpreter throws IMSX.
1888 //
1889 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1890 // B() doesn't have provably balanced locking so it runs in the interpreter.
1891 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1892 // is still locked by A().
1893 //
1894 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1895 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1896 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1897 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1898 // Arguably given that the spec legislates the JNI case as undefined our implementation
1899 // could reasonably *avoid* checking owner in Fast_Unlock().
1900 // In the interest of performance we elide m->Owner==Self check in unlock.
1901 // A perfectly viable alternative is to elide the owner check except when
1902 // Xcheck:jni is enabled.
1903 
1904 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1905   assert(boxReg == rax, "");
1906   assert_different_registers(objReg, boxReg, tmpReg);
1907 
1908   Label DONE_LABEL, Stacked, CheckSucc;
1909 
1910   // Critically, the biased locking test must have precedence over
1911   // and appear before the (box->dhw == 0) recursive stack-lock test.
1912   if (UseBiasedLocking && !UseOptoBiasInlining) {
1913     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1914   }
1915 
1916 #if INCLUDE_RTM_OPT
1917   if (UseRTMForStackLocks && use_rtm) {
1918     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1919     Label L_regular_unlock;
1920     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1921     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
1922     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
1923     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
1924     xend();                                                           // otherwise end...
1925     jmp(DONE_LABEL);                                                  // ... and we're done
1926     bind(L_regular_unlock);
1927   }
1928 #endif
1929 
1930   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
1931   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
1932   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
1933   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
1934   jccb  (Assembler::zero, Stacked);
1935 
1936   // It's inflated.
1937 #if INCLUDE_RTM_OPT
1938   if (use_rtm) {
1939     Label L_regular_inflated_unlock;
1940     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1941     movptr(boxReg, Address(tmpReg, owner_offset));
1942     testptr(boxReg, boxReg);
1943     jccb(Assembler::notZero, L_regular_inflated_unlock);
1944     xend();
1945     jmpb(DONE_LABEL);
1946     bind(L_regular_inflated_unlock);
1947   }
1948 #endif
1949 
1950   // Despite our balanced locking property we still check that m->_owner == Self
1951   // as java routines or native JNI code called by this thread might
1952   // have released the lock.
1953   // Refer to the comments in synchronizer.cpp for how we might encode extra
1954   // state in _succ so we can avoid fetching EntryList|cxq.
1955   //
1956   // I'd like to add more cases in fast_lock() and fast_unlock() --
1957   // such as recursive enter and exit -- but we have to be wary of
1958   // I$ bloat, T$ effects and BP$ effects.
1959   //
1960   // If there's no contention try a 1-0 exit.  That is, exit without
1961   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1962   // we detect and recover from the race that the 1-0 exit admits.
1963   //
1964   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1965   // before it STs null into _owner, releasing the lock.  Updates
1966   // to data protected by the critical section must be visible before
1967   // we drop the lock (and thus before any other thread could acquire
1968   // the lock and observe the fields protected by the lock).
1969   // IA32's memory-model is SPO, so STs are ordered with respect to
1970   // each other and there's no need for an explicit barrier (fence).
1971   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1972 #ifndef _LP64
1973   get_thread (boxReg);
1974 
1975   // Note that we could employ various encoding schemes to reduce
1976   // the number of loads below (currently 4) to just 2 or 3.
1977   // Refer to the comments in synchronizer.cpp.
1978   // In practice the chain of fetches doesn't seem to impact performance, however.
1979   xorptr(boxReg, boxReg);
1980   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1981   jccb  (Assembler::notZero, DONE_LABEL);
1982   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1983   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1984   jccb  (Assembler::notZero, CheckSucc);


1987 
1988   bind (Stacked);
1989   // It's not inflated and it's not recursively stack-locked and it's not biased.
1990   // It must be stack-locked.
1991   // Try to reset the header to displaced header.
1992   // The "box" value on the stack is stable, so we can reload
1993   // and be assured we observe the same value as above.
1994   movptr(tmpReg, Address(boxReg, 0));
1995   lock();
1996   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
1997   // Intentional fall-through into DONE_LABEL
1998 
1999   // DONE_LABEL is a hot target - we'd really like to place it at the
2000   // start of cache line by padding with NOPs.
2001   // See the AMD and Intel software optimization manuals for the
2002   // most efficient "long" NOP encodings.
2003   // Unfortunately none of our alignment mechanisms suffice.
2004   bind (CheckSucc);
2005 #else // _LP64
2006   // It's inflated
2007   xorptr(boxReg, boxReg);
2008   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2009   jccb  (Assembler::notZero, DONE_LABEL);
2010   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2011   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2012   jccb  (Assembler::notZero, CheckSucc);

2013   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2014   jmpb  (DONE_LABEL);
2015 
2016   // Try to avoid passing control into the slow_path ...
2017   Label LSuccess, LGoSlowPath ;
2018   bind  (CheckSucc);
2019 
2020   // The following optional optimization can be elided if necessary
2021   // Effectively: if (succ == null) goto SlowPath
2022   // The code reduces the window for a race, however,
2023   // and thus benefits performance.
2024   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2025   jccb  (Assembler::zero, LGoSlowPath);
2026 
2027   xorptr(boxReg, boxReg);

2028   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2029 
2030   // Memory barrier/fence
2031   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2032   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2033   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2034   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2035   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2036   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2037   lock(); addl(Address(rsp, 0), 0);
2038 
2039   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2040   jccb  (Assembler::notZero, LSuccess);
2041 
2042   // Rare inopportune interleaving - race.
2043   // The successor vanished in the small window above.
2044   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2045   // We need to ensure progress and succession.
2046   // Try to reacquire the lock.
2047   // If that fails then the new owner is responsible for succession and this
2048   // thread needs to take no further action and can exit via the fast path (success).
2049   // If the re-acquire succeeds then pass control into the slow path.
2050   // As implemented, this latter mode is horrible because we generated more
2051   // coherence traffic on the lock *and* artificially extended the critical section
2052   // length by virtue of passing control into the slow path.
2053 
2054   // box is really RAX -- the following CMPXCHG depends on that binding
2055   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2056   lock();
2057   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2058   // There's no successor so we tried to regrab the lock.
2059   // If that didn't work, then another thread grabbed the
2060   // lock so we're done (and exit was a success).
2061   jccb  (Assembler::notEqual, LSuccess);
2062   // Intentional fall-through into slow-path
2063 
2064   bind  (LGoSlowPath);




2065   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2066   jmpb  (DONE_LABEL);
2067 
2068   bind  (LSuccess);




2069   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2070   jmpb  (DONE_LABEL);
2071 
2072   bind  (Stacked);
2073   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2074   lock();
2075   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2076 
2077 #endif
2078   bind(DONE_LABEL);
2079 }
2080 #endif // COMPILER2
2081 
2082 void MacroAssembler::c2bool(Register x) {
2083   // implements x == 0 ? 0 : 1
2084   // note: must only look at least-significant byte of x
2085   //       since C-style booleans are stored in one byte
2086   //       only! (was bug)
2087   andl(x, 0xFF);
2088   setb(Assembler::notZero, x);




1279   return null_check_offset;
1280 }
1281 
1282 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1283   assert(UseBiasedLocking, "why call this otherwise?");
1284 
1285   // Check for biased locking unlock case, which is a no-op
1286   // Note: we do not have to check the thread ID for two reasons.
1287   // First, the interpreter checks for IllegalMonitorStateException at
1288   // a higher level. Second, if the bias was revoked while we held the
1289   // lock, the object could not be rebiased toward another thread, so
1290   // the bias bit would be clear.
1291   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1292   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1293   cmpptr(temp_reg, markWord::biased_lock_pattern);
1294   jcc(Assembler::equal, done);
1295 }
1296 
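The andptr/cmpptr pair above implements the usual mark-word bit test; a compact C++ restatement follows, using local constants that mirror markWord::biased_lock_mask_in_place and markWord::biased_lock_pattern (illustrative only).

    #include <cstdint>

    // Local mirrors of the markWord constants (low three lock bits; 0b101 = biased).
    constexpr uintptr_t biased_lock_mask_in_place = 0x7;
    constexpr uintptr_t biased_lock_pattern       = 0x5;

    inline bool has_biased_pattern(uintptr_t mark) {
      // Same test as the emitted andptr/cmpptr/jcc sequence.
      return (mark & biased_lock_mask_in_place) == biased_lock_pattern;
    }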
1297 #ifdef COMPILER2
1298 
1299 // Increment the ObjectMonitor's ref_count for safety or force a branch
1300 // to 'done' with ICC.ZF=0 to indicate failure/take the slow path.
1301 void MacroAssembler::inc_om_ref_count(Register obj_reg, Register om_reg, Register tmp_reg, Label& done) {
1302   atomic_incl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
1303 
1304   Label LGoSlowPath;
1305   if (AsyncDeflateIdleMonitors) {
1306     // Race here if monitor is not owned! The above ref_count bump
1307     // will cause subsequent async deflation to skip it. However,
1308     // previous or concurrent async deflation is a race.
1309 
1310     // First check: if the owner field == DEFLATER_MARKER:
1311     movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1312     // DEFLATER_MARKER == reinterpret_cast<void*>(-1) so the compiler
1313     // doesn't like to use the define here:
1314     cmpptr(tmp_reg, -1);
1315     // If marked for async deflation, then take the slow path. This is a
1316     // simpler check than what ObjectMonitorHandle::save_om_ptr() does
1317     // so ObjectMonitor::install_displaced_markword_in_object() doesn't
1318     // have to be implemented in macro assembler.
1319     jccb(Assembler::equal, LGoSlowPath);
1320 
1321     // Second check: if ref_count field <= 0:
1322     movptr(tmp_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
1323     cmpptr(tmp_reg, 0);
1324     // If async deflation is in the process of bailing out, but has not
1325     // yet restored the ref_count field, then we take the slow path. We
1326     // want a stable ref_count value for the fast path.
1327     jccb(Assembler::lessEqual, LGoSlowPath);
1328 
1329     // Final check: if object field == obj_reg:
1330     cmpptr(obj_reg, Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(object)));
1331     // If the ObjectMonitor has been deflated and recycled, then take
1332     // the slow path.
1333     jccb(Assembler::notEqual, LGoSlowPath);
1334   }
1335 
1336   Label LRetToCaller;
1337   // We leave the ref_count incremented to protect the caller's code
1338   // paths against async deflation.
1339   jmpb(LRetToCaller);
1340 
1341   bind(LGoSlowPath);
1342   lock();
1343   decrementl(Address(om_reg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
1344   // Jump to 'done' with ICC.ZF=0 to indicate failure/take the slow path.
1345   orl(tmp_reg, 1);
1346   jmp(done);
1347 
1348   bind(LRetToCaller);
1349 }
1350 
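At the C++ level, the guard emitted by inc_om_ref_count() amounts to roughly the following; the struct is a simplified stand-in for ObjectMonitor, and DEFLATER_MARKER matches the reinterpret_cast<void*>(-1) value mentioned in the comments above.

    #include <atomic>

    static void* const DEFLATER_MARKER = reinterpret_cast<void*>(-1);

    struct Monitor {                          // simplified stand-in for ObjectMonitor
      std::atomic<void*> owner{nullptr};
      std::atomic<int>   ref_count{0};
      std::atomic<void*> object{nullptr};
    };

    // Returns true if the fast path may proceed with ref_count held; otherwise
    // undoes the increment so the caller falls back to the slow path.
    bool guard_against_async_deflation(Monitor* m, void* obj) {
      m->ref_count.fetch_add(1);                     // protect from async deflation
      if (m->owner.load() == DEFLATER_MARKER ||      // marked for async deflation
          m->ref_count.load() <= 0 ||                // deflater still backing out
          m->object.load() != obj) {                 // monitor deflated and recycled
        m->ref_count.fetch_sub(1);                   // undo; take the slow path
        return false;
      }
      return true;
    }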
1351 #if INCLUDE_RTM_OPT
1352 
1353 // Update rtm_counters based on abort status
1354 // input: abort_status
1355 //        rtm_counters (RTMLockingCounters*)
1356 // flags are killed
1357 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1358 
1359   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1360   if (PrintPreciseRTMLockingStatistics) {
1361     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1362       Label check_abort;
1363       testl(abort_status, (1<<i));
1364       jccb(Assembler::equal, check_abort);
1365       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1366       bind(check_abort);
1367     }
1368   }
1369 }
1370 
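Functionally, the loop above performs the per-bit counter update sketched below in plain C++, with a simplified stand-in for RTMLockingCounters (the real counters are addressed via abort_count_offset()/abortX_count_offset()).

    #include <atomic>
    #include <cstdint>

    constexpr int ABORT_STATUS_LIMIT = 6;      // stand-in for RTMLockingCounters::ABORT_STATUS_LIMIT

    struct Counters {                          // simplified stand-in for RTMLockingCounters
      std::atomic<uintptr_t> abort_count{0};
      std::atomic<uintptr_t> abortX_count[ABORT_STATUS_LIMIT]{};
    };

    void counters_update(uint32_t abort_status, Counters* c, bool precise) {
      c->abort_count.fetch_add(1);
      if (precise) {                           // PrintPreciseRTMLockingStatistics case
        for (int i = 0; i < ABORT_STATUS_LIMIT; i++) {
          if (abort_status & (1u << i)) {      // same bit test as testl/jccb above
            c->abortX_count[i].fetch_add(1);
          }
        }
      }
    }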


1564   bind(L_decrement_retry);
1565   if (RTMRetryCount > 0) {
1566     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1567     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1568   }
1569 }
1570 
1571 // Use RTM for inflating locks
1572 // inputs: objReg (object to lock)
1573 //         boxReg (on-stack box address (displaced header location) - KILLED)
1574 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
1575 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1576                                           Register scrReg, Register retry_on_busy_count_Reg,
1577                                           Register retry_on_abort_count_Reg,
1578                                           RTMLockingCounters* rtm_counters,
1579                                           Metadata* method_data, bool profile_rtm,
1580                                           Label& DONE_LABEL) {
1581   assert(UseRTMLocking, "why call this otherwise?");
1582   assert(tmpReg == rax, "");
1583   assert(scrReg == rdx, "");
1584   Label L_rtm_retry, L_decrement_retry, L_on_abort, L_local_done;
1585   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1586 
1587   if (!HandshakeAfterDeflateIdleMonitors) {
1588     // Increment the ObjectMonitor's ref_count for safety or force the
1589     // enter slow path via DONE_LABEL.
1590     // In rtm_inflated_locking(), initially tmpReg contains the object's
1591     // mark word which, in this case, is the (ObjectMonitor* | monitor_value).
1592     // Also this code uses scrReg as its temporary register.
1593     inc_om_ref_count(objReg, tmpReg /* om_reg */, scrReg /* tmp_reg */, DONE_LABEL);
1594   }
1595 
1596   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
1597   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1598   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1599 
1600   if (RTMRetryCount > 0) {
1601     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1602     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1603     bind(L_rtm_retry);
1604   }
1605   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1606     Label L_noincrement;
1607     if (RTMTotalCountIncrRate > 1) {
1608       // tmpReg, scrReg and flags are killed
1609       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1610     }
1611     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1612     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1613     bind(L_noincrement);
1614   }
1615   xbegin(L_on_abort);
1616   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1617   movptr(tmpReg, Address(tmpReg, owner_offset));
1618   testptr(tmpReg, tmpReg);
1619   jcc(Assembler::zero, L_local_done);
1620   if (UseRTMXendForLockBusy) {
1621     xend();
1622     jmp(L_decrement_retry);
1623   }
1624   else {
1625     xabort(0);
1626   }
1627   bind(L_on_abort);
1628   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1629   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1630     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1631   }
1632   if (RTMRetryCount > 0) {
1633     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1634     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1635   }
1636 
1637   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1638   testptr(tmpReg, tmpReg) ;
1639   jccb(Assembler::notZero, L_decrement_retry) ;
1640 
1641   // Appears unlocked - try to swing _owner from null to non-null.
1642   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1643 #ifdef _LP64
1644   Register threadReg = r15_thread;
1645 #else
1646   get_thread(scrReg);
1647   Register threadReg = scrReg;
1648 #endif
1649   lock();
1650   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1651 
1652   if (RTMRetryCount > 0) {
1653     // success done else retry
1654     jccb(Assembler::equal, L_local_done);
1655     bind(L_decrement_retry);
1656     // Spin and retry if lock is busy.
1657     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1658   }
1659   else {
1660     bind(L_decrement_retry);
1661   }
1662 
1663   // rtm_inflated_locking() exit paths come here except for a failed
1664   // inc_om_ref_count() which goes directly to DONE_LABEL.
1665   bind(L_local_done);
1666   if (!HandshakeAfterDeflateIdleMonitors) {
1667     pushf();  // Preserve flags.
1668     // Decrement the ObjectMonitor's ref_count.
1669     lock();
1670     decrementl(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
1671     popf();  // Restore flags so we have the proper ICC.ZF value.
1672   }
1673 
1674   jmp(DONE_LABEL) ;
1675 }
1676 
1677 #endif //  INCLUDE_RTM_OPT
1678 
1679 // fast_lock and fast_unlock used by C2
1680 
1681 // Because the transitions from emitted code to the runtime
1682 // monitorenter/exit helper stubs are so slow it's critical that
1683 // we inline both the stack-locking fast path and the inflated fast path.
1684 //
1685 // See also: cmpFastLock and cmpFastUnlock.
1686 //
1687 // What follows is a specialized inline transliteration of the code
1688 // in enter() and exit(). If we're concerned about I$ bloat another
1689 // option would be to emit TrySlowEnter and TrySlowExit methods
1690 // at startup-time.  These methods would accept arguments as
1691 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1692 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
1693 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1694 // In practice, however, the # of lock sites is bounded and is usually small.
1695 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1696 // if the processor uses simple bimodal branch predictors keyed by EIP,
1697 // since the helper routines would be called from multiple synchronization
1698 // sites.
1699 //
1700 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1701 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1702 // to those specialized methods.  That'd give us a mostly platform-independent
1703 // implementation that the JITs could optimize and inline at their pleasure.
1704 // Done correctly, the only time we'd need to cross to native code would be
1705 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1706 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1707 // (b) issue explicit barriers or fence operations.
1708 //
1709 // TODO:
1710 //
1711 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
1712 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
1713 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1714 //    the lock operators would typically be faster than reifying Self.
1715 //
1716 // *  Ideally I'd define the primitives as:
1717 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1718 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1719 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1720 //    Instead, we're stuck with rather awkward and brittle register assignments below.
1721 //    Furthermore the register assignments are overconstrained, possibly resulting in
1722 //    sub-optimal code near the synchronization site.
1723 //
1724 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1725 //    Alternately, use a better sp-proximity test.
1726 //
1727 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1728 //    Either one is sufficient to uniquely identify a thread.
1729 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1730 //
1731 // *  Intrinsify notify() and notifyAll() for the common cases where the
1732 //    object is locked by the calling thread but the waitlist is empty.
1733 //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1734 //
1735 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1736 //    But beware of excessive branch density on AMD Opterons.
1737 //
1738 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
1739 //    or failure of the fast path.  If the fast path fails then we pass
1740 //    control to the slow path, typically in C.  In fast_lock and
1741 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
1742 //    will emit a conditional branch immediately after the node.
1743 //    So we have branches to branches and lots of ICC.ZF games.
1744 //    Instead, it might be better to have C2 pass a "FailureLabel"
1745 //    into fast_lock and fast_unlock.  In the case of success, control
1746 //    will drop through the node.  ICC.ZF is undefined at exit.
1747 //    In the case of failure, the node will branch directly to the
1748 //    FailureLabel
1749 
1750 
1751 // obj: object to lock
1752 // box: on-stack box address (displaced header location) - KILLED
1753 // rax,: tmp -- KILLED
1754 // scr: tmp -- KILLED
1755 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1756                                Register scrReg, Register cx1Reg, Register cx2Reg,
1757                                BiasedLockingCounters* counters,
1758                                RTMLockingCounters* rtm_counters,
1759                                RTMLockingCounters* stack_rtm_counters,
1760                                Metadata* method_data,
1761                                bool use_rtm, bool profile_rtm) {
1762   // Ensure the register assignments are disjoint
1763   assert(tmpReg == rax, "");
1764 
1765   if (use_rtm) {


1870 
1871   // Appears unlocked - try to swing _owner from null to non-null.
1872   // Ideally, I'd manifest "Self" with get_thread and then attempt
1873   // to CAS the register containing Self into m->Owner.
1874   // But we don't have enough registers, so instead we can either try to CAS
1875   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1876   // we later store "Self" into m->Owner.  Transiently storing a stack address
1877   // (rsp or the address of the box) into  m->owner is harmless.
1878   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1879   lock();
1880   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1881   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1882   // If we weren't able to swing _owner from NULL to the BasicLock
1883   // then take the slow path.
1884   jccb  (Assembler::notZero, DONE_LABEL);
1885   // update _owner from BasicLock to thread
1886   get_thread (scrReg);                    // beware: clobbers ICCs
1887   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1888   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1889 
1890   // If the CAS fails we can either retry or pass control to the slow path.
1891   // We use the latter tactic.
1892   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1893   // If the CAS was successful ...
1894   //   Self has acquired the lock
1895   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1896   // Intentional fall-through into DONE_LABEL ...
1897 #else // _LP64
1898   // It's inflated and we use scrReg for ObjectMonitor* in this section.
1899   movq(scrReg, tmpReg);

1900 
1901   if (!HandshakeAfterDeflateIdleMonitors) {
1902     // Increment the ObjectMonitor's ref_count for safety or force the
1903     // enter slow path via DONE_LABEL.
1904     // In fast_lock(), scrReg contains the object's mark word which,
1905     // in this case, is the (ObjectMonitor* | monitor_value). Also this
1906     // code uses tmpReg as its temporary register.
1907     inc_om_ref_count(objReg, scrReg /* om_reg */, tmpReg /* tmp_reg */, DONE_LABEL);
1908   }
1909 
1910   xorq(tmpReg, tmpReg);
1911   lock();
1912   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1913   // Unconditionally set box->_displaced_header = markWord::unused_mark().
1914   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
1915   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1916   // Intentional fall-through into DONE_LABEL ...
1917   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1918 
1919   if (!HandshakeAfterDeflateIdleMonitors) {
1920     pushf();  // Preserve flags.
1921     // Decrement the ObjectMonitor's ref_count.
1922     lock();
1923     decrementl(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
1924     popf();  // Restore flags so we have the proper ICC.ZF value.
1925   }
1926 #endif // _LP64
1927 #if INCLUDE_RTM_OPT
1928   } // use_rtm()
1929 #endif
1930   // DONE_LABEL is a hot target - we'd really like to place it at the
1931   // start of cache line by padding with NOPs.
1932   // See the AMD and Intel software optimization manuals for the
1933   // most efficient "long" NOP encodings.
1934   // Unfortunately none of our alignment mechanisms suffice.
1935   bind(DONE_LABEL);
1936 
1937   // At DONE_LABEL the icc ZFlag is set as follows ...
1938   // fast_unlock uses the same protocol.
1939   // ZFlag == 1 -> Success
1940   // ZFlag == 0 -> Failure - force control through the slow path
1941 }
1942 
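The ZF protocol noted at DONE_LABEL is what the C2 cmpFastLock node hands back to the compiled lock site; conceptually the surrounding code behaves like the small sketch below, where enter_slow_path() is a hypothetical stand-in for the runtime monitorenter call.

    #include <cstdio>

    // Hypothetical stand-in for the runtime monitorenter helper reached when
    // fast_lock leaves ZF == 0.
    static void enter_slow_path() { std::puts("monitorenter slow path"); }

    // Conceptual shape of the lock site:
    //   ZF == 1 -> lock acquired on the fast path, fall through;
    //   ZF == 0 -> branch to the slow path helper.
    static void lock_site(bool zf_after_fast_lock) {
      if (!zf_after_fast_lock) {
        enter_slow_path();
      }
      // ... critical section ...
    }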
1943 // obj: object to unlock
1944 // box: box address (displaced header location), killed.  Must be EAX.
1945 // tmp: killed, cannot be obj nor box.
1946 //
1947 // Some commentary on balanced locking:
1948 //
1949 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
1950 // Methods that don't have provably balanced locking are forced to run in the
1951 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1952 // The interpreter provides two properties:
1953 // I1:  At return-time the interpreter automatically and quietly unlocks any
1954 //      objects acquired by the current activation (frame).  Recall that the
1955 //      interpreter maintains an on-stack list of locks currently held by
1956 //      a frame.
1957 // I2:  If a method attempts to unlock an object that is not held by the
1958 //      frame, the interpreter throws IMSX.
1959 //
1960 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1961 // B() doesn't have provably balanced locking so it runs in the interpreter.
1962 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1963 // is still locked by A().
1964 //
1965 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1966 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1967 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1968 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1969 // Arguably given that the spec legislates the JNI case as undefined our implementation
1970 // could reasonably *avoid* checking owner in fast_unlock().
1971 // In the interest of performance we elide m->Owner==Self check in unlock.
1972 // A perfectly viable alternative is to elide the owner check except when
1973 // Xcheck:jni is enabled.
1974 
1975 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1976   assert(boxReg == rax, "");
1977   assert_different_registers(objReg, boxReg, tmpReg);
1978 
1979   Label DONE_LABEL, Stacked, CheckSucc;
1980 
1981   // Critically, the biased locking test must have precedence over
1982   // and appear before the (box->dhw == 0) recursive stack-lock test.
1983   if (UseBiasedLocking && !UseOptoBiasInlining) {
1984     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1985   }
1986 
1987 #if INCLUDE_RTM_OPT
1988   if (UseRTMForStackLocks && use_rtm) {
1989     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1990     Label L_regular_unlock;
1991     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1992     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
1993     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
1994     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
1995     xend();                                                           // otherwise end...
1996     jmp(DONE_LABEL);                                                  // ... and we're done
1997     bind(L_regular_unlock);
1998   }
1999 #endif
2000 
2001   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
2002   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
2003   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
2004   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
2005   jcc  (Assembler::zero, Stacked);
2006 
2007   // It's inflated.
2008 #if INCLUDE_RTM_OPT
2009   if (use_rtm) {
2010     Label L_regular_inflated_unlock;
2011     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
2012     movptr(boxReg, Address(tmpReg, owner_offset));
2013     testptr(boxReg, boxReg);
2014     jccb(Assembler::notZero, L_regular_inflated_unlock);
2015     xend();
2016     jmp(DONE_LABEL);
2017     bind(L_regular_inflated_unlock);
2018   }
2019 #endif
2020 
2021   // Despite our balanced locking property we still check that m->_owner == Self
2022   // as java routines or native JNI code called by this thread might
2023   // have released the lock.
2024   // Refer to the comments in synchronizer.cpp for how we might encode extra
2025   // state in _succ so we can avoid fetching EntryList|cxq.
2026   //
2027   // I'd like to add more cases in fast_lock() and fast_unlock() --
2028   // such as recursive enter and exit -- but we have to be wary of
2029   // I$ bloat, T$ effects and BP$ effects.
2030   //
2031   // If there's no contention try a 1-0 exit.  That is, exit without
2032   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2033   // we detect and recover from the race that the 1-0 exit admits.
2034   //
2035   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
2036   // before it STs null into _owner, releasing the lock.  Updates
2037   // to data protected by the critical section must be visible before
2038   // we drop the lock (and thus before any other thread could acquire
2039   // the lock and observe the fields protected by the lock).
2040   // IA32's memory-model is SPO, so STs are ordered with respect to
2041   // each other and there's no need for an explicit barrier (fence).
2042   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2043 #ifndef _LP64
2044   get_thread (boxReg);
2045 
2046   // Note that we could employ various encoding schemes to reduce
2047   // the number of loads below (currently 4) to just 2 or 3.
2048   // Refer to the comments in synchronizer.cpp.
2049   // In practice the chain of fetches doesn't seem to impact performance, however.
2050   xorptr(boxReg, boxReg);
2051   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2052   jccb  (Assembler::notZero, DONE_LABEL);
2053   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2054   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2055   jccb  (Assembler::notZero, CheckSucc);


2058 
2059   bind (Stacked);
2060   // It's not inflated and it's not recursively stack-locked and it's not biased.
2061   // It must be stack-locked.
2062   // Try to reset the header to displaced header.
2063   // The "box" value on the stack is stable, so we can reload
2064   // and be assured we observe the same value as above.
2065   movptr(tmpReg, Address(boxReg, 0));
2066   lock();
2067   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2068   // Intentional fall-through into DONE_LABEL
2069 
2070   // DONE_LABEL is a hot target - we'd really like to place it at the
2071   // start of cache line by padding with NOPs.
2072   // See the AMD and Intel software optimization manuals for the
2073   // most efficient "long" NOP encodings.
2074   // Unfortunately none of our alignment mechanisms suffice.
2075   bind (CheckSucc);
2076 #else // _LP64
2077   // It's inflated
2078 
2079   if (!HandshakeAfterDeflateIdleMonitors) {
2080     // Increment the ObjectMonitor's ref_count for safety or force the
2081     // exit slow path via DONE_LABEL.
2082     // In fast_unlock(), tmpReg contains the object's mark word which,
2083     // in this case, is the (ObjectMonitor* | monitor_value). Also this
2084     // code uses boxReg as its temporary register.
2085     inc_om_ref_count(objReg, tmpReg /* om_reg */, boxReg /* tmp_reg */, DONE_LABEL);
2086   }
2087 
2088   // Try to avoid passing control into the slow path ...
2089   Label LSuccess, LGoSlowPath;
2090   xorptr(boxReg, boxReg);
2091   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2092   jccb(Assembler::notZero, LGoSlowPath);
2093   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2094   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2095   jccb  (Assembler::notZero, CheckSucc);
2096   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
2097   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2098   jmpb(LSuccess);
2099 


2100   bind  (CheckSucc);
2101 
2102   // The following optional optimization can be elided if necessary
2103   // Effectively: if (succ == null) goto slow path
2104   // The code reduces the window for a race, however,
2105   // and thus benefits performance.
2106   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2107   jccb  (Assembler::zero, LGoSlowPath);
2108 
2109   xorptr(boxReg, boxReg);
2110   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
2111   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2112 
2113   // Memory barrier/fence
2114   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2115   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2116   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2117   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2118   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2119   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2120   lock(); addl(Address(rsp, 0), 0);
2121 
2122   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2123   jccb  (Assembler::notZero, LSuccess);
2124 
2125   // Rare inopportune interleaving - race.
2126   // The successor vanished in the small window above.
2127   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2128   // We need to ensure progress and succession.
2129   // Try to reacquire the lock.
2130   // If that fails then the new owner is responsible for succession and this
2131   // thread needs to take no further action and can exit via the fast path (success).
2132   // If the re-acquire succeeds then pass control into the slow path.
2133   // As implemented, this latter mode is horrible because we generated more
2134   // coherence traffic on the lock *and* artificially extended the critical section
2135   // length by virtue of passing control into the slow path.
2136 
2137   // box is really RAX -- the following CMPXCHG depends on that binding
2138   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2139   lock();
2140   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2141   // There's no successor so we tried to regrab the lock.
2142   // If that didn't work, then another thread grabbed the
2143   // lock so we're done (and exit was a success).
2144   jccb  (Assembler::notEqual, LSuccess);
2145   // Intentional fall-through into slow path
2146 
2147   bind  (LGoSlowPath);
2148   if (!HandshakeAfterDeflateIdleMonitors) {
2149     lock();
2150     decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
2151   }
2152   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2153   jmpb  (DONE_LABEL);
2154 
2155   bind  (LSuccess);
2156   if (!HandshakeAfterDeflateIdleMonitors) {
2157     lock();
2158     decrementl(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(ref_count)));
2159   }
2160   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2161   jmpb  (DONE_LABEL);
2162 
2163   bind  (Stacked);
2164   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2165   lock();
2166   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2167 
2168 #endif
2169   bind(DONE_LABEL);
2170 }
2171 #endif // COMPILER2
2172 
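The comments in fast_unlock() above describe the 1-0 exit and its succession recovery; the following is a compact C++ sketch of that protocol, using std::atomic fields as simplified stand-ins for the ObjectMonitor state (not the real ObjectMonitor::exit).

    #include <atomic>
    #include <cstdint>

    struct Monitor {                            // simplified stand-in for ObjectMonitor
      std::atomic<void*>    owner{nullptr};
      std::atomic<void*>    succ{nullptr};
      std::atomic<intptr_t> waiters{0};         // stands in for (cxq | EntryList) != null
    };

    // 1-0 exit: release the lock without a CAS, then recover if the lock is
    // contended but no successor is visible.
    void exit_1_0(Monitor* m, void* self) {
      m->owner.store(nullptr, std::memory_order_release);   // ST owner = null
      std::atomic_thread_fence(std::memory_order_seq_cst);  // Dekker pivot: ST; MEMBAR; LD
      if (m->waiters.load() == 0 || m->succ.load() != nullptr) {
        return;                                 // uncontended, or a successor exists
      }
      // Rare race: contended but the successor vanished. Try to re-acquire; if
      // another thread grabbed the lock it now owns succession and we are done,
      // otherwise we would fall into the slow path to wake a successor.
      void* expected = nullptr;
      if (m->owner.compare_exchange_strong(expected, self)) {
        // re-acquired: slow path wakes a successor here
      }
    }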
2173 void MacroAssembler::c2bool(Register x) {
2174   // implements x == 0 ? 0 : 1
2175   // note: must only look at least-significant byte of x
2176   //       since C-style booleans are stored in one byte
2177   //       only! (was bug)
2178   andl(x, 0xFF);
2179   setb(Assembler::notZero, x);

