--- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-17 11:39:15.000000000 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-17 11:39:15.000000000 -0700 @@ -301,7 +301,7 @@ mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); } -void MacroAssembler::movptr(Register dst, AddressLiteral src) { +void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { if (src.is_lval()) { mov_literal32(dst, (intptr_t)src.target(), src.rspec()); } else { @@ -613,6 +613,15 @@ /* else */ { subq(dst, value) ; return; } } +void MacroAssembler::incrementq(AddressLiteral dst) { + if (reachable(dst)) { + incrementq(as_Address(dst)); + } else { + lea(rscratch1, dst); + incrementq(Address(rscratch1, 0)); + } +} + void MacroAssembler::incrementq(Register reg, int value) { if (value == min_jint) { addq(reg, value); return; } if (value < 0) { decrementq(reg, -value); return; } @@ -681,15 +690,15 @@ movq(dst, rscratch1); } -void MacroAssembler::movptr(Register dst, AddressLiteral src) { +void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) { if (src.is_lval()) { mov_literal64(dst, (intptr_t)src.target(), src.rspec()); } else { if (reachable(src)) { movq(dst, as_Address(src)); } else { - lea(rscratch1, src); - movq(dst, Address(rscratch1,0)); + lea(scratch, src); + movq(dst, Address(scratch,0)); } } } @@ -988,20 +997,37 @@ LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); } -void MacroAssembler::atomic_incl(AddressLiteral counter_addr) { - pushf(); +void MacroAssembler::atomic_incl(Address counter_addr) { + if (os::is_MP()) + lock(); + incrementl(counter_addr); +} + +void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) { if (reachable(counter_addr)) { - if (os::is_MP()) - lock(); - incrementl(as_Address(counter_addr)); + atomic_incl(as_Address(counter_addr)); } else { - lea(rscratch1, counter_addr); - if (os::is_MP()) - lock(); - incrementl(Address(rscratch1, 0)); + lea(scr, counter_addr); + atomic_incl(Address(scr, 0)); + } +} + +#ifdef _LP64 +void MacroAssembler::atomic_incq(Address counter_addr) { + if (os::is_MP()) + lock(); + incrementq(counter_addr); +} + +void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) { + if (reachable(counter_addr)) { + atomic_incq(as_Address(counter_addr)); + } else { + lea(scr, counter_addr); + atomic_incq(Address(scr, 0)); } - popf(); } +#endif // Writes to stack successive pages until offset reached to check for // stack overflow + shadow pages. This clobbers tmp. 
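The hunks above thread an explicit scratch register through movptr(), atomic_incl() and the new atomic_incq() because an AddressLiteral is not always reachable with a RIP-relative displacement in 64-bit code. Below is a minimal standalone sketch of that reachability idea, assuming a 64-bit build; fits_in_disp32() and the sample addresses are illustrative stand-ins, not HotSpot's reachable() implementation.

// Standalone analogue (not HotSpot code): an x86-64 RIP-relative operand only
// encodes a signed 32-bit displacement, so a target outside that range must be
// materialized in a scratch register (lea scr, target) and addressed as [scr].
#include <cstdint>
#include <cstdio>

static bool fits_in_disp32(uintptr_t code_pos, uintptr_t target) {
  intptr_t disp = (intptr_t)target - (intptr_t)code_pos;  // displacement from the emitting instruction
  return disp == (intptr_t)(int32_t)disp;                 // representable in 32 signed bits?
}

int main() {
  uintptr_t code = 0x00007f0000000000ULL;                 // hypothetical code-cache address
  printf("near target reachable: %d\n", fits_in_disp32(code, code + 0x10000));
  printf("far target reachable:  %d\n", fits_in_disp32(code, 0x0000000000402000ULL));
  return 0;
}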
@@ -1274,6 +1300,156 @@ } #ifdef COMPILER2 + +#if INCLUDE_RTM_OPT + +// Update rtmcounters based on abort status +// input: tmpReg (abort status) +// scrReg (RTMLockingCounters*) +// flag register as scratch +void MacroAssembler::rtmcounters_update(Register tmpReg, Register scrReg) { + + atomic_incptr(Address(scrReg, RTMLockingCounters::abort_count_offset())); + if (PrintPreciseRTMLockingStatistics) { + for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { + Label check_abort; + testl(tmpReg, (1<<i)); + jccb(Assembler::equal, check_abort); + atomic_incptr(Address(scrReg, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); + bind(check_abort); + } + } +} + +// Perform abort ratio calculation, set dontelide bit and rtm_state +// input: scrReg (RTMLockingCounters* address) +// output: boxReg (=1 if dont elide, =0 if elide) when isStackLock +// tmpReg, scrReg, flags as scratch +void MacroAssembler::rtm_abortratio_calculation(Register boxReg, Register tmpReg, Register scrReg, + RTMLockingCounters* rtmcounters, + Metadata* method_data, + bool isStackLock) { + Label L_done, L_check_always_rtm1, L_check_always_rtm2; + + if (RTMLockingCalculationDelay > 0) { + // Delay calculation + movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag()), tmpReg); + testptr(tmpReg, tmpReg); + jccb(Assembler::equal, L_done); + } + // Abort ratio calculation only if abort_count > RTMAbortThreshold + // Aborted transactions = abort_count * 100 + // All transactions = total_count * RTMTotalCountIncrRate + // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) + + movptr(tmpReg, Address(scrReg, RTMLockingCounters::abort_count_offset())); + cmpptr(tmpReg, RTMAbortThreshold); + jccb(Assembler::below, L_check_always_rtm2); + imulptr(tmpReg, tmpReg, 100); + movptr(scrReg, Address(scrReg, RTMLockingCounters::total_count_offset())); + imulptr(scrReg, scrReg, RTMTotalCountIncrRate); + imulptr(scrReg, scrReg, RTMAbortRatio); + cmpptr(tmpReg, scrReg); + jccb(Assembler::below, L_check_always_rtm1); + if (isStackLock) { + increment(boxReg); // = 1 + } + if (method_data != NULL) { + // set rtm_state to "no rtm" in method oop + mov_metadata(tmpReg, method_data); + if (os::is_MP()) { + lock(); + } + orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); + } + jmpb(L_done); + bind(L_check_always_rtm1); + // Reload RTMLockingCounters* address + lea(scrReg, ExternalAddress((address)rtmcounters)); + bind(L_check_always_rtm2); + movptr(tmpReg, Address(scrReg, RTMLockingCounters::total_count_offset())); + cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); + jccb(Assembler::below, L_done); + if (method_data != NULL) { + // set rtm_state to "always rtm" in method oop + mov_metadata(tmpReg, method_data); + if (os::is_MP()) { + lock(); + } + orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); + } + bind(L_done); +} + +// Retry on lock abort if abort status is 0x2 +// inputs: boxReg (monitor address), countReg (retry count), +// : tmpReg(abort status) +// output: tmpReg set to boxReg, countReg decremented by 1 +// flags as scratch +void MacroAssembler::rtm_retry_lockabort(Register countReg, Register boxReg, Register tmpReg, Label& retryLabel, bool isStackLock) { + Label doneRetry; + + assert(tmpReg == rax, ""); + // The abort reason bits are in eax (see all states in rtmLocking.hpp) + // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) + // if reason is in 0x6 and retry count != 0 then retry + andptr(tmpReg, 0x6); + jccb(Assembler::zero, doneRetry); + testl(countReg, countReg); + jccb(Assembler::zero, doneRetry); + pause(); + decrementl(countReg); + if (!isStackLock) { + movptr(tmpReg, boxReg); + } + jmp(retryLabel); + bind(doneRetry); +} + +// Spin and retry if lock is busy, +// inputs: boxReg (monitor address), countReg (retry count) +// output: tmpReg set to boxReg, countReg decremented by 1 +// : clear z flags if retry count exceeded +// scrReg as scratch +void MacroAssembler::rtm_retry_lockbusy(Register countReg, Register boxReg, Register tmpReg, Register scrReg, Label& retryLabel) { + Label SpinLoop, SpinExit, doneRetry; + + testl(countReg, countReg); +
jccb(Assembler::zero, doneRetry); + decrementl(countReg); + movptr(scrReg, RTMSpinLoopCount); + + bind(SpinLoop); + pause(); + decrementl(scrReg); + jccb(Assembler::lessEqual, SpinExit); + movptr(tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + testptr(tmpReg, tmpReg) ; + jccb(Assembler::notZero, SpinLoop) ; + + bind(SpinExit); + movptr(tmpReg, boxReg); + jmp(retryLabel); + bind(doneRetry); + incrementl(countReg); // clear z flag +} + +#endif // INCLUDE_RTM_OPT + // Fast_Lock and Fast_Unlock used by C2 // Because the transitions from emitted code to the runtime @@ -1350,17 +1526,26 @@ // box: on-stack box address (displaced header location) - KILLED // rax,: tmp -- KILLED // scr: tmp -- KILLED -void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) { +void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, + Register scrReg, Register cx1Reg, Register cx2Reg, + BiasedLockingCounters* counters, + RTMLockingCounters* rtmcounters, + RTMLockingCounters* stackrtmcounters, + Metadata* method_data, + bool use_rtm, bool profile_rtm) { // Ensure the register assignents are disjoint - guarantee (objReg != boxReg, ""); - guarantee (objReg != tmpReg, ""); - guarantee (objReg != scrReg, ""); - guarantee (boxReg != tmpReg, ""); - guarantee (boxReg != scrReg, ""); - guarantee (tmpReg == rax, ""); + assert(tmpReg == rax, ""); + + if (use_rtm) { + assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); + } else { + assert(cx1Reg == noreg, ""); + assert(cx2Reg == noreg, ""); + assert_different_registers(objReg, boxReg, tmpReg, scrReg); + } if (counters != NULL) { - atomic_incl(ExternalAddress((address)counters->total_entry_count_addr())); + atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg); } if (EmitSync & 1) { // set box->dhw = unused_mark (3) @@ -1419,8 +1604,84 @@ biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters); } +#if INCLUDE_RTM_OPT + if (UseRTMForStackLocks && use_rtm) { + assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); + Label L_rtm_retry, L_decrement_retry, L_on_abort; + + if (RTMRetryCount > 0) { + movl(cx2Reg, RTMRetryCount); // Retry on abort + bind(L_rtm_retry); + } + if (!UseRTMXendForLockBusy) { + movptr(tmpReg, Address(objReg, 0)); + testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased + jcc(Assembler::notZero, IsInflated); + } + if (PrintPreciseRTMLockingStatistics || profile_rtm) { + Label L_noincrement; + if (RTMTotalCountIncrRate > 1) { + // tmpReg, scrReg and flags as scratch + branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement); + } + assert(stackrtmcounters != NULL, "should not be NULL when profiling RTM"); + atomic_incptr(ExternalAddress((address)stackrtmcounters->total_count_addr()), scrReg); + bind(L_noincrement); + } + xbegin(L_on_abort); + movptr(tmpReg, Address(objReg, 0)); // fetch markword + andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits + cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked + jcc(Assembler::equal, DONE_LABEL); // all done if unlocked + if (UseRTMXendForLockBusy) { + xend(); + movptr(tmpReg, Address(objReg, 0)); + testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased + jcc(Assembler::notZero, IsInflated); + movptr(tmpReg,0x1); // Set the transaction status in rax 
(tmpReg) + jmp(L_decrement_retry); + } + else { + xabort(0); + } + bind(L_on_abort); + if (PrintPreciseRTMLockingStatistics || profile_rtm) { + assert(stackrtmcounters != NULL, "should not be NULL when profiling RTM"); + // update rtm counters based on rax value at abort + // reads tmpReg(rax), updates flags + lea(scrReg, ExternalAddress((address)stackrtmcounters)); + rtmcounters_update(tmpReg, scrReg); + } + if (profile_rtm) { + if (RTMRetryCount > 0) { + // Save abort status + push(tmpReg); + } + // Perform abort ratio calculation, set dontelide bit and rtm_state + // input: scrReg (stackrtmcounters address) + // output: cx1Reg (=1 if dont elide, =0 if elide) + // tmpReg, scrReg, flags as scratch + assert(stackrtmcounters != NULL, "should not be NULL when profiling RTM"); + rtm_abortratio_calculation(cx1Reg, tmpReg, scrReg, stackrtmcounters, method_data, true); + + // restore abort status + if (RTMRetryCount > 0) { + pop(tmpReg); + } + } + bind(L_decrement_retry); + if (RTMRetryCount > 0) { + // retry on lock abort if abort status is one of 0xD + // inputs: cx2Reg (retry count), + // : tmpReg(abort status) + // output: cx2Reg decremented by 1 + rtm_retry_lockabort(cx2Reg, noreg, tmpReg, L_rtm_retry, true); + } + } +#endif // INCLUDE_RTM_OPT + movptr(tmpReg, Address(objReg, 0)); // [FETCH] - testl (tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased + testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased jccb (Assembler::notZero, IsInflated); // Attempt stack-locking ... @@ -1434,7 +1695,7 @@ cond_inc32(Assembler::equal, ExternalAddress((address)counters->fast_path_entry_count_addr())); } - jccb(Assembler::equal, DONE_LABEL); + jcc(Assembler::equal, DONE_LABEL); // Recursive locking subptr(tmpReg, rsp); @@ -1444,9 +1705,113 @@ cond_inc32(Assembler::equal, ExternalAddress((address)counters->fast_path_entry_count_addr())); } - jmpb(DONE_LABEL); + jmp(DONE_LABEL); bind(IsInflated); + // The object is inflated. + +#if INCLUDE_RTM_OPT + // Use the same RTM locking code in 32- and 64-bit VM. 
+ if (use_rtm) { + Label L_rtm_retry, L_decrement_retry, L_on_abort; + + // Without cast to int32_t a movptr will destroy r10 which is typically obj + movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); + movptr(boxReg, tmpReg); + + if (RTMRetryCount > 0) { + movl(cx1Reg, RTMRetryCount); // Retry on lock busy + movl(cx2Reg, RTMRetryCount); // Retry on abort + bind(L_rtm_retry); + } + if (PrintPreciseRTMLockingStatistics || profile_rtm) { + Label L_noincrement; + if (RTMTotalCountIncrRate > 1) { + // tmpReg, scrReg and flags as scratch + branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement); + } + assert(rtmcounters != NULL, "should not be NULL when profiling RTM"); + atomic_incptr(ExternalAddress((address)rtmcounters->total_count_addr()), scrReg); + bind(L_noincrement); + } + xbegin(L_on_abort); + movptr(tmpReg, Address(objReg,0)); + movptr(tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); + testptr(tmpReg, tmpReg) ; + jcc(Assembler::zero, DONE_LABEL); + if (UseRTMXendForLockBusy) { + xend(); + jmp(L_decrement_retry); + } + else { + xabort(0); + } + bind(L_on_abort); + if (PrintPreciseRTMLockingStatistics || profile_rtm) { + assert(rtmcounters != NULL, "should not be NULL when profiling RTM"); + // update rtm counters based on rax value at abort + // reads tmpReg(rax), updates flags + lea(scrReg, ExternalAddress((address)rtmcounters)); + rtmcounters_update(tmpReg, scrReg); + } + if (profile_rtm) { + // Save abort status + if (RTMRetryCount > 0) + push(tmpReg); + + // Perform abort ratio calculation, set dontelide bit and rtm_state + // input: boxReg (object monitor address) + // input: scrReg (rtmcounters address) + // tmpReg, scrReg, flags as scratch + assert(rtmcounters != NULL, "should not be NULL when profiling RTM"); + rtm_abortratio_calculation(boxReg, tmpReg, scrReg, rtmcounters, method_data, false); + + // restore abort status + if (RTMRetryCount > 0) + pop(tmpReg); + } + if (RTMRetryCount > 0) { + // retry on lock abort if abort status is one of 0xD + // inputs: boxReg (monitor address), cx2Reg (retry count), + // : tmpReg(abort status) + // output: tmpReg set to boxReg, cx2Reg decremented by 1 + rtm_retry_lockabort(cx2Reg, boxReg, tmpReg, L_rtm_retry, false); + } + movptr(tmpReg, boxReg); + + movptr(tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + testptr(tmpReg, tmpReg) ; + jccb(Assembler::notZero, L_decrement_retry) ; + + // Appears unlocked - try to swing _owner from null to non-null. + // Use either "Self" (in threadReg) or rsp as thread identity in _owner. + // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. +#ifdef _LP64 + Register threadReg = r15_thread; +#else + get_thread(scrReg); + Register threadReg = scrReg; +#endif + if (os::is_MP()) { + lock(); + } + cmpxchgptr(threadReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); // Updates tmpReg + + if (RTMRetryCount > 0) { + // success done else retry + jccb(Assembler::equal, DONE_LABEL) ; + // inputs: boxReg (monitor address), cx1Reg (retry count) + // output: tmpReg set to boxReg, cx1Reg decremented by 1 + // : clear z flags if retry count exceeded, scrReg scratch + bind(L_decrement_retry); + rtm_retry_lockbusy(cx1Reg, boxReg, tmpReg, scrReg, L_rtm_retry); + } + else { + bind(L_decrement_retry); + } + } else { // !use_rtm() +#endif // INCLUDE_RTM_OPT + #ifndef _LP64 // The object is inflated. 
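The fast_lock() changes above add two transactional fast paths: one that elides stack locks and one that runs the inflated-monitor case under RTM, retrying a bounded number of times before falling back to the regular CAS-based locking. The following is a simplified, self-contained C++ sketch of that overall shape, assuming a TSX-capable CPU and a compiler with -mrtm; lock_word, kRetryCount and with_elided_lock() are hypothetical stand-ins for the mark word, RTMRetryCount and the emitted code, not the JVM's actual data structures.

// Simplified sketch (not the JVM code) of the RTM fast path emitted above: run
// the critical section inside a transaction, read the lock word so that a real
// acquisition by another thread aborts us, and fall back to the lock itself
// after too many aborts.
#include <immintrin.h>
#include <atomic>

static std::atomic<int> lock_word{0};   // 0 = free, 1 = held (stand-in for the mark word)
static const int kRetryCount = 5;       // analogue of RTMRetryCount

template <typename F>
void with_elided_lock(F critical_section) {
  for (int retry = 0; retry < kRetryCount; ++retry) {
    unsigned status = _xbegin();
    if (status == _XBEGIN_STARTED) {
      if (lock_word.load(std::memory_order_relaxed) == 0) {  // lock looks free
        critical_section();                                  // runs transactionally
        _xend();                                             // commit
        return;
      }
      _xabort(0xff);                                         // lock is held: abort
    }
    // status now holds the abort reason. _XABORT_RETRY roughly corresponds to
    // the 0x2 "can retry" bit tested in rtm_retry_lockabort(); an explicit
    // _xabort (lock busy) does not set it, so we stop retrying.
    if (!(status & _XABORT_RETRY)) break;
    _mm_pause();
  }
  // Fallback path; the JVM instead falls through to stack-locking / ObjectMonitor.
  int expected = 0;
  while (!lock_word.compare_exchange_weak(expected, 1, std::memory_order_acquire)) {
    expected = 0;
    _mm_pause();
  }
  critical_section();
  lock_word.store(0, std::memory_order_release);
}

A call site is simply with_elided_lock([]{ /* guarded work */ });. The property this relies on, and which the patch relies on as well, is that the transactional path never writes the lock word, so elided critical sections do not conflict with each other and only a real (fallback) acquisition aborts them.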
// @@ -1576,7 +1941,7 @@ // Without cast to int32_t a movptr will destroy r10 which is typically obj movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); - mov (boxReg, tmpReg); + movptr (boxReg, tmpReg); movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); testptr(tmpReg, tmpReg); jccb (Assembler::notZero, DONE_LABEL); @@ -1587,9 +1952,11 @@ } cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)); // Intentional fall-through into DONE_LABEL ... +#endif // _LP64 +#if INCLUDE_RTM_OPT + } // use_rtm() #endif - // DONE_LABEL is a hot target - we'd really like to place it at the // start of cache line by padding with NOPs. // See the AMD and Intel software optimization manuals for the @@ -1631,11 +1998,9 @@ // should not be unlocked by "normal" java-level locking and vice-versa. The specification // doesn't specify what will occur if a program engages in such mixed-mode locking, however. -void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { - guarantee (objReg != boxReg, ""); - guarantee (objReg != tmpReg, ""); - guarantee (boxReg != tmpReg, ""); - guarantee (boxReg == rax, ""); +void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { + assert(boxReg == rax, ""); + assert_different_registers(objReg, boxReg, tmpReg); if (EmitSync & 4) { // Disable - inhibit all inlining. Force control through the slow-path @@ -1667,14 +2032,39 @@ biased_locking_exit(objReg, tmpReg, DONE_LABEL); } +#if INCLUDE_RTM_OPT + if (UseRTMForStackLocks && use_rtm) { + assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); + Label L_regular_unlock; + movptr(tmpReg, Address(objReg, 0)); // fetch markword + andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits + cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked + jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock + xend(); // otherwise end... + jmp(DONE_LABEL); // ... and we're done + bind(L_regular_unlock); + } +#endif + cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header + jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword - jccb (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock - - testptr(tmpReg, 0x02); // Inflated? - jccb (Assembler::zero, Stacked); + testptr(tmpReg, markOopDesc::monitor_value); // Inflated? + jcc (Assembler::zero, Stacked); // It's inflated. +#if INCLUDE_RTM_OPT + if (use_rtm) { + Label L_regular_inflated_unlock; + movptr(boxReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; + testptr(boxReg, boxReg) ; + jccb(Assembler::notZero, L_regular_inflated_unlock) ; + xend(); + jmp(DONE_LABEL) ; + bind(L_regular_inflated_unlock); + } +#endif + // Despite our balanced locking property we still check that m->_owner == Self // as java routines or native JNI code called by this thread might // have released the lock. @@ -2448,7 +2838,9 @@ Condition negated_cond = negate_condition(cond); Label L; jcc(negated_cond, L); + pushf(); // Preserve flags atomic_incl(counter_addr); + popf(); bind(L); }
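For reference, the policy that rtm_abortratio_calculation() encodes in generated code above can be written out in plain C++. The sketch below is a hypothetical stand-in using the flag names from the patch (the JVM stores the resulting state in the MethodData rtm_state field); the function, enum and parameter types are illustrative only.

// Plain C++ restatement (illustrative, not the emitted assembly) of the
// abort-ratio policy: demote a lock site to NoRTM when too large a fraction of
// its transactions abort, promote it to UseRTM once enough transactions have
// been observed, otherwise keep profiling.
#include <cstdint>

enum RTMState { ProfileRTM, UseRTM, NoRTM };

RTMState evaluate_rtm_state(uint64_t abort_count, uint64_t total_count,
                            uint64_t RTMAbortThreshold,      // minimum aborts before judging
                            uint64_t RTMAbortRatio,          // tolerated abort percentage
                            uint64_t RTMTotalCountIncrRate,  // total_count is sampled 1/N
                            uint64_t RTMLockingThreshold) {  // transactions needed for "always rtm"
  if (abort_count >= RTMAbortThreshold) {
    uint64_t aborted = abort_count * 100;                    // Aborted transactions = abort_count * 100
    uint64_t all     = total_count * RTMTotalCountIncrRate;  // All transactions = total_count * RTMTotalCountIncrRate
    if (aborted >= all * RTMAbortRatio) {
      return NoRTM;                                          // abort ratio too high: stop eliding
    }
  }
  if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
    return UseRTM;                                           // enough successful history: always use RTM
  }
  return ProfileRTM;                                         // keep collecting counters
}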