  // NOTE(review): tail of an enc_class whose opening lines precede this chunk.
  emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
  // CMP $tmp,$src.lo
  emit_opcode( cbuf, 0x3B );
  emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
  // SBB $tmp,$src.hi  -- fold the high-word compare into the borrow chain
  emit_opcode( cbuf, 0x1B );
  emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
%}

// Sniff, sniff... smells like Gnu Superoptimizer
//
// Two's-complement negate of a 64-bit register pair in three instructions:
//   NEG hi ; NEG lo ; SBB hi,0
// NEG lo sets CF exactly when the low word is non-zero, and the trailing
// SBB propagates that borrow into the (already negated) high word.
enc_class neg_long( eRegL dst ) %{
  emit_opcode(cbuf,0xF7);    // NEG hi   (F7 /3 = NEG r/m32)
  emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
  emit_opcode(cbuf,0xF7);    // NEG lo
  emit_rm    (cbuf,0x3, 0x3, $dst$$reg );
  emit_opcode(cbuf,0x83);    // SBB hi,0 (83 /3 ib = SBB r/m32,imm8)
  emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
  emit_d8    (cbuf,0 );
%}


// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast-path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in slow_enter() and slow_exit(). If we're concerned about I$ bloat
// another option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
//   or failure of the fast-path. If the fast-path fails then we pass
//   control to the slow-path, typically in C. In Fast_Lock and
//   Fast_Unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into Fast_Lock and Fast_Unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
// In the case of failure, the node will branch directly to the
// FailureLabel


// Emit the inlined fast-path monitor-enter sequence.
// On exit, ICC.ZF == 1 indicates success and ICC.ZF == 0 forces the
// slow path (see the ZFlag comments at DONE_LABEL below).
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);
    Register scrReg = as_Register($scr$$reg);

    // Ensure the register assignments are disjoint
    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (objReg != scrReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg != scrReg, "") ;
    guarantee (tmpReg == as_Register(EAX_enc), "") ;

    MacroAssembler masm(&cbuf);

    if (_counters != NULL) {
      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
    }
    if (EmitSync & 1) {
        // set box->dhw = unused_mark (3)
        // Force all sync thru slow-path: slow_enter() and slow_exit()
        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
        masm.cmpptr (rsp, (int32_t)0) ;           // rsp is never 0, so this forces ZF = 0
    } else
    if (EmitSync & 2) {
        // Stack-locking-only variant: no inflated fast path.
        Label DONE_LABEL ;
        if (UseBiasedLocking) {
           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
        }

        masm.movptr(tmpReg, Address(objReg, 0)) ;  // fetch markword
        masm.orptr (tmpReg, 0x1);
        masm.movptr(Address(boxReg, 0), tmpReg);   // Anticipate successful CAS
        if (os::is_MP()) { masm.lock(); }
        masm.cmpxchgptr(boxReg, Address(objReg, 0));  // Updates tmpReg
        masm.jcc(Assembler::equal, DONE_LABEL);
        // Recursive locking
        masm.subptr(tmpReg, rsp);
        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
        masm.movptr(Address(boxReg, 0), tmpReg);
        masm.bind(DONE_LABEL) ;
    } else {
      // Possible cases that we'll encounter in fast_lock
      // ------------------------------------------------
      // * Inflated
      //    -- unlocked
      //    -- Locked
      //       = by self
      //       = by other
      // * biased
      //    -- by Self
      //    -- by other
      // * neutral
      // * stack-locked
      //    -- by self
      //       = sp-proximity test hits
      //       = sp-proximity test generates false-negative
      //    -- by other
      //

      Label IsInflated, DONE_LABEL, PopDone ;

      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
      // order to reduce the number of conditional branches in the most common cases.
      // Beware -- there's a subtle invariant that fetch of the markword
      // at [FETCH], below, will never observe a biased encoding (*101b).
      // If this invariant is not held we risk exclusion (safety) failure.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
      }

      masm.movptr(tmpReg, Address(objReg, 0)) ;       // [FETCH]
      masm.testptr(tmpReg, 0x02) ;                    // Inflated v (Stack-locked or neutral)
      masm.jccb  (Assembler::notZero, IsInflated) ;

      // Attempt stack-locking ...
      masm.orptr (tmpReg, 0x1);
      masm.movptr(Address(boxReg, 0), tmpReg);        // Anticipate successful CAS
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(boxReg, Address(objReg, 0));    // Updates tmpReg
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jccb (Assembler::equal, DONE_LABEL);

      // Recursive locking
      masm.subptr(tmpReg, rsp);
      masm.andptr(tmpReg, 0xFFFFF003 );
      masm.movptr(Address(boxReg, 0), tmpReg);
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jmp  (DONE_LABEL) ;

      masm.bind (IsInflated) ;

      // The object is inflated.
      //
      // TODO-FIXME: eliminate the ugly use of manifest constants:
      //   Use markOopDesc::monitor_value instead of "2".
      //   use markOop::unused_mark() instead of "3".
      // The tmpReg value is an objectMonitor reference ORed with
      // markOopDesc::monitor_value (2).  We can either convert tmpReg to an
      // objectmonitor pointer by masking off the "2" bit or we can just
      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
      // field offsets with "-2" to compensate for and annul the low-order tag bit.
      //
      // I use the latter as it avoids AGI stalls.
      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
      //
      #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)

      // boxReg refers to the on-stack BasicLock in the current frame.
      // We'd like to write:
      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
      // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
      // additional latency as we have another ST in the store buffer that must drain.

      if (EmitSync & 8192) {
         masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
         masm.get_thread (scrReg) ;
         masm.movptr(boxReg, tmpReg);               // consider: LEA box, [tmp-2]
         masm.movptr(tmpReg, NULL_WORD);            // consider: xor vs mov
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
      } else
      if ((EmitSync & 128) == 0) {                  // avoid ST-before-CAS
         masm.movptr(scrReg, boxReg) ;
         masm.movptr(boxReg, tmpReg);               // consider: LEA box, [tmp-2]

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form: consider XORL tmpReg,tmpReg
           masm.movptr(tmpReg, NULL_WORD) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           // Test-And-CAS instead of CAS
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;            // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Ideally, I'd manifest "Self" with get_thread and then attempt
         // to CAS the register containing Self into m->Owner.
         // But we don't have enough registers, so instead we can either try to CAS
         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
         // we later store "Self" into m->Owner.  Transiently storing a stack address
         // (rsp or the address of the box) into m->owner is harmless.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.movptr(Address(scrReg, 0), 3) ;        // box->_displaced_header = 3
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.get_thread (scrReg) ;                  // beware: clobbers ICCs
         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
         masm.xorptr(boxReg, boxReg) ;               // set icc.ZFlag = 1 to indicate success

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      } else {
         masm.movptr(Address(boxReg, 0), 3) ;        // results in ST-before-CAS penalty
         masm.movptr(boxReg, tmpReg) ;

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form
           masm.xorptr  (tmpReg, tmpReg) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;            // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Use either "Self" (in scr) or rsp as thread identity in _owner.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         masm.get_thread (scrReg) ;
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      }

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      masm.bind(DONE_LABEL);

      // Avoid branch-to-branch on AMD processors
      // This appears to be superstition.
      if (EmitSync & 32) masm.nop() ;


      // At DONE_LABEL the icc ZFlag is set as follows ...
      // Fast_Unlock uses the same protocol.
      // ZFlag == 1 -> Success
      // ZFlag == 0 -> Failure - force control through the slow-path
    }
%}


// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// rbx,: killed tmp; cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).
//      Recall that the interpreter maintains an on-stack list of locks
//      currently held by a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame the interpreter throws IMSX.
//
// Lets say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.

// Emit the inlined fast-path monitor-exit sequence.
// On exit, ICC.ZF == 1 indicates success and ICC.ZF == 0 forces the
// slow path -- the same flag protocol as Fast_Lock.
enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);

    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg == as_Register(EAX_enc), "") ;
    MacroAssembler masm(&cbuf);

    if (EmitSync & 4) {
      // Disable - inhibit all inlining.  Force control through the slow-path
      masm.cmpptr (rsp, 0) ;            // rsp is never 0, so this forces ZF = 0
    } else
    if (EmitSync & 8) {
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }
      // classic stack-locking code ...
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      masm.testptr(tmpReg, tmpReg) ;
      masm.jcc   (Assembler::zero, DONE_LABEL) ;
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses EAX which is box
      masm.bind(DONE_LABEL);
    } else {
      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;

      // Critically, the biased locking test must have precedence over
      // and appear before the (box->dhw == 0) recursive stack-lock test.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }

      masm.cmpptr(Address(boxReg, 0), 0) ;           // Examine the displaced header
      masm.movptr(tmpReg, Address(objReg, 0)) ;      // Examine the object's markword
      masm.jccb  (Assembler::zero, DONE_LABEL) ;     // 0 indicates recursive stack-lock

      masm.testptr(tmpReg, 0x02) ;                   // Inflated?
      masm.jccb  (Assembler::zero, Stacked) ;

      masm.bind  (Inflated) ;
      // It's inflated.
      // Despite our balanced locking property we still check that m->_owner == Self
      // as java routines or native JNI code called by this thread might
      // have released the lock.
      // Refer to the comments in synchronizer.cpp for how we might encode extra
      // state in _succ so we can avoid fetching EntryList|cxq.
      //
      // I'd like to add more cases in fast_lock() and fast_unlock() --
      // such as recursive enter and exit -- but we have to be wary of
      // I$ bloat, T$ effects and BP$ effects.
      //
      // If there's no contention try a 1-0 exit.  That is, exit without
      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
      // we detect and recover from the race that the 1-0 exit admits.
      //
      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
      // before it STs null into _owner, releasing the lock.  Updates
      // to data protected by the critical section must be visible before
      // we drop the lock (and thus before any other thread could acquire
      // the lock and observe the fields protected by the lock).
      // IA32's memory-model is SPO, so STs are ordered with respect to
      // each other and there's no need for an explicit barrier (fence).
      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.

      masm.get_thread (boxReg) ;
      if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
        // prefetchw [ebx + Offset(_owner)-2]
        masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
      }

      // Note that we could employ various encoding schemes to reduce
      // the number of loads below (currently 4) to just 2 or 3.
      // Refer to the comments in synchronizer.cpp.
      // In practice the chain of fetches doesn't seem to impact performance, however.
      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
         // Attempt to reduce branch density - AMD's branch predictor.
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
         masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         masm.jmpb  (DONE_LABEL) ;
      } else {
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
         masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, CheckSucc) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         masm.jmpb  (DONE_LABEL) ;
      }

      // The following code fragment (EmitSync & 65536) improves the performance of
      // contended applications and contended synchronization microbenchmarks.
      // Unfortunately the emission of the code - even though not executed - causes regressions
      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
      // with an equal number of never-executed NOPs results in the same regression.
      // We leave it off by default.

      if ((EmitSync & 65536) != 0) {
         Label LSuccess, LGoSlowPath ;

         masm.bind  (CheckSucc) ;

         // Optional pre-test ... it's safe to elide this
         if ((EmitSync & 16) == 0) {
            masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
            masm.jccb  (Assembler::zero, LGoSlowPath) ;
         }

         // We have a classic Dekker-style idiom:
         //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
         // There are a number of ways to implement the barrier:
         // (1) lock:andl &m->_owner, 0
         //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
         //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
         //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
         // (2) If supported, an explicit MFENCE is appealing.
         //     In older IA32 processors MFENCE is slower than lock:add or xchg
         //     particularly if the write-buffer is full as might be the case if
         //     if stores closely precede the fence or fence-equivalent instruction.
         //     In more modern implementations MFENCE appears faster, however.
         // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
         //     The $lines underlying the top-of-stack should be in M-state.
         //     The locked add instruction is serializing, of course.
         // (4) Use xchg, which is serializing
         //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
         // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
         //     The integer condition codes will tell us if succ was 0.
         //     Since _succ and _owner should reside in the same $line and
         //     we just stored into _owner, it's likely that the $line
         //     remains in M-state for the lock:orl.
         //
         // We currently use (3), although it's likely that switching to (2)
         // is correct for the future.

         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
         if (os::is_MP()) {
            if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
              masm.mfence();
            } else {
              masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
            }
         }
         // Ratify _succ remains non-null
         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
         masm.jccb  (Assembler::notZero, LSuccess) ;

         masm.xorptr(boxReg, boxReg) ;              // box is really EAX
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
         masm.jccb  (Assembler::notEqual, LSuccess) ;
         // Since we're low on registers we installed rsp as a placeholder in _owner.
         // Now install Self over rsp.  This is safe as we're transitioning from
         // non-null to non-null
         masm.get_thread (boxReg) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
         // Intentional fall-through into LGoSlowPath ...

         masm.bind  (LGoSlowPath) ;
         masm.orptr(boxReg, 1) ;                    // set ICC.ZF=0 to indicate failure
         masm.jmpb  (DONE_LABEL) ;

         masm.bind  (LSuccess) ;
         masm.xorptr(boxReg, boxReg) ;              // set ICC.ZF=1 to indicate success
         masm.jmpb  (DONE_LABEL) ;
      }

      masm.bind (Stacked) ;
      // It's not inflated and it's not recursively stack-locked and it's not biased.
      // It must be stack-locked.
      // Try to reset the header to displaced header.
      // The "box" value on the stack is stable, so we can reload
      // and be assured we observe the same value as above.
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0));  // Uses EAX which is box
      // Intentional fall-thru into DONE_LABEL


      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      if ((EmitSync & 65536) == 0) {
         masm.bind (CheckSucc) ;
      }
      masm.bind(DONE_LABEL);

      // Avoid branch to branch on AMD processors
      if (EmitSync & 32768) { masm.nop() ; }
    }
%}


// Emit a one-byte POP EDX (opcode 0x5A).
enc_class enc_pop_rdx() %{
  emit_opcode(cbuf,0x5A);
%}

// Emit a JMP (not CALL) to the rethrow stub, with a runtime-call relocation.
enc_class enc_rethrow() %{
  cbuf.set_insts_mark();
  emit_opcode(cbuf, 0xE9);        // jmp entry
  emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.insts_end())-4,
                 runtime_call_Relocation::spec(), RELOC_IMM32 );
%}


// Convert a double to an int.  Java semantics require we do complex
// manglelations in the corner cases.  So we set the rounding mode to
// 'zero', store the darned double down as an int, and reset the
// rounding mode to 'nearest'.  The hardware throws an exception which
// patches up the correct value directly to the stack.
enc_class DPR2I_encoding( regDPR src ) %{
  // Flip to round-to-zero mode.  We attempted to allow invalid-op
  // exceptions here, so that a NAN or other corner-case value will
// NOTE(review): the source view is discontinuous here -- the remainder of
// DPR2I_encoding and everything up to the unrelated fragment below is not
// visible in this chunk.  The fragment is the tail of some other instruct.
  ins_encode();
  ins_pipe( empty );
%}


// Rethrow exception:
// The exception oop will come in the first argument position.
// Then JUMP (not call) to the rethrow stub code.
instruct RethrowException()
%{
  match(Rethrow);

  // use the following format syntax
  format %{ "JMP rethrow_stub" %}
  ins_encode(enc_rethrow);
  ins_pipe( pipe_jmp );
%}

// inlined locking and unlocking


// Inlined fast-path lock.  Per the Fast_Lock enc_class protocol, ZF == 1
// indicates success and ZF == 0 routes control to the slow path.
// Kills box, tmp and scr (tmp is constrained to EAX, box to EBX).
instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
  match( Set cr (FastLock object box) );
  effect( TEMP tmp, TEMP scr, USE_KILL box );
  ins_cost(300);
  format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
  ins_encode( Fast_Lock(object,box,tmp,scr) );
  ins_pipe( pipe_slow );
%}

// Inlined fast-path unlock.  Same ZF success/failure protocol as cmpFastLock.
// Kills box (constrained to EAX) and tmp.
instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
  match( Set cr (FastUnlock object box) );
  effect( TEMP tmp, USE_KILL box );
  ins_cost(300);
  format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
  ins_encode( Fast_Unlock(object,box,tmp) );
  ins_pipe( pipe_slow );
%}



// ============================================================================
// Safepoint Instruction
instruct safePoint_poll(eFlagsReg cr) %{
  match(SafePoint);
  effect(KILL cr);

  // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
  // On SPARC that might be acceptable as we can generate the address with
  // just a sethi, saving an or. By polling at offset 0 we can end up
  // putting additional pressure on the index-0 in the D$. Because of
  // alignment (just like the situation at hand) the lower indices tend
  // to see more traffic. It'd be better to change the polling address
  // to offset 0 of the last $line in the polling page.

  format %{ "TSTL #polladdr,EAX\t! 
Safepoint: poll for GC" %} 13196 ins_cost(125); | 2901 emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg); 2902 // CMP $tmp,$src.lo 2903 emit_opcode( cbuf, 0x3B ); 2904 emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg ); 2905 // SBB $tmp,$src.hi 2906 emit_opcode( cbuf, 0x1B ); 2907 emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) ); 2908 %} 2909 2910 // Sniff, sniff... smells like Gnu Superoptimizer 2911 enc_class neg_long( eRegL dst ) %{ 2912 emit_opcode(cbuf,0xF7); // NEG hi 2913 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg)); 2914 emit_opcode(cbuf,0xF7); // NEG lo 2915 emit_rm (cbuf,0x3, 0x3, $dst$$reg ); 2916 emit_opcode(cbuf,0x83); // SBB hi,0 2917 emit_rm (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg)); 2918 emit_d8 (cbuf,0 ); 2919 %} 2920 2921 enc_class enc_pop_rdx() %{ 2922 emit_opcode(cbuf,0x5A); 2923 %} 2924 2925 enc_class enc_rethrow() %{ 2926 cbuf.set_insts_mark(); 2927 emit_opcode(cbuf, 0xE9); // jmp entry 2928 emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.insts_end())-4, 2929 runtime_call_Relocation::spec(), RELOC_IMM32 ); 2930 %} 2931 2932 2933 // Convert a double to an int. Java semantics require we do complex 2934 // manglelations in the corner cases. So we set the rounding mode to 2935 // 'zero', store the darned double down as an int, and reset the 2936 // rounding mode to 'nearest'. The hardware throws an exception which 2937 // patches up the correct value directly to the stack. 2938 enc_class DPR2I_encoding( regDPR src ) %{ 2939 // Flip to round-to-zero mode. We attempted to allow invalid-op 2940 // exceptions here, so that a NAN or other corner-case value will 12604 ins_encode(); 12605 ins_pipe( empty ); 12606 %} 12607 12608 12609 // Rethrow exception: 12610 // The exception oop will come in the first argument position. 12611 // Then JUMP (not call) to the rethrow stub code. 
instruct RethrowException()
%{
  match(Rethrow);

  // use the following format syntax
  format %{ "JMP rethrow_stub" %}
  ins_encode(enc_rethrow);
  ins_pipe( pipe_jmp );
%}

// inlined locking and unlocking

// Inlined fast-path lock; the encoding is emitted by the out-of-line
// fast_lock() macro-assembler helper (note the `__ fast_lock` call).
// Kills box, tmp and scr (tmp is constrained to EAX, box to EBX).
instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
  match(Set cr (FastLock object box));
  effect(TEMP tmp, TEMP scr, USE_KILL box);
  ins_cost(300);
  format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
  ins_encode %{
    __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
  %}
  ins_pipe(pipe_slow);
%}

// Inlined fast-path unlock; emitted by the fast_unlock() macro-assembler
// helper.  Kills box (constrained to EAX) and tmp.
instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
  match(Set cr (FastUnlock object box));
  effect(TEMP tmp, USE_KILL box);
  ins_cost(300);
  format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
  ins_encode %{
    __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
  %}
  ins_pipe(pipe_slow);
%}



// ============================================================================
// Safepoint Instruction
instruct safePoint_poll(eFlagsReg cr) %{
  match(SafePoint);
  effect(KILL cr);

  // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
  // On SPARC that might be acceptable as we can generate the address with
  // just a sethi, saving an or. By polling at offset 0 we can end up
  // putting additional pressure on the index-0 in the D$. Because of
  // alignment (just like the situation at hand) the lower indices tend
  // to see more traffic. It'd be better to change the polling address
  // to offset 0 of the last $line in the polling page.

  format %{ "TSTL #polladdr,EAX\t! Safepoint: poll for GC" %}
  ins_cost(125);
|