src/cpu/x86/vm/x86_32.ad (old version, changeset 8033805)

2901     emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
2902     // CMP    $tmp,$src.lo
2903     emit_opcode( cbuf, 0x3B );
2904     emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
2905     // SBB    $tmp,$src.hi
2906     emit_opcode( cbuf, 0x1B );
2907     emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
2908   %}
2909 
2910  // Sniff, sniff... smells like Gnu Superoptimizer
2911   enc_class neg_long( eRegL dst ) %{
2912     emit_opcode(cbuf,0xF7);    // NEG hi
2913     emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2914     emit_opcode(cbuf,0xF7);    // NEG lo
2915     emit_rm    (cbuf,0x3, 0x3,               $dst$$reg );
2916     emit_opcode(cbuf,0x83);    // SBB hi,0
2917     emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2918     emit_d8    (cbuf,0 );
2919   %}
2920 
2921 
2922   // Because the transitions from emitted code to the runtime
2923   // monitorenter/exit helper stubs are so slow it's critical that
2924   // we inline both the stack-locking fast-path and the inflated fast path.
2925   //
2926   // See also: cmpFastLock and cmpFastUnlock.
2927   //
2928   // What follows is a specialized inline transliteration of the code
2929   // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
2930   // another option would be to emit TrySlowEnter and TrySlowExit methods
2931   // at startup-time.  These methods would accept arguments as
2932   // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
2933   // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
2934   // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
2935   // In practice, however, the # of lock sites is bounded and is usually small.
2936   // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
2937   // if the processor uses simple bimodal branch predictors keyed by EIP,
2938   // since the helper routines would be called from multiple synchronization
2939   // sites.
2940   //
2941   // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
2942   // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
2943   // to those specialized methods.  That'd give us a mostly platform-independent
2944   // implementation that the JITs could optimize and inline at their pleasure.
2945   // Done correctly, the only time we'd need to cross into native code would be
2946   // to park() or unpark() threads.  We'd also need a few more unsafe operators
2947   // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
2948   // (b) emit explicit barriers or fence operations.
2949   //
2950   // TODO:
2951   //
2952   // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
2953   //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
2954   //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
2955   //    the lock operators would typically be faster than reifying Self.
2956   //
2957   // *  Ideally I'd define the primitives as:
2958   //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
2959   //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
2960   //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
2961   //    Instead, we're stuck with the rather awkward and brittle register assignments below.
2962   //    Furthermore the register assignments are overconstrained, possibly resulting in
2963   //    sub-optimal code near the synchronization site.
2964   //
2965   // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
2966   //    Alternately, use a better sp-proximity test.
2967   //
2968   // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
2969   //    Either one is sufficient to uniquely identify a thread.
2970   //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
2971   //
2972   // *  Intrinsify notify() and notifyAll() for the common cases where the
2973   //    object is locked by the calling thread but the waitlist is empty,
2974   //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
2975   //
2976   // *  use jccb and jmpb instead of jcc and jmp to improve code density.
2977   //    But beware of excessive branch density on AMD Opterons.
2978   //
2979   // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
2980   //    or failure of the fast-path.  If the fast-path fails then we pass
2981   //    control to the slow-path, typically in C.  In Fast_Lock and
2982   //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
2983   //    will emit a conditional branch immediately after the node.
2984   //    So we have branches to branches and lots of ICC.ZF games.
2985   //    Instead, it might be better to have C2 pass a "FailureLabel"
2986   //    into Fast_Lock and Fast_Unlock.  In the case of success, control
2987   //    will drop through the node.  ICC.ZF is undefined at exit.
2988   //    In the case of failure, the node will branch directly to the
2989   //    FailureLabel.
2990 
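  // For reference, the shape of what C2 emits around a FastLock node under the
  // ZF protocol described above (a sketch, not literal compiler output):
  //
  //   FASTLOCK obj, box        // inlined fast path; sets ZF=1 on success
  //   jne      slow_path       // ZF==0: marshal arguments, call the runtime
  //   ...                      // critical section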
2991 
2992   // obj: object to lock
2993   // box: on-stack box address (displaced header location) - KILLED
2994   // rax: tmp -- KILLED
2995   // scr: tmp -- KILLED
2996   enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
2997 
2998     Register objReg = as_Register($obj$$reg);
2999     Register boxReg = as_Register($box$$reg);
3000     Register tmpReg = as_Register($tmp$$reg);
3001     Register scrReg = as_Register($scr$$reg);
3002 
3003     // Ensure the register assignments are disjoint
3004     guarantee (objReg != boxReg, "") ;
3005     guarantee (objReg != tmpReg, "") ;
3006     guarantee (objReg != scrReg, "") ;
3007     guarantee (boxReg != tmpReg, "") ;
3008     guarantee (boxReg != scrReg, "") ;
3009     guarantee (tmpReg == as_Register(EAX_enc), "") ;
3010 
3011     MacroAssembler masm(&cbuf);
3012 
3013     if (_counters != NULL) {
3014       masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
3015     }
3016     if (EmitSync & 1) {
3017         // set box->dhw = unused_mark (3)
3018         // Force all sync thru slow-path: slow_enter() and slow_exit() 
3019         masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;             
3020         masm.cmpptr (rsp, (int32_t)0) ;                        
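        // (rsp is never zero, so the compare leaves ZF==0 and every lock site
        //  is forced through the slow path.)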
3021     } else 
3022     if (EmitSync & 2) { 
3023         Label DONE_LABEL ;           
3024         if (UseBiasedLocking) {
3025            // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
3026            masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
3027         }
3028 
3029         masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword 
3030         masm.orptr (tmpReg, 0x1);
3031         masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS 
3032         if (os::is_MP()) { masm.lock();  }
3033         masm.cmpxchgptr(boxReg, Address(objReg, 0));          // Updates tmpReg
3034         masm.jcc(Assembler::equal, DONE_LABEL);
3035         // Recursive locking
3036         masm.subptr(tmpReg, rsp);
3037         masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
3038         masm.movptr(Address(boxReg, 0), tmpReg);
3039         masm.bind(DONE_LABEL) ; 
3040     } else {  
3041       // Possible cases that we'll encounter in fast_lock 
3042       // ------------------------------------------------
3043       // * Inflated
3044       //    -- unlocked
3045       //    -- Locked
3046       //       = by self
3047       //       = by other
3048       // * biased
3049       //    -- by Self
3050       //    -- by other
3051       // * neutral
3052       // * stack-locked
3053       //    -- by self
3054       //       = sp-proximity test hits
3055       //       = sp-proximity test generates false-negative
3056       //    -- by other
3057       //
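      // For reference, the low-order markword bit patterns that drive this
      // triage (from markOop.hpp):
      //   ..01   neutral (unlocked)
      //   ..00   stack-locked -- markword points into the owner's stack
      //   ..10   inflated     -- markword is ObjectMonitor* | monitor_value
      //   .101   biased       -- see the *101b invariant noted just below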
3058 
3059       Label IsInflated, DONE_LABEL, PopDone ;
3060 
3061       // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
3062       // order to reduce the number of conditional branches in the most common cases.
3063       // Beware -- there's a subtle invariant that fetch of the markword
3064       // at [FETCH], below, will never observe a biased encoding (*101b).
3065       // If this invariant is not held we risk exclusion (safety) failure.
3066       if (UseBiasedLocking && !UseOptoBiasInlining) {
3067         masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
3068       }
3069 
3070       masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
3071       masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
3072       masm.jccb  (Assembler::notZero, IsInflated) ;
3073 
3074       // Attempt stack-locking ...
3075       masm.orptr (tmpReg, 0x1);
3076       masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
3077       if (os::is_MP()) { masm.lock();  }
3078       masm.cmpxchgptr(boxReg, Address(objReg, 0));           // Updates tmpReg
3079       if (_counters != NULL) {
3080         masm.cond_inc32(Assembler::equal,
3081                         ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3082       }
3083       masm.jccb (Assembler::equal, DONE_LABEL);
3084 
3085       // Recursive locking
3086       masm.subptr(tmpReg, rsp);
3087       masm.andptr(tmpReg, 0xFFFFF003 );
3088       masm.movptr(Address(boxReg, 0), tmpReg);
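      // The SUB/AND pair above is the sp-proximity test: when we already hold
      // the lock, the fetched markword points into our own stack just above
      // rsp, so (mark - rsp) is a small, 4-byte-aligned offset.  ANDing with
      // 0xFFFFF003 (clear bits 2..11, keep the tag bits and the high-order
      // bits) then yields 0, which is stored into the box to mark the
      // recursive stack-lock, while the ZF from the AND carries the
      // success/failure indication to DONE_LABEL.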
3089       if (_counters != NULL) {
3090         masm.cond_inc32(Assembler::equal,
3091                         ExternalAddress((address)_counters->fast_path_entry_count_addr()));
3092       }
3093       masm.jmp  (DONE_LABEL) ;
3094 
3095       masm.bind (IsInflated) ;
3096 
3097       // The object is inflated.
3098       //
3099       // TODO-FIXME: eliminate the ugly use of manifest constants:
3100       //   Use markOopDesc::monitor_value instead of "2".
3101       //   use markOop::unused_mark() instead of "3".
3102       // The tmpReg value is an objectMonitor reference ORed with
3103       // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
3104       // objectmonitor pointer by masking off the "2" bit or we can just
3105       // use tmpReg as an objectmonitor pointer but bias the objectmonitor
3106       // field offsets with "-2" to compensate for and annul the low-order tag bit.
3107       //
3108       // I use the latter as it avoids AGI stalls.
3109       // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
3110       // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
3111       //
3112       #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
3113 
3114       // boxReg refers to the on-stack BasicLock in the current frame.
3115       // We'd like to write:
3116       //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
3117       // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
3118       // additional latency as we have another ST in the store buffer that must drain.
3119 
3120       if (EmitSync & 8192) { 
3121          masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
3122          masm.get_thread (scrReg) ; 
3123          masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
3124          masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
3125          if (os::is_MP()) { masm.lock(); } 
3126          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3127       } else 
3128       if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
3129          masm.movptr(scrReg, boxReg) ; 
3130          masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 
3131 
3132          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3133          if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3134             // prefetchw [eax + Offset(_owner)-2]
3135             masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3136          }
3137 
3138          if ((EmitSync & 64) == 0) {
3139            // Optimistic form: consider XORL tmpReg,tmpReg
3140            masm.movptr(tmpReg, NULL_WORD) ; 
3141          } else { 
3142            // Can suffer RTS->RTO upgrades on shared or cold $ lines
3143            // Test-And-CAS instead of CAS
3144            masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
3145            masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
3146            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
3147          }
3148 
3149          // Appears unlocked - try to swing _owner from null to non-null.
3150          // Ideally, I'd manifest "Self" with get_thread and then attempt
3151          // to CAS the register containing Self into m->Owner.
3152          // But we don't have enough registers, so instead we can either try to CAS
3153          // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
3154          // we later store "Self" into m->Owner.  Transiently storing a stack address
3155          // (rsp or the address of the box) into  m->owner is harmless.
3156          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
3157          if (os::is_MP()) { masm.lock();  }
3158          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
3159          masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
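         // (MOV leaves the flags untouched, so the jccb below still tests the
         //  ZF produced by the CMPXCHG above.)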
3160          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3161          masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
3162          masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; 
3163          masm.xorptr(boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
3164                        
3165          // If the CAS fails we can either retry or pass control to the slow-path.  
3166          // We use the latter tactic.  
3167          // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3168          // If the CAS was successful ...
3169          //   Self has acquired the lock
3170          //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3171          // Intentional fall-through into DONE_LABEL ...
3172       } else {
3173          masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
3174          masm.movptr(boxReg, tmpReg) ; 
3175 
3176          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
3177          if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3178             // prefetchw [eax + Offset(_owner)-2]
3179             masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
3180          }
3181 
3182          if ((EmitSync & 64) == 0) {
3183            // Optimistic form
3184            masm.xorptr  (tmpReg, tmpReg) ; 
3185          } else { 
3186            // Can suffer RTS->RTO upgrades on shared or cold $ lines
3187            masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
3188            masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
3189            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
3190          }
3191 
3192          // Appears unlocked - try to swing _owner from null to non-null.
3193          // Use either "Self" (in scr) or rsp as thread identity in _owner.
3194          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
3195          masm.get_thread (scrReg) ;
3196          if (os::is_MP()) { masm.lock(); }
3197          masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
3198 
3199          // If the CAS fails we can either retry or pass control to the slow-path.
3200          // We use the latter tactic.
3201          // Pass the CAS result in the icc.ZFlag into DONE_LABEL
3202          // If the CAS was successful ...
3203          //   Self has acquired the lock
3204          //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
3205          // Intentional fall-through into DONE_LABEL ...
3206       }
3207 
3208       // DONE_LABEL is a hot target - we'd really like to place it at the
3209       // start of a cache line by padding with NOPs.
3210       // See the AMD and Intel software optimization manuals for the
3211       // most efficient "long" NOP encodings.
3212       // Unfortunately none of our alignment mechanisms suffice.
3213       masm.bind(DONE_LABEL);
3214 
3215       // Avoid branch-to-branch on AMD processors
3216       // This appears to be superstition.
3217       if (EmitSync & 32) masm.nop() ;
3218 
3219 
3220       // At DONE_LABEL the icc ZFlag is set as follows ...
3221       // Fast_Unlock uses the same protocol.
3222       // ZFlag == 1 -> Success
3223       // ZFlag == 0 -> Failure - force control through the slow-path
3224     }
3225   %}
3226 
3227   // obj: object to unlock
3228   // box: box address (displaced header location), killed.  Must be EAX.
3229   // rbx,: killed tmp; cannot be obj nor box.
3230   //
3231   // Some commentary on balanced locking:
3232   //
3233   // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
3234   // Methods that don't have provably balanced locking are forced to run in the
3235   // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
3236   // The interpreter provides two properties:
3237   // I1:  At return-time the interpreter automatically and quietly unlocks any
3238   //      objects acquired in the current activation (frame).  Recall that the
3239   //      interpreter maintains an on-stack list of locks currently held by
3240   //      a frame.
3241   // I2:  If a method attempts to unlock an object that is not held by
3242   //      the frame, the interpreter throws IMSX (IllegalMonitorStateException).
3243   //
3244   // Let's say A(), which has provably balanced locking, acquires O and then calls B().
3245   // B() doesn't have provably balanced locking so it runs in the interpreter.
3246   // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
3247   // is still locked by A().
3248   //
3249   // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
3250   // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
3251   // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
3252   // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
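  // Concretely, "provably balanced" refers to the bytecode shape javac emits
  // for a synchronized block, with matching enter/exit on every path,
  // including the exceptional ones (a sketch):
  //
  //   monitorenter O
  //   ...                    // body; the exception handler also performs
  //   monitorexit  O         // monitorexit O before rethrowing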
3253 
3254   enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
3255 
3256     Register objReg = as_Register($obj$$reg);
3257     Register boxReg = as_Register($box$$reg);
3258     Register tmpReg = as_Register($tmp$$reg);
3259 
3260     guarantee (objReg != boxReg, "") ;
3261     guarantee (objReg != tmpReg, "") ;
3262     guarantee (boxReg != tmpReg, "") ;
3263     guarantee (boxReg == as_Register(EAX_enc), "") ;
3264     MacroAssembler masm(&cbuf);
3265 
3266     if (EmitSync & 4) {
3267       // Disable - inhibit all inlining.  Force control through the slow-path
3268       masm.cmpptr (rsp, 0) ; 
3269     } else 
3270     if (EmitSync & 8) {
3271       Label DONE_LABEL ;
3272       if (UseBiasedLocking) {
3273          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3274       }
3275       // classic stack-locking code ...
3276       masm.movptr(tmpReg, Address(boxReg, 0)) ;
3277       masm.testptr(tmpReg, tmpReg) ;
3278       masm.jcc   (Assembler::zero, DONE_LABEL) ;
3279       if (os::is_MP()) { masm.lock(); }
3280       masm.cmpxchgptr(tmpReg, Address(objReg, 0));          // Uses EAX which is box
3281       masm.bind(DONE_LABEL);
3282     } else {
3283       Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
3284 
3285       // Critically, the biased locking test must have precedence over
3286       // and appear before the (box->dhw == 0) recursive stack-lock test.
3287       if (UseBiasedLocking && !UseOptoBiasInlining) {
3288          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3289       }
3290       
3291       masm.cmpptr(Address(boxReg, 0), 0) ;            // Examine the displaced header
3292       masm.movptr(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
3293       masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock
3294 
3295       masm.testptr(tmpReg, 0x02) ;                     // Inflated? 
3296       masm.jccb  (Assembler::zero, Stacked) ;
3297 
3298       masm.bind  (Inflated) ;
3299       // It's inflated.
3300       // Despite our balanced locking property we still check that m->_owner == Self
3301       // as java routines or native JNI code called by this thread might
3302       // have released the lock.
3303       // Refer to the comments in synchronizer.cpp for how we might encode extra
3304       // state in _succ so we can avoid fetching EntryList|cxq.
3305       //
3306       // I'd like to add more cases in fast_lock() and fast_unlock() --
3307       // such as recursive enter and exit -- but we have to be wary of
3308       // I$ bloat, T$ effects and BP$ effects.
3309       //
3310       // If there's no contention try a 1-0 exit.  That is, exit without
3311       // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
3312       // we detect and recover from the race that the 1-0 exit admits.
3313       //
3314       // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
3315       // before it STs null into _owner, releasing the lock.  Updates
3316       // to data protected by the critical section must be visible before
3317       // we drop the lock (and thus before any other thread could acquire
3318       // the lock and observe the fields protected by the lock).
3319       // IA32's memory-model is SPO, so STs are ordered with respect to
3320       // each other and there's no need for an explicit barrier (fence).
3321       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
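      // Schematically, the release requirement is:
      //   ST <data protected by the lock>
      //   ST m->_owner = NULL       // must not float above the data STs
      // and IA32's store ordering provides exactly this without a fence.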
3322 
3323       masm.get_thread (boxReg) ;
3324       if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
3325         // prefetchw [ebx + Offset(_owner)-2]
3326         masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3327       }
3328 
3329       // Note that we could employ various encoding schemes to reduce
3330       // the number of loads below (currently 4) to just 2 or 3.
3331       // Refer to the comments in synchronizer.cpp.
3332       // In practice the chain of fetches doesn't seem to impact performance, however.
3333       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3334          // Attempt to reduce branch density - AMD's branch predictor.
3335          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3336          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3337          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3338          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3339          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3340          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3341          masm.jmpb  (DONE_LABEL) ; 
3342       } else { 
3343          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3344          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3345          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3346          masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3347          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3348          masm.jccb  (Assembler::notZero, CheckSucc) ; 
3349          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3350          masm.jmpb  (DONE_LABEL) ; 
3351       }
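      // Both variants above compute the same predicate:
      //   (_owner ^ Self) | _recursions    -- 0 iff we own the lock, non-recursively
      //   _EntryList | _cxq                -- 0 iff no thread is queued
      // and fall into the 1-0 exit (the plain ST of NULL into _owner, with no
      // CAS) only when the relevant terms are all zero.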
3352 
3353       // The following code fragment (EmitSync & 65536) improves the performance of
3354       // contended applications and contended synchronization microbenchmarks.
3355       // Unfortunately the emission of the code - even though not executed - causes regressions
3356       // in scimark and jetstream, evidently because of $ effects.  Replacing the code
3357       // with an equal number of never-executed NOPs results in the same regression.
3358       // We leave it off by default.
3359 
3360       if ((EmitSync & 65536) != 0) {
3361          Label LSuccess, LGoSlowPath ;
3362 
3363          masm.bind  (CheckSucc) ;
3364 
3365          // Optional pre-test ... it's safe to elide this
3366          if ((EmitSync & 16) == 0) { 
3367             masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3368             masm.jccb  (Assembler::zero, LGoSlowPath) ; 
3369          }
3370 
3371          // We have a classic Dekker-style idiom:
3372          //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
3373          // There are a number of ways to implement the barrier:
3374          // (1) lock:andl &m->_owner, 0
3375          //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
3376          //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
3377          //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3378          // (2) If supported, an explicit MFENCE is appealing.
3379          //     In older IA32 processors MFENCE is slower than lock:add or xchg
3380          //     particularly if the write-buffer is full, as might be the case
3381          //     if stores closely precede the fence or fence-equivalent instruction.
3382          //     In more modern implementations MFENCE appears faster, however.
3383          // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3384          //     The $lines underlying the top-of-stack should be in M-state.
3385          //     The locked add instruction is serializing, of course.
3386          // (4) Use xchg, which is serializing
3387          //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3388          // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3389          //     The integer condition codes will tell us if succ was 0.
3390          //     Since _succ and _owner should reside in the same $line and
3391          //     we just stored into _owner, it's likely that the $line
3392          //     remains in M-state for the lock:orl.
3393          //
3394          // We currently use (3), although it's likely that switching to (2)
3395          // is correct for the future.
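         // For reference, the race the ST;MEMBAR;LD sequence below guards
         // against, roughly (see ObjectMonitor::exit in synchronizer.cpp):
         //
         //   exiting thread               contending thread
         //   ST  m->_owner = NULL         ST  m->_succ = Self
         //   MEMBAR                       MEMBAR
         //   LD  m->_succ                 LD  m->_owner
         //
         // If the exiter then sees a non-null _succ it can leave the wakeup to
         // that thread; otherwise it must take the slow path and wake a
         // successor itself.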
3396             
3397          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3398          if (os::is_MP()) { 
3399             if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
3400               masm.mfence();
3401             } else { 
3402               masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
3403             }
3404          }
3405          // Ratify _succ remains non-null
3406          masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3407          masm.jccb  (Assembler::notZero, LSuccess) ; 
3408 
3409          masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
3410          if (os::is_MP()) { masm.lock(); }
3411          masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3412          masm.jccb  (Assembler::notEqual, LSuccess) ;
3413          // Since we're low on registers we installed rsp as a placeholder in _owner.
3414          // Now install Self over rsp.  This is safe as we're transitioning from
3415          // non-null to non-null.
3416          masm.get_thread (boxReg) ;
3417          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
3418          // Intentional fall-through into LGoSlowPath ...
3419 
3420          masm.bind  (LGoSlowPath) ; 
3421          masm.orptr(boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
3422          masm.jmpb  (DONE_LABEL) ; 
3423 
3424          masm.bind  (LSuccess) ; 
3425          masm.xorptr(boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
3426          masm.jmpb  (DONE_LABEL) ; 
3427       }
3428 
3429       masm.bind (Stacked) ;
3430       // It's not inflated and it's not recursively stack-locked and it's not biased.
3431       // It must be stack-locked.
3432       // Try to reset the header to displaced header.
3433       // The "box" value on the stack is stable, so we can reload
3434       // and be assured we observe the same value as above.
3435       masm.movptr(tmpReg, Address(boxReg, 0)) ;
3436       if (os::is_MP()) {   masm.lock();    }
3437       masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
3438       // Intentional fall-through into DONE_LABEL
3439 
3440 
3441       // DONE_LABEL is a hot target - we'd really like to place it at the
3442       // start of a cache line by padding with NOPs.
3443       // See the AMD and Intel software optimization manuals for the
3444       // most efficient "long" NOP encodings.
3445       // Unfortunately none of our alignment mechanisms suffice.
3446       if ((EmitSync & 65536) == 0) {
3447          masm.bind (CheckSucc) ;
3448       }
3449       masm.bind(DONE_LABEL);
3450 
3451       // Avoid branch to branch on AMD processors
3452       if (EmitSync & 32768) { masm.nop() ; }
3453     }
3454   %}
3455 
3456 
3457   enc_class enc_pop_rdx() %{
3458     emit_opcode(cbuf,0x5A);
3459   %}
3460 
3461   enc_class enc_rethrow() %{
3462     cbuf.set_insts_mark();
3463     emit_opcode(cbuf, 0xE9);        // jmp    entry
3464     emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.insts_end())-4,
3465                    runtime_call_Relocation::spec(), RELOC_IMM32 );
3466   %}
3467 
3468 
3469   // Convert a double to an int.  Java semantics require we do complex
3470   // manipulations in the corner cases.  So we set the rounding mode to
3471   // 'zero', store the darned double down as an int, and reset the
3472   // rounding mode to 'nearest'.  The hardware raises an exception whose
3473   // handler patches the correct value directly into the stack slot.
3474   enc_class DPR2I_encoding( regDPR src ) %{
3475     // Flip to round-to-zero mode.  We attempted to allow invalid-op
3476     // exceptions here, so that a NaN or other corner-case value will


13140   ins_encode();
13141   ins_pipe( empty );
13142 %}
13143 
13144 
13145 // Rethrow exception:
13146 // The exception oop will come in the first argument position.
13147 // Then JUMP (not call) to the rethrow stub code.
13148 instruct RethrowException()
13149 %{
13150   match(Rethrow);
13151 
13152   // use the following format syntax
13153   format %{ "JMP    rethrow_stub" %}
13154   ins_encode(enc_rethrow);
13155   ins_pipe( pipe_jmp );
13156 %}
13157 
13158 // inlined locking and unlocking
13159 
13160 
13161 instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
13162   match( Set cr (FastLock object box) );
13163   effect( TEMP tmp, TEMP scr, USE_KILL box );
13164   ins_cost(300);
13165   format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
13166   ins_encode( Fast_Lock(object,box,tmp,scr) );
13167   ins_pipe( pipe_slow );
13168 %}
13169 
13170 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
13171   match( Set cr (FastUnlock object box) );
13172   effect( TEMP tmp, USE_KILL box );
13173   ins_cost(300);
13174   format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
13175   ins_encode( Fast_Unlock(object,box,tmp) );
13176   ins_pipe( pipe_slow );
13177 %}
13178 
13179 
13180 
13181 // ============================================================================
13182 // Safepoint Instruction
13183 instruct safePoint_poll(eFlagsReg cr) %{
13184   match(SafePoint);
13185   effect(KILL cr);
13186 
13187   // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
13188   // On SPARC that might be acceptable as we can generate the address with
13189   // just a sethi, saving an or.  By polling at offset 0 we can end up
13190   // putting additional pressure on index 0 of the D$.  Because of
13191   // alignment (just like the situation at hand) the lower indices tend
13192   // to see more traffic.  It'd be better to change the polling address
13193   // to offset 0 of the last $line in the polling page.
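  // For reference, a sketch of the mechanism: the poll below is just a load
  // from a reserved polling page.  To bring threads to a safepoint the VM
  // protects that page, the poll faults, and the signal handler dispatches
  // the thread to the safepoint machinery.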
13194 
13195   format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
13196   ins_cost(125);

src/cpu/x86/vm/x86_32.ad (new version, after changeset 8033805)

2901     emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
2902     // CMP    $tmp,$src.lo
2903     emit_opcode( cbuf, 0x3B );
2904     emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
2905     // SBB    $tmp,$src.hi
2906     emit_opcode( cbuf, 0x1B );
2907     emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
2908   %}
2909 
2910  // Sniff, sniff... smells like Gnu Superoptimizer
2911   enc_class neg_long( eRegL dst ) %{
2912     emit_opcode(cbuf,0xF7);    // NEG hi
2913     emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2914     emit_opcode(cbuf,0xF7);    // NEG lo
2915     emit_rm    (cbuf,0x3, 0x3,               $dst$$reg );
2916     emit_opcode(cbuf,0x83);    // SBB hi,0
2917     emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
2918     emit_d8    (cbuf,0 );
2919   %}
2920 
2921   enc_class enc_pop_rdx() %{
2922     emit_opcode(cbuf,0x5A);
2923   %}
2924 
2925   enc_class enc_rethrow() %{
2926     cbuf.set_insts_mark();
2927     emit_opcode(cbuf, 0xE9);        // jmp    entry
2928     emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.insts_end())-4,
2929                    runtime_call_Relocation::spec(), RELOC_IMM32 );
2930   %}
2931 
2932 
2933   // Convert a double to an int.  Java semantics require we do complex
2934   // manipulations in the corner cases.  So we set the rounding mode to
2935   // 'zero', store the darned double down as an int, and reset the
2936   // rounding mode to 'nearest'.  The hardware raises an exception whose
2937   // handler patches the correct value directly into the stack slot.
2938   enc_class DPR2I_encoding( regDPR src ) %{
2939     // Flip to round-to-zero mode.  We attempted to allow invalid-op
2940     // exceptions here, so that a NaN or other corner-case value will


12604   ins_encode();
12605   ins_pipe( empty );
12606 %}
12607 
12608 
12609 // Rethrow exception:
12610 // The exception oop will come in the first argument position.
12611 // Then JUMP (not call) to the rethrow stub code.
12612 instruct RethrowException()
12613 %{
12614   match(Rethrow);
12615 
12616   // use the following format syntax
12617   format %{ "JMP    rethrow_stub" %}
12618   ins_encode(enc_rethrow);
12619   ins_pipe( pipe_jmp );
12620 %}
12621 
12622 // inlined locking and unlocking
12623 
12624 instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
12625   match(Set cr (FastLock object box));
12626   effect(TEMP tmp, TEMP scr, USE_KILL box);
12627   ins_cost(300);
12628   format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
12629   ins_encode %{
12630     __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
12631   %}
12632   ins_pipe(pipe_slow);
12633 %}
12634 
12635 instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
12636   match(Set cr (FastUnlock object box));
12637   effect(TEMP tmp, USE_KILL box);
12638   ins_cost(300);
12639   format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
12640   ins_encode %{
12641     __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
12642   %}
12643   ins_pipe(pipe_slow);
12644 %}
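// Note: with this change the bodies of the old Fast_Lock/Fast_Unlock
// enc_classes move into MacroAssembler::fast_lock and MacroAssembler::fast_unlock
// (macroAssembler_x86.cpp), presumably so the 32-bit and 64-bit ports can share
// one implementation; the instructs above now just marshal registers into them.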
12645 
12646 
12647 
12648 // ============================================================================
12649 // Safepoint Instruction
12650 instruct safePoint_poll(eFlagsReg cr) %{
12651   match(SafePoint);
12652   effect(KILL cr);
12653 
12654   // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
12655   // On SPARC that might be acceptable as we can generate the address with
12656   // just a sethi, saving an or.  By polling at offset 0 we can end up
12657   // putting additional pressure on index 0 of the D$.  Because of
12658   // alignment (just like the situation at hand) the lower indices tend
12659   // to see more traffic.  It'd be better to change the polling address
12660   // to offset 0 of the last $line in the polling page.
12661 
12662   format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
12663   ins_cost(125);

