hotspot/src/cpu/x86/vm/x86_32.ad

rev 611 : Merge

@@ -1,7 +1,7 @@
 //
-// Copyright 1997-2007 Sun Microsystems, Inc.  All Rights Reserved.
+// Copyright 1997-2008 Sun Microsystems, Inc.  All Rights Reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
 // under the terms of the GNU General Public License version 2 only, as
 // published by the Free Software Foundation.

@@ -234,11 +234,11 @@
 
 //----------SOURCE BLOCK-------------------------------------------------------
 // This is a block of C++ code which provides values, functions, and
 // definitions necessary in the rest of the architecture description
 source %{
-#define   RELOC_IMM32    Assembler::imm32_operand
+#define   RELOC_IMM32    Assembler::imm_operand
 #define   RELOC_DISP32   Assembler::disp32_operand
 
 #define __ _masm.
 
 // How to find the high register of a Long pair, given the low register

@@ -493,12 +493,12 @@
 //=============================================================================
 #ifndef PRODUCT
 void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
   Compile* C = ra_->C;
   if( C->in_24_bit_fp_mode() ) {
-    tty->print("FLDCW  24 bit fpu control word");
-    tty->print_cr(""); tty->print("\t"); 
+    st->print("FLDCW  24 bit fpu control word");
+    st->print_cr(""); st->print("\t");
   }
 
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp

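The prologue arithmetic above converts frame slots to bytes and then subtracts the two words that CALL and PUSHL EBP have already pushed. A minimal standalone sketch of that computation, assuming the usual 32-bit values for LogBytesPerInt, wordSize, and StackAlignmentInBytes (the VM defines these elsewhere; names here are illustrative):

    #include <cassert>
    #include <cstdio>

    // Assumed 32-bit x86 constants; illustrative only.
    const int LogBytesPerInt        = 2;   // 4-byte slots
    const int wordSize              = 4;
    const int StackAlignmentInBytes = 16;

    // Bytes the prologue's SUB ESP must cover for a given slot count.
    int prolog_frame_size(int frame_slots) {
      int framesize = frame_slots << LogBytesPerInt;   // slots -> bytes
      assert((framesize & (StackAlignmentInBytes - 1)) == 0);
      // The return address (pushed by CALL) and the saved EBP
      // (PUSHL EBP) already occupy two of those words.
      return framesize - 2 * wordSize;
    }

    int main() {
      printf("SUB    ESP,%d\t# Create frame\n", prolog_frame_size(8));
      return 0;
    }
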
@@ -508,26 +508,26 @@
  // We require that their callers bang for them.  But be careful, because
   // some VM calls (such as call site linkage) can use several kilobytes of
   // stack.  But the stack safety zone should account for that.
   // See bugs 4446381, 4468289, 4497237.
   if (C->need_stack_bang(framesize)) {
-    tty->print_cr("# stack bang"); tty->print("\t"); 
+    st->print_cr("# stack bang"); st->print("\t");
   }
-  tty->print_cr("PUSHL  EBP"); tty->print("\t");
+  st->print_cr("PUSHL  EBP"); st->print("\t");
 
   if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
-    tty->print("PUSH   0xBADB100D\t# Majik cookie for stack depth check");
-    tty->print_cr(""); tty->print("\t"); 
+    st->print("PUSH   0xBADB100D\t# Majik cookie for stack depth check");
+    st->print_cr(""); st->print("\t");
     framesize -= wordSize;  
   }
 
   if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
     if (framesize) {
-      tty->print("SUB    ESP,%d\t# Create frame",framesize);
+      st->print("SUB    ESP,%d\t# Create frame",framesize);
     }
   } else {
-    tty->print("SUB    ESP,%d\t# Create frame",framesize);
+    st->print("SUB    ESP,%d\t# Create frame",framesize);
   }
 }
 #endif
 
 

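For context on the "# stack bang" line: methods with large frames probe (bang) the pages of the new frame up front so a stack overflow faults at a known point instead of at an arbitrary store deep inside the method. A hedged sketch of the idea, assuming a 4 KB page and a downward-growing stack; the real probe is emitted as stores through [ESP - offset]:

    #include <cstddef>

    const size_t page_size = 4096;  // assumed page size

    // Touch one byte in every page the new frame will cover, from the
    // current stack pointer downward, so the guard page traps here.
    void bang_stack(volatile char* sp, size_t framesize) {
      for (size_t off = page_size; off <= framesize; off += page_size)
        sp[-(ptrdiff_t)off] = 0;
    }
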
@@ -591,15 +591,15 @@
 
 #ifdef ASSERT 
   if (VerifyStackAtCalls) { 
     Label L;
     MacroAssembler masm(&cbuf);
-    masm.pushl(rax);
-    masm.movl(rax, rsp);
-    masm.andl(rax, StackAlignmentInBytes-1);
-    masm.cmpl(rax, StackAlignmentInBytes-wordSize);
-    masm.popl(rax);
+    masm.push(rax);
+    masm.mov(rax, rsp);
+    masm.andptr(rax, StackAlignmentInBytes-1);
+    masm.cmpptr(rax, StackAlignmentInBytes-wordSize);
+    masm.pop(rax);
     masm.jcc(Assembler::equal, L);
     masm.stop("Stack is not properly aligned!");
     masm.bind(L);
   }
 #endif

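The five instructions in this hunk implement a flags-preserving alignment check: the PUSH lowers ESP by one word, so a properly aligned frame shows up as ESP mod 16 == 16 - 4 at the compare. The predicate, restated as plain C++ (constants assumed, as above):

    #include <cstdint>

    const uint32_t StackAlignmentInBytes = 16;  // assumed
    const uint32_t wordSize              = 4;

    // esp_after_push is ESP as sampled by 'mov rax, rsp' right after
    // the 'push rax' above.
    bool stack_properly_aligned(uint32_t esp_after_push) {
      return (esp_after_push & (StackAlignmentInBytes - 1))
             == StackAlignmentInBytes - wordSize;
    }
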
@@ -723,32 +723,33 @@
   }
   assert(r->is_XMMRegister(), "must be");
   return rc_xmm;
 }
 
-static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg, int opcode, const char *op_str, int size ) {
+static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg,
+                        int opcode, const char *op_str, int size, outputStream* st ) {
   if( cbuf ) {
     emit_opcode  (*cbuf, opcode );
     encode_RegMem(*cbuf, Matcher::_regEncode[reg], ESP_enc, 0x4, 0, offset, false);
 #ifndef PRODUCT
   } else if( !do_size ) { 
-    if( size != 0 ) tty->print("\n\t"); 
+    if( size != 0 ) st->print("\n\t");
     if( opcode == 0x8B || opcode == 0x89 ) { // MOV
-      if( is_load ) tty->print("%s   %s,[ESP + #%d]",op_str,Matcher::regName[reg],offset); 
-      else          tty->print("%s   [ESP + #%d],%s",op_str,offset,Matcher::regName[reg]); 
+      if( is_load ) st->print("%s   %s,[ESP + #%d]",op_str,Matcher::regName[reg],offset);
+      else          st->print("%s   [ESP + #%d],%s",op_str,offset,Matcher::regName[reg]);
     } else { // FLD, FST, PUSH, POP
-      tty->print("%s [ESP + #%d]",op_str,offset); 
+      st->print("%s [ESP + #%d]",op_str,offset);
     }
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
   return size+3+offset_size;
 }
 
 // Helper for XMM registers.  Extra opcode bits, limited syntax.
 static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load, 
-                         int offset, int reg_lo, int reg_hi, int size ) {
+                         int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
   if( cbuf ) {
     if( reg_lo+1 == reg_hi ) { // double move?
       if( is_load && !UseXmmLoadAndClearUpper )
         emit_opcode(*cbuf, 0x66 ); // use 'movlpd' for load
       else

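impl_helper's size bookkeeping reflects x86 encoding: an [ESP + disp] operand always needs opcode + ModRM + SIB, plus zero, one, or four displacement bytes. The same computation, isolated (spill offsets here are non-negative, hence the simple test):

    // Mirrors 'offset_size' and the 'size+3+offset_size' return in
    // impl_helper above.
    int esp_rel_insn_size(int offset) {
      int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
      return 3 + offset_size;  // opcode + ModRM + SIB + disp8/disp32
    }
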
@@ -762,47 +763,47 @@
     else
       emit_opcode(*cbuf, is_load ? 0x10 : 0x11 );
     encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
 #ifndef PRODUCT
   } else if( !do_size ) { 
-    if( size != 0 ) tty->print("\n\t"); 
+    if( size != 0 ) st->print("\n\t");
     if( reg_lo+1 == reg_hi ) { // double move?
-      if( is_load ) tty->print("%s %s,[ESP + #%d]",
+      if( is_load ) st->print("%s %s,[ESP + #%d]",
                                UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
                                Matcher::regName[reg_lo], offset); 
-      else          tty->print("MOVSD  [ESP + #%d],%s", 
+      else          st->print("MOVSD  [ESP + #%d],%s",
                                offset, Matcher::regName[reg_lo]); 
     } else {
-      if( is_load ) tty->print("MOVSS  %s,[ESP + #%d]", 
+      if( is_load ) st->print("MOVSS  %s,[ESP + #%d]",
                                Matcher::regName[reg_lo], offset); 
-      else          tty->print("MOVSS  [ESP + #%d],%s", 
+      else          st->print("MOVSS  [ESP + #%d],%s",
                                offset, Matcher::regName[reg_lo]); 
     }
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
   return size+5+offset_size;
 }
 
 
 static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo, 
-                            int src_hi, int dst_hi, int size ) {
+                            int src_hi, int dst_hi, int size, outputStream* st ) {
   if( UseXmmRegToRegMoveAll ) {//Use movaps,movapd to move between xmm registers
     if( cbuf ) {
       if( (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ) {
         emit_opcode(*cbuf, 0x66 );
       }
       emit_opcode(*cbuf, 0x0F );
       emit_opcode(*cbuf, 0x28 );
       emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
 #ifndef PRODUCT
     } else if( !do_size ) { 
-      if( size != 0 ) tty->print("\n\t"); 
+      if( size != 0 ) st->print("\n\t");
       if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
-        tty->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]); 
+        st->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
-        tty->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]); 
+        st->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
 #endif
     }
     return size + ((src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 4 : 3);
   } else {

@@ -811,44 +812,45 @@
       emit_opcode(*cbuf, 0x0F );
       emit_opcode(*cbuf, 0x10 );
       emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
 #ifndef PRODUCT
     } else if( !do_size ) { 
-      if( size != 0 ) tty->print("\n\t"); 
+      if( size != 0 ) st->print("\n\t");
       if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
-        tty->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]); 
+        st->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
-        tty->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]); 
+        st->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
 #endif
     }
     return size+4;
   }
 }
 
-static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size ) {
+static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
   if( cbuf ) {
     emit_opcode(*cbuf, 0x8B );
     emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst], Matcher::_regEncode[src] );
 #ifndef PRODUCT
   } else if( !do_size ) { 
-    if( size != 0 ) tty->print("\n\t"); 
-    tty->print("MOV    %s,%s",Matcher::regName[dst],Matcher::regName[src]); 
+    if( size != 0 ) st->print("\n\t");
+    st->print("MOV    %s,%s",Matcher::regName[dst],Matcher::regName[src]);
 #endif
   }
   return size+2;
 }
 
-static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi, int offset, int size ) {
+static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi,
+                                 int offset, int size, outputStream* st ) {
   if( src_lo != FPR1L_num ) {      // Move value to top of FP stack, if not already there
     if( cbuf ) {
       emit_opcode( *cbuf, 0xD9 );  // FLD (i.e., push it)
       emit_d8( *cbuf, 0xC0-1+Matcher::_regEncode[src_lo] );
 #ifndef PRODUCT
     } else if( !do_size ) { 
-      if( size != 0 ) tty->print("\n\t"); 
-      tty->print("FLD    %s",Matcher::regName[src_lo]);
+      if( size != 0 ) st->print("\n\t");
+      st->print("FLD    %s",Matcher::regName[src_lo]);
 #endif
     }
     size += 2;
   }
 

@@ -862,11 +864,11 @@
     op_str = (src_lo != FPR1L_num) ? "FSTP_S" : "FST_S ";
     op = 0xD9;
     assert( !OptoReg::is_valid(src_hi) && !OptoReg::is_valid(dst_hi), "no non-adjacent float-stores" );
   }
 
-  return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size);
+  return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size, st);
 }
 
 uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
   // Get registers to move 
   OptoReg::Name src_second = ra_->get_reg_second(in(1));

@@ -890,36 +892,36 @@
   // --------------------------------------
   // Check for mem-mem move.  push/pop to move.
   if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
     if( src_second == dst_first ) { // overlapping stack copy ranges
       assert( src_second_rc == rc_stack && dst_second_rc == rc_stack, "we only expect a stk-stk copy here" );
-      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size);
-      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size);
+      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
+      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
       src_second_rc = dst_second_rc = rc_bad;  // flag as already moved the second bits
     }
     // move low bits
-    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),ESI_num,0xFF,"PUSH  ",size);
-    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),EAX_num,0x8F,"POP   ",size);
+    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),ESI_num,0xFF,"PUSH  ",size, st);
+    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),EAX_num,0x8F,"POP   ",size, st);
     if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) { // mov second bits
-      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size);
-      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size);
+      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
+      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
     }
     return size;
   }
 
   // --------------------------------------
   // Check for integer reg-reg copy
   if( src_first_rc == rc_int && dst_first_rc == rc_int )
-    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,size);
+    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,size, st);
   
   // Check for integer store
   if( src_first_rc == rc_int && dst_first_rc == rc_stack )
-    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size);
+    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size, st);
 
   // Check for integer load
   if( dst_first_rc == rc_int && src_first_rc == rc_stack )
-    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size);
+    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size, st);
 
   // --------------------------------------
   // Check for float reg-reg copy
   if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
     assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||

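The PUSH/POP pairing above exists because x86 has no memory-to-memory MOV: a stack-slot-to-stack-slot spill copy has to bounce through the hardware stack. A C analogue of what one such pair moves (one 32-bit word; names illustrative):

    #include <cstring>

    void stack_slot_copy(char* frame, int src_off, int dst_off) {
      int tmp;                                         // the implicit PUSH/POP temp
      std::memcpy(&tmp, frame + src_off, sizeof tmp);  // PUSH [ESP+src_off]
      std::memcpy(frame + dst_off, &tmp, sizeof tmp);  // POP  [ESP+dst_off]
    }
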
@@ -949,11 +951,11 @@
     return size + ((src_first != FPR1L_num) ? 2+2 : 2);
   }
   
   // Check for float store
   if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
-    return impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,ra_->reg2offset(dst_first),size);
+    return impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,ra_->reg2offset(dst_first),size, st);
   }
 
   // Check for float load
   if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
     int offset = ra_->reg2offset(src_first);

@@ -985,21 +987,21 @@
   // Check for xmm reg-reg copy
   if( src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
     assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
             (src_first+1 == src_second && dst_first+1 == dst_second), 
             "no non-adjacent float-moves" );
-    return impl_movx_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size);
+    return impl_movx_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size, st);
   }
 
   // Check for xmm store
   if( src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
-    return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size);
+    return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size, st);
   }
 
   // Check for float xmm load
   if( dst_first_rc == rc_xmm && src_first_rc == rc_stack ) {
-    return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size);
+    return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size, st);
   }
 
   // Copy from float reg to xmm reg
   if( dst_first_rc == rc_xmm && src_first_rc == rc_float ) {
     // copy to the top of stack from floating point reg

@@ -1015,14 +1017,14 @@
       st->print("LEA    ESP,[ESP-8]");
 #endif
     }
     size += 4;
 
-    size = impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,0,size);
+    size = impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,0,size, st);
 
     // Copy from the temp memory to the xmm reg.
-    size = impl_x_helper(cbuf,do_size,true ,0,dst_first, dst_second, size);
+    size = impl_x_helper(cbuf,do_size,true ,0,dst_first, dst_second, size, st);
 
     if( cbuf ) {
       emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP+8]
       emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
       emit_rm(*cbuf, 0x0, 0x04, ESP_enc);

@@ -1045,19 +1047,19 @@
     return size;               // Self copy; no move
   assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );
 
   // Check for second word int-int move
   if( src_second_rc == rc_int && dst_second_rc == rc_int )
-    return impl_mov_helper(cbuf,do_size,src_second,dst_second,size);
+    return impl_mov_helper(cbuf,do_size,src_second,dst_second,size, st);
 
   // Check for second word integer store
   if( src_second_rc == rc_int && dst_second_rc == rc_stack )
-    return impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),src_second,0x89,"MOV ",size);
+    return impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),src_second,0x89,"MOV ",size, st);
 
   // Check for second word integer load
   if( dst_second_rc == rc_int && src_second_rc == rc_stack )
-    return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size);
+    return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size, st);
 
 
   Unimplemented();
 }
 

@@ -1148,11 +1150,12 @@
   if (base == NULL)  return;  // CodeBuffer::expand failed
   // static stub relocation stores the instruction address of the call
   __ relocate(static_stub_Relocation::spec(mark), RELOC_IMM32);
   // static stub relocation also tags the methodOop in the code-stream.
   __ movoop(rbx, (jobject)NULL);  // method is zapped till fixup time
-  __ jump(RuntimeAddress((address)-1));
+  // This is recognized as unresolved by relocs/nativeInst/ic code
+  __ jump(RuntimeAddress(__ pc()));
 
   __ end_a_stub();
   // Update current stubs pointer and restore code_end.
 }
 // size of call stub, compiled java to interpreter

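The replacement of jump(RuntimeAddress((address)-1)) with a jump to __ pc() works because a rel32 JMP stores target minus the address of the next instruction: a jump to its own first byte always encodes displacement -5, a fixed pattern the relocation and native-instruction code can treat as "unresolved". A quick check of that arithmetic, assuming the 5-byte E9 rel32 encoding:

    #include <cassert>
    #include <cstdint>

    int32_t jmp_rel32_disp(uintptr_t jmp_addr, uintptr_t target) {
      return (int32_t)(target - (jmp_addr + 5));  // rel32 is next-IP relative
    }

    int main() {
      assert(jmp_rel32_disp(0x1000, 0x1000) == -5);  // self-jump marker
      return 0;
    }
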
@@ -1179,11 +1182,11 @@
 void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   MacroAssembler masm(&cbuf);
 #ifdef ASSERT
   uint code_size = cbuf.code_size();
 #endif
-  masm.cmpl(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
+  masm.cmpptr(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
   masm.jump_cc(Assembler::notEqual,
                RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
   /* WARNING these NOPs are critical so that verified entry point is properly
      aligned for patching by NativeJump::patch_verified_entry() */
   int nops_cnt = 2;

@@ -1315,11 +1318,15 @@
 
 // Is this branch offset short enough that a short branch can be used?
 //
 // NOTE: If the platform does not provide any short branch variants, then
 //       this method should return false for offset 0.
-bool Matcher::is_short_branch_offset(int offset) {
+bool Matcher::is_short_branch_offset(int rule, int offset) {
+  // the short version of jmpConUCF2 contains multiple branches,
+  // making the reach slightly less
+  if (rule == jmpConUCF2_rule)
+    return (-126 <= offset && offset <= 125);
   return (-128 <= offset && offset <= 127);
 }
 
 const bool Matcher::isSimpleConstant64(jlong value) {
   // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.

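A short (rel8) branch reaches [-128, 127] from the end of the branch; since the short form of jmpConUCF2 emits an extra two-byte branch first, its usable window shrinks by two bytes on each side. The new rule-sensitive check, restated with the rule test reduced to a bool for illustration:

    bool is_short_branch_offset(bool is_jmpConUCF2, int offset) {
      if (is_jmpConUCF2)
        return -126 <= offset && offset <= 125;  // two-branch short form
      return -128 <= offset && offset <= 127;    // ordinary rel8 reach
    }
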
@@ -1685,24 +1692,24 @@
 
     MacroAssembler _masm(&cbuf);
     // Compare super with sub directly, since super is not in its own SSA.
     // The compiler used to emit this test, but we fold it in here,
     // to allow platform-specific tweaking on sparc.
-    __ cmpl(Reax, Resi);
+    __ cmpptr(Reax, Resi);
     __ jcc(Assembler::equal, hit);
 #ifndef PRODUCT
-    __ increment(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
+    __ incrementl(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
 #endif //PRODUCT
-    __ movl(Redi,Address(Resi,sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()));
+    __ movptr(Redi,Address(Resi,sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()));
     __ movl(Recx,Address(Redi,arrayOopDesc::length_offset_in_bytes()));
-    __ addl(Redi,arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+    __ addptr(Redi,arrayOopDesc::base_offset_in_bytes(T_OBJECT));
     __ repne_scan();
     __ jcc(Assembler::notEqual, miss);
-    __ movl(Address(Resi,sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()),Reax);
+    __ movptr(Address(Resi,sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()),Reax);
     __ bind(hit);
     if( $primary )
-      __ xorl(Redi,Redi);
+      __ xorptr(Redi,Redi);
     __ bind(miss);
   %}
 
   enc_class FFree_Float_Stack_All %{    // Free_Float_Stack_All
     MacroAssembler masm(&cbuf);

@@ -1747,19 +1754,19 @@
         // mode the result needs to be removed from the FPU stack.  It's
         // likely that this function call could be removed by the
         // optimizer if the C function is a pure function.
         __ ffree(0);
       } else if (rt == T_FLOAT) {
-        __ leal(rsp, Address(rsp, -4));
+        __ lea(rsp, Address(rsp, -4));
         __ fstp_s(Address(rsp, 0));
         __ movflt(xmm0, Address(rsp, 0));
-        __ leal(rsp, Address(rsp,  4));
+        __ lea(rsp, Address(rsp,  4));
       } else if (rt == T_DOUBLE) {
-        __ leal(rsp, Address(rsp, -8));
+        __ lea(rsp, Address(rsp, -8));
         __ fstp_d(Address(rsp, 0));
         __ movdbl(xmm0, Address(rsp, 0));
-        __ leal(rsp, Address(rsp,  8));
+        __ lea(rsp, Address(rsp,  8));
       }
     }
   %}
 
 

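There is no direct x87-to-XMM move, so the T_FLOAT/T_DOUBLE paths above spill ST(0) to a scratch stack slot and reload it into xmm0. LEA is used for the ESP adjustment because, unlike ADD/SUB, it leaves the condition codes untouched. A rough C analogue of the data movement in the double case (illustrative only):

    #include <cstring>

    double x87_to_xmm(long double st0) {       // value sitting in ST(0)
      double slot = (double)st0;               // FSTP_D [ESP]: store + round
      double xmm0;
      std::memcpy(&xmm0, &slot, sizeof xmm0);  // MOVSD xmm0,[ESP]
      return xmm0;
    }
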
@@ -2886,14 +2893,14 @@
 
     __ jccb(Assembler::parity, nan);
     __ jccb(Assembler::equal,  done);
     __ jccb(Assembler::above,  inc);
     __ bind(nan);
-    __ decrement(as_Register($dst$$reg));
+    __ decrement(as_Register($dst$$reg)); // NO L qqq
     __ jmpb(done);
     __ bind(inc);
-    __ increment(as_Register($dst$$reg));
+    __ increment(as_Register($dst$$reg)); // NO L qqq
     __ bind(done);
   %}
 
   // Compare the longs and set flags
   // BROKEN!  Do Not use as-is

@@ -3156,11 +3163,11 @@
   %}
 
   enc_class mov_i2x(regXD dst, eRegI src) %{
     MacroAssembler _masm(&cbuf);
 
-    __ movd(as_XMMRegister($dst$$reg), as_Register($src$$reg));
+    __ movdl(as_XMMRegister($dst$$reg), as_Register($src$$reg));
   %}
 
 
   // Because the transitions from emitted code to the runtime 
   // monitorenter/exit helper stubs are so slow it's critical that 

@@ -3257,30 +3264,30 @@
       masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
     }
     if (EmitSync & 1) {
         // set box->dhw = unused_mark (3)
         // Force all sync thru slow-path: slow_enter() and slow_exit() 
-        masm.movl (Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())) ;             
-        masm.cmpl (rsp, 0) ;                        
+        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;             
+        masm.cmpptr (rsp, (int32_t)0) ;                        
     } else 
     if (EmitSync & 2) { 
         Label DONE_LABEL ;           
         if (UseBiasedLocking) {
            // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
            masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
         }
 
-        masm.movl  (tmpReg, Address(objReg, 0)) ;          // fetch markword 
-        masm.orl   (tmpReg, 0x1);
-        masm.movl  (Address(boxReg, 0), tmpReg);           // Anticipate successful CAS 
+        masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword 
+        masm.orptr (tmpReg, 0x1);
+        masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS 
         if (os::is_MP()) { masm.lock();  }
-        masm.cmpxchg(boxReg, Address(objReg, 0));          // Updates tmpReg
+        masm.cmpxchgptr(boxReg, Address(objReg, 0));          // Updates tmpReg
         masm.jcc(Assembler::equal, DONE_LABEL);
         // Recursive locking
-        masm.subl(tmpReg, rsp);
-        masm.andl(tmpReg, 0xFFFFF003 );
-        masm.movl(Address(boxReg, 0), tmpReg);
+        masm.subptr(tmpReg, rsp);
+        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
+        masm.movptr(Address(boxReg, 0), tmpReg);
         masm.bind(DONE_LABEL) ; 
     } else {  
       // Possible cases that we'll encounter in fast_lock 
       // ------------------------------------------------
       // * Inflated

@@ -3304,33 +3311,33 @@
       // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
       // order to reduce the number of conditional branches in the most common cases.  
       // Beware -- there's a subtle invariant that fetch of the markword
       // at [FETCH], below, will never observe a biased encoding (*101b).
       // If this invariant is not held we risk exclusion (safety) failure.
-      if (UseBiasedLocking) { 
+      if (UseBiasedLocking && !UseOptoBiasInlining) {
         masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
       }
 
-      masm.movl  (tmpReg, Address(objReg, 0)) ;        // [FETCH]
-      masm.testl (tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
+      masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
+      masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
       masm.jccb  (Assembler::notZero, IsInflated) ;
 
       // Attempt stack-locking ...
-      masm.orl   (tmpReg, 0x1);
-      masm.movl  (Address(boxReg, 0), tmpReg);            // Anticipate successful CAS
+      masm.orptr (tmpReg, 0x1);
+      masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
       if (os::is_MP()) { masm.lock();  }
-      masm.cmpxchg(boxReg, Address(objReg, 0));           // Updates tmpReg
+      masm.cmpxchgptr(boxReg, Address(objReg, 0));           // Updates tmpReg
       if (_counters != NULL) {
         masm.cond_inc32(Assembler::equal,
                         ExternalAddress((address)_counters->fast_path_entry_count_addr()));
       }
       masm.jccb (Assembler::equal, DONE_LABEL);
 
       // Recursive locking
-      masm.subl(tmpReg, rsp);
-      masm.andl(tmpReg, 0xFFFFF003 );
-      masm.movl(Address(boxReg, 0), tmpReg);
+      masm.subptr(tmpReg, rsp);
+      masm.andptr(tmpReg, 0xFFFFF003 );
+      masm.movptr(Address(boxReg, 0), tmpReg);
       if (_counters != NULL) {
         masm.cond_inc32(Assembler::equal,
                         ExternalAddress((address)_counters->fast_path_entry_count_addr()));
       }
       masm.jmp  (DONE_LABEL) ;

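The recursive-locking triplet (subptr/andptr/movptr) runs after a failed stack-lock CAS, when the mark word holds a pointer to a BasicLock on some thread's stack. If that pointer is a short, word-aligned distance above our ESP it lies in our own frame, so the masked result is zero and storing it as the displaced header records a recursive lock. The predicate behind the 0xFFFFF003 mask, as a sketch (the mask keeps the two tag bits plus every bit at or above the 4 KB distance):

    #include <cstdint>

    bool is_recursive_stack_lock(uint32_t mark, uint32_t esp) {
      // Zero iff (mark - esp) < 4096, 4-byte aligned, with tag bits 00.
      return ((mark - esp) & 0xFFFFF003u) == 0;
    }
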
@@ -3359,38 +3366,35 @@
       //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
       // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
       // additional latency as we have another ST in the store buffer that must drain.  
 
       if (EmitSync & 8192) { 
-         masm.movl  (Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
+         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
          masm.get_thread (scrReg) ; 
-         masm.movl  (boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
-         masm.movl  (tmpReg, 0);                         // consider: xor vs mov
+         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
+         masm.movptr(tmpReg, 0);                         // consider: xor vs mov
          if (os::is_MP()) { masm.lock(); } 
-         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
+         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
       } else 
       if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
-         masm.movl (scrReg, boxReg) ; 
-         masm.movl (boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
+         masm.movptr(scrReg, boxReg) ; 
+         masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 
 
          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
          if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
             // prefetchw [eax + Offset(_owner)-2] 
-            masm.emit_raw (0x0F) ; 
-            masm.emit_raw (0x0D) ; 
-            masm.emit_raw (0x48) ; 
-            masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ; 
+            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
          }
 
          if ((EmitSync & 64) == 0) { 
            // Optimistic form: consider XORL tmpReg,tmpReg
-           masm.movl  (tmpReg, 0 ) ; 
+           masm.movptr(tmpReg, 0 ) ; 
          } else { 
            // Can suffer RTS->RTO upgrades on shared or cold $ lines
            // Test-And-CAS instead of CAS
-           masm.movl  (tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
-           masm.testl (tmpReg, tmpReg) ;                   // Locked ? 
+           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax = m->_owner
+           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
          }
 
          // Appears unlocked - try to swing _owner from null to non-null. 
          // Ideally, I'd manifest "Self" with get_thread and then attempt

@@ -3399,53 +3403,50 @@
          // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
          // we later store "Self" into m->Owner.  Transiently storing a stack address 
          // (rsp or the address of the box) into  m->owner is harmless.  
          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.  
          if (os::is_MP()) { masm.lock();  }
-         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
-         masm.movl  (Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
+         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
+         masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
          masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
-         masm.movl  (Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; 
-         masm.xorl  (boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
+         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; 
+         masm.xorptr(boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
                        
          // If the CAS fails we can either retry or pass control to the slow-path.  
          // We use the latter tactic.  
          // Pass the CAS result in the icc.ZFlag into DONE_LABEL
          // If the CAS was successful ...
          //   Self has acquired the lock
          //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
          // Intentional fall-through into DONE_LABEL ...
       } else {
-         masm.movl (Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
-         masm.movl (boxReg, tmpReg) ; 
+         masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
+         masm.movptr(boxReg, tmpReg) ; 
 
          // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
          if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
             // prefetchw [eax + Offset(_owner)-2] 
-            masm.emit_raw (0x0F) ; 
-            masm.emit_raw (0x0D) ; 
-            masm.emit_raw (0x48) ; 
-            masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ; 
+            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
          }
 
          if ((EmitSync & 64) == 0) { 
            // Optimistic form
-           masm.xorl  (tmpReg, tmpReg) ; 
+           masm.xorptr  (tmpReg, tmpReg) ; 
          } else { 
            // Can suffer RTS->RTO upgrades on shared or cold $ lines
-           masm.movl  (tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
-           masm.testl (tmpReg, tmpReg) ;                   // Locked ? 
+           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax = m->_owner
+           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
            masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
          }
 
          // Appears unlocked - try to swing _owner from null to non-null. 
          // Use either "Self" (in scr) or rsp as thread identity in _owner. 
          // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.  
          masm.get_thread (scrReg) ; 
          if (os::is_MP()) { masm.lock(); } 
-         masm.cmpxchg (scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
+         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
 
          // If the CAS fails we can either retry or pass control to the slow-path.  
          // We use the latter tactic.  
          // Pass the CAS result in the icc.ZFlag into DONE_LABEL
          // If the CAS was successful ...

@@ -3512,38 +3513,38 @@
     guarantee (boxReg == as_Register(EAX_enc), "") ; 
     MacroAssembler masm(&cbuf);
 
     if (EmitSync & 4) { 
       // Disable - inhibit all inlining.  Force control through the slow-path
-      masm.cmpl (rsp, 0) ; 
+      masm.cmpptr (rsp, 0) ; 
     } else 
     if (EmitSync & 8) {
       Label DONE_LABEL ; 
       if (UseBiasedLocking) {
          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
       }
       // classic stack-locking code ...
-      masm.movl  (tmpReg, Address(boxReg, 0)) ;
-      masm.testl (tmpReg, tmpReg) ;
+      masm.movptr(tmpReg, Address(boxReg, 0)) ;
+      masm.testptr(tmpReg, tmpReg) ;
       masm.jcc   (Assembler::zero, DONE_LABEL) ;
       if (os::is_MP()) { masm.lock(); }
-      masm.cmpxchg(tmpReg, Address(objReg, 0));          // Uses EAX which is box
+      masm.cmpxchgptr(tmpReg, Address(objReg, 0));          // Uses EAX which is box
       masm.bind(DONE_LABEL);
     } else {
       Label DONE_LABEL, Stacked, CheckSucc, Inflated ; 
 
       // Critically, the biased locking test must have precedence over
       // and appear before the (box->dhw == 0) recursive stack-lock test.  
-      if (UseBiasedLocking) {
+      if (UseBiasedLocking && !UseOptoBiasInlining) {
          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
       }
       
-      masm.cmpl  (Address(boxReg, 0), 0) ;            // Examine the displaced header
-      masm.movl  (tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
+      masm.cmpptr(Address(boxReg, 0), 0) ;            // Examine the displaced header
+      masm.movptr(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
       masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock
 
-      masm.testl (tmpReg, 0x02) ;                     // Inflated? 
+      masm.testptr(tmpReg, 0x02) ;                     // Inflated? 
       masm.jccb  (Assembler::zero, Stacked) ;
 
       masm.bind  (Inflated) ; 
       // It's inflated.
       // Despite our balanced locking property we still check that m->_owner == Self 

@@ -3570,37 +3571,34 @@
       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.   
 
       masm.get_thread (boxReg) ; 
       if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
          // prefetchw [ebx + Offset(_owner)-2] 
-         masm.emit_raw (0x0F) ; 
-         masm.emit_raw (0x0D) ; 
-         masm.emit_raw (0x4B) ; 
-         masm.emit_raw (ObjectMonitor::owner_offset_in_bytes()-2) ; 
+        masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
       }
        
       // Note that we could employ various encoding schemes to reduce
       // the number of loads below (currently 4) to just 2 or 3.  
       // Refer to the comments in synchronizer.cpp.
       // In practice the chain of fetches doesn't seem to impact performance, however.
       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { 
          // Attempt to reduce branch density - AMD's branch predictor.
-         masm.xorl  (boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
-         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
-         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
-         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
+         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
+         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
+         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
+         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
-         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
+         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
          masm.jmpb  (DONE_LABEL) ; 
       } else { 
-         masm.xorl  (boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
-         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
+         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
+         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
-         masm.movl  (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
-         masm.orl   (boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
+         masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
+         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
          masm.jccb  (Assembler::notZero, CheckSucc) ; 
-         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
+         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
          masm.jmpb  (DONE_LABEL) ; 
       }
 
      // The following code fragment (EmitSync & 65536) improves the performance of
       // contended applications and contended synchronization microbenchmarks.  

@@ -3614,11 +3612,11 @@
 
          masm.bind  (CheckSucc) ;
 
          // Optional pre-test ... it's safe to elide this
          if ((EmitSync & 16) == 0) { 
-            masm.cmpl  (Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
+            masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
             masm.jccb  (Assembler::zero, LGoSlowPath) ; 
          }
 
          // We have a classic Dekker-style idiom:
          //    ST m->_owner = 0 ; MEMBAR; LD m->_succ

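The exit protocol discussed here is the classic Dekker duality: ST m->_owner = 0, a StoreLoad barrier, then LD m->_succ to ratify that a successor still exists. A C++11 analogue of that ordering (names illustrative; the emitted fence is MFENCE or lock addl [esp],0, as the next hunk shows):

    #include <atomic>

    struct Monitor {
      std::atomic<void*> owner;
      std::atomic<void*> succ;
    };

    bool release_and_check_succ(Monitor& m) {
      m.owner.store(nullptr, std::memory_order_release);          // ST _owner = 0
      std::atomic_thread_fence(std::memory_order_seq_cst);        // StoreLoad barrier
      return m.succ.load(std::memory_order_acquire) != nullptr;   // LD _succ
    }
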
@@ -3644,53 +3642,51 @@
          //     remains in M-state for the lock:orl.
          //
          // We currently use (3), although it's likely that switching to (2) 
          // is correct for the future.
             
-         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
+         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), 0) ; 
          if (os::is_MP()) { 
             if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
-              masm.emit_raw (0x0F) ;    // MFENCE ...
-              masm.emit_raw (0xAE) ; 
-              masm.emit_raw (0xF0) ; 
+              masm.mfence();
             } else { 
-              masm.lock () ; masm.addl (Address(rsp, 0), 0) ; 
+              masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
             }
          }
          // Ratify _succ remains non-null
-         masm.cmpl  (Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
+         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
          masm.jccb  (Assembler::notZero, LSuccess) ; 
 
-         masm.xorl  (boxReg, boxReg) ;                  // box is really EAX
+         masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
          if (os::is_MP()) { masm.lock(); }
-         masm.cmpxchg(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+         masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
          masm.jccb  (Assembler::notEqual, LSuccess) ;
         // Since we're low on registers we installed rsp as a placeholder in _owner.
         // Now install Self over rsp.  This is safe as we're transitioning from
         // non-null to non-null.
          masm.get_thread (boxReg) ;
-         masm.movl  (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
+         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
          // Intentional fall-through into LGoSlowPath ...
 
          masm.bind  (LGoSlowPath) ; 
-         masm.orl   (boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
+         masm.orptr(boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
          masm.jmpb  (DONE_LABEL) ; 
 
          masm.bind  (LSuccess) ; 
-         masm.xorl  (boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
+         masm.xorptr(boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
          masm.jmpb  (DONE_LABEL) ; 
       }
   
       masm.bind (Stacked) ;
       // It's not inflated and it's not recursively stack-locked and it's not biased. 
       // It must be stack-locked. 
       // Try to reset the header to displaced header.
       // The "box" value on the stack is stable, so we can reload
       // and be assured we observe the same value as above.
-      masm.movl (tmpReg, Address(boxReg, 0)) ;
+      masm.movptr(tmpReg, Address(boxReg, 0)) ;
       if (os::is_MP()) {   masm.lock();    }
-      masm.cmpxchg(tmpReg, Address(objReg, 0)); // Uses EAX which is box
+      masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
      // Intentional fall-through into DONE_LABEL
 
       
       // DONE_LABEL is a hot target - we'd really like to place it at the
       // start of cache line by padding with NOPs.

@@ -3718,34 +3714,34 @@
     int value_offset  = java_lang_String::value_offset_in_bytes();
     int offset_offset = java_lang_String::offset_offset_in_bytes();
     int count_offset  = java_lang_String::count_offset_in_bytes();
     int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
 
-    masm.movl(rax, Address(rsi, value_offset));
+    masm.movptr(rax, Address(rsi, value_offset));
     masm.movl(rcx, Address(rsi, offset_offset));
-    masm.leal(rax, Address(rax, rcx, Address::times_2, base_offset));
-    masm.movl(rbx, Address(rdi, value_offset));
+    masm.lea(rax, Address(rax, rcx, Address::times_2, base_offset));
+    masm.movptr(rbx, Address(rdi, value_offset));
     masm.movl(rcx, Address(rdi, offset_offset));
-    masm.leal(rbx, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.lea(rbx, Address(rbx, rcx, Address::times_2, base_offset));
 
     // Compute the minimum of the string lengths(rsi) and the
     // difference of the string lengths (stack)
 
 
     if (VM_Version::supports_cmov()) {
       masm.movl(rdi, Address(rdi, count_offset));
       masm.movl(rsi, Address(rsi, count_offset));
       masm.movl(rcx, rdi);
       masm.subl(rdi, rsi);
-      masm.pushl(rdi);
+      masm.push(rdi);
       masm.cmovl(Assembler::lessEqual, rsi, rcx);
     } else {
       masm.movl(rdi, Address(rdi, count_offset));
       masm.movl(rcx, Address(rsi, count_offset));
       masm.movl(rsi, rdi);
       masm.subl(rdi, rcx);
-      masm.pushl(rdi);
+      masm.push(rdi);
       masm.jcc(Assembler::lessEqual, ECX_GOOD_LABEL);
       masm.movl(rsi, rcx);
       // rsi holds min, rcx is unused
     }
                 

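The min/difference dance above (subl + push + CMOV or branch) sets up the usual compareTo contract: scan up to min(len1, len2) characters, and if none differ, the answer is the stored length difference. A scalar restatement (names illustrative):

    int string_compare(const unsigned short* s1, int len1,
                       const unsigned short* s2, int len2) {
      int diff = len1 - len2;                   // the PUSH above
      int min  = (len1 <= len2) ? len1 : len2;  // the CMOV/branch above
      for (int i = 0; i < min; i++) {
        int d = (int)s1[i] - (int)s2[i];
        if (d != 0) return d;                   // POP_LABEL path
      }
      return diff;                              // LENGTH_DIFF_LABEL path
    }
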
@@ -3759,18 +3755,18 @@
     masm.load_unsigned_word(rdi, Address(rax, 0));
     
     // Compare first characters
     masm.subl(rcx, rdi);
     masm.jcc(Assembler::notZero,  POP_LABEL);
-    masm.decrement(rsi);
+    masm.decrementl(rsi);
     masm.jcc(Assembler::zero, LENGTH_DIFF_LABEL);
 
     {
       // Check after comparing first character to see if strings are equivalent
       Label LSkip2;
       // Check if the strings start at same location
-      masm.cmpl(rbx,rax);
+      masm.cmpptr(rbx,rax);
       masm.jcc(Assembler::notEqual, LSkip2);
     
       // Check if the length difference is zero (from stack)
       masm.cmpl(Address(rsp, 0), 0x0);
       masm.jcc(Assembler::equal,  LENGTH_DIFF_LABEL);

@@ -3778,31 +3774,103 @@
       // Strings might not be equivalent
       masm.bind(LSkip2);
     }
 
    // Shift rax and rbx to the end of the arrays, negate min
-    masm.leal(rax, Address(rax, rsi, Address::times_2, 2));
-    masm.leal(rbx, Address(rbx, rsi, Address::times_2, 2));
+    masm.lea(rax, Address(rax, rsi, Address::times_2, 2));
+    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 2));
     masm.negl(rsi);
 
     // Compare the rest of the characters
     masm.bind(WHILE_HEAD_LABEL);
     masm.load_unsigned_word(rcx, Address(rbx, rsi, Address::times_2, 0));
     masm.load_unsigned_word(rdi, Address(rax, rsi, Address::times_2, 0));
     masm.subl(rcx, rdi);
     masm.jcc(Assembler::notZero, POP_LABEL);
-    masm.increment(rsi);
+    masm.incrementl(rsi);
     masm.jcc(Assembler::notZero, WHILE_HEAD_LABEL);
     
     // Strings are equal up to min length.  Return the length difference.
     masm.bind(LENGTH_DIFF_LABEL);
-    masm.popl(rcx);
+    masm.pop(rcx);
     masm.jmp(DONE_LABEL);
 
     // Discard the stored length difference
     masm.bind(POP_LABEL);
-    masm.addl(rsp, 4);
+    masm.addptr(rsp, 4);
+       
+    // That's it
+    masm.bind(DONE_LABEL);
+  %}
+
+  enc_class enc_Array_Equals(eDIRegP ary1, eSIRegP ary2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result) %{
+    Label TRUE_LABEL, FALSE_LABEL, DONE_LABEL, COMPARE_LOOP_HDR, COMPARE_LOOP;
+    MacroAssembler masm(&cbuf);
+
+    Register ary1Reg   = as_Register($ary1$$reg);
+    Register ary2Reg   = as_Register($ary2$$reg);
+    Register tmp1Reg   = as_Register($tmp1$$reg);
+    Register tmp2Reg   = as_Register($tmp2$$reg);
+    Register resultReg = as_Register($result$$reg);
+
+    int length_offset  = arrayOopDesc::length_offset_in_bytes();
+    int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // Check the input args
+    masm.cmpl(ary1Reg, ary2Reg);
+    masm.jcc(Assembler::equal, TRUE_LABEL);
+    masm.testl(ary1Reg, ary1Reg);
+    masm.jcc(Assembler::zero, FALSE_LABEL);
+    masm.testl(ary2Reg, ary2Reg);
+    masm.jcc(Assembler::zero, FALSE_LABEL);
+
+    // Check the lengths
+    masm.movl(tmp2Reg, Address(ary1Reg, length_offset));
+    masm.movl(resultReg, Address(ary2Reg, length_offset));
+    masm.cmpl(tmp2Reg, resultReg);
+    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    masm.testl(resultReg, resultReg);
+    masm.jcc(Assembler::zero, TRUE_LABEL);
+
+    // Get the number of 4 byte vectors to compare
+    masm.shrl(resultReg, 1);
+
+    // Check for odd-length arrays
+    masm.andl(tmp2Reg, 1);
+    masm.testl(tmp2Reg, tmp2Reg);
+    masm.jcc(Assembler::zero, COMPARE_LOOP_HDR);
+
+    // Compare 2-byte "tail" at end of arrays
+    masm.load_unsigned_word(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
+    masm.load_unsigned_word(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
+    masm.cmpl(tmp1Reg, tmp2Reg);
+    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    masm.testl(resultReg, resultReg);
+    masm.jcc(Assembler::zero, TRUE_LABEL);
+
+    // Setup compare loop
+    masm.bind(COMPARE_LOOP_HDR);
+    // Shift tmp1Reg and tmp2Reg to the last 4-byte boundary of the arrays
+    masm.leal(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
+    masm.leal(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
+    masm.negl(resultReg);
+
+    // 4-byte-wide compare loop
+    masm.bind(COMPARE_LOOP);
+    masm.movl(ary1Reg, Address(tmp1Reg, resultReg, Address::times_4, 0));
+    masm.movl(ary2Reg, Address(tmp2Reg, resultReg, Address::times_4, 0));
+    masm.cmpl(ary1Reg, ary2Reg);
+    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    masm.increment(resultReg);
+    masm.jcc(Assembler::notZero, COMPARE_LOOP);
+
+    masm.bind(TRUE_LABEL);
+    masm.movl(resultReg, 1);   // return true
+    masm.jmp(DONE_LABEL);
+
+    masm.bind(FALSE_LABEL);
+    masm.xorl(resultReg, resultReg); // return false
        
     // That's it
     masm.bind(DONE_LABEL);
   %}
 

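enc_Array_Equals compares char arrays four bytes (two chars) at a time, peeling one 2-byte tail compare when the length is odd. A scalar restatement of the same shape, assuming raw pointer/length inputs rather than arrayOops:

    #include <cstdint>
    #include <cstring>

    bool array_equals(const unsigned short* a1, int len1,
                      const unsigned short* a2, int len2) {
      if (a1 == a2) return true;                      // same array
      if (a1 == nullptr || a2 == nullptr) return false;
      if (len1 != len2) return false;                 // length check
      if ((len1 & 1) && a1[len1 - 1] != a2[len1 - 1]) // odd length: 2-byte tail
        return false;
      for (int i = 0; i < (len1 >> 1); i++) {         // 4-byte-wide compare loop
        uint32_t w1, w2;
        std::memcpy(&w1, a1 + 2 * i, sizeof w1);
        std::memcpy(&w2, a2 + 2 * i, sizeof w2);
        if (w1 != w2) return false;
      }
      return true;
    }
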
@@ -4241,11 +4309,12 @@
     // masm.membar();
   %}
 
   enc_class enc_membar_volatile %{
     MacroAssembler masm(&cbuf);
-    masm.membar();
+    masm.membar(Assembler::Membar_mask_bits(Assembler::StoreLoad |
+                                            Assembler::StoreStore));
   %}
 
   // Atomically load the volatile long
   enc_class enc_loadL_volatile( memory mem, stackSlotL dst ) %{
     emit_opcode(cbuf,0xDF);

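The widened membar call reflects x86's TSO model: of the four orderings only StoreLoad can actually be violated, so the barrier after a volatile store needs just the StoreLoad (and trivially satisfied StoreStore) bits, letting the assembler emit a single locked instruction or MFENCE rather than a maximal barrier. A rough C++ analogue:

    #include <atomic>

    inline void membar_volatile() {
      // StoreLoad is the only reordering x86 permits; one full fence
      // (MFENCE or lock addl [esp],0) covers it.
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }
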
@@ -4536,12 +4605,12 @@
   %}
 
   // Location of C & interpreter return values
   c_return_value %{
     assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
-    static int lo[Op_RegL+1] = { 0, 0, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
-    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
+    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
+    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
 
     // in SSE2+ mode we want to keep the FPU stack clean so pretend
     // that C functions return float and double results in XMM0.
     if( ideal_reg == Op_RegD && UseSSE>=2 )
       return OptoRegPair(XMM0b_num,XMM0a_num);

@@ -4552,12 +4621,12 @@
   %}
 
   // Location of return values
   return_value %{
     assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
-    static int lo[Op_RegL+1] = { 0, 0, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
-    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
+    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
+    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
     if( ideal_reg == Op_RegD && UseSSE>=2 )
       return OptoRegPair(XMM0b_num,XMM0a_num);
     if( ideal_reg == Op_RegF && UseSSE>=1 )
       return OptoRegPair(OptoReg::Bad,XMM0a_num);
     return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);

@@ -4680,10 +4749,37 @@
 
   format %{ %}
   interface(CONST_INTER);
 %}
 
+operand immI_1() %{
+  predicate( n->get_int() == 1 );
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immI_2() %{
+  predicate( n->get_int() == 2 );
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immI_3() %{
+  predicate( n->get_int() == 3 );
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Pointer Immediate
 operand immP() %{
   match(ConP);
 
   op_cost(10);

@@ -4718,10 +4814,20 @@
 
   format %{ %}
   interface(CONST_INTER);
 %}
 
+// Long Immediate -1
+operand immL_M1() %{
+  predicate( n->get_long() == -1L );
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long immediate from 0 to 127.
 // Used for a shorter form of long mul by 10.
 operand immL_127() %{
   predicate((0 <= n->get_long()) && (n->get_long() <= 127));
   match(ConL);

@@ -5170,10 +5276,19 @@
 
   format %{ "EFLAGS_U" %}
   interface(REG_INTER);
 %}
 
+operand eFlagsRegUCF() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+  predicate(false);
+
+  format %{ "EFLAGS_U_CF" %}
+  interface(REG_INTER);
+%}
+
 // Condition Code Register used by long compare
 operand flagsReg_long_LTGE() %{
   constraint(ALLOC_IN_RC(int_flags));
   match(RegFlags);
   format %{ "FLAGS_LTGE" %}

@@ -5647,16 +5762,16 @@
 operand cmpOp() %{
   match(Bool);
 
   format %{ "" %}
   interface(COND_INTER) %{
-    equal(0x4);
-    not_equal(0x5);
-    less(0xC);
-    greater_equal(0xD);
-    less_equal(0xE);
-    greater(0xF);
+    equal(0x4, "e");
+    not_equal(0x5, "ne");
+    less(0xC, "l");
+    greater_equal(0xD, "ge");
+    less_equal(0xE, "le");
+    greater(0xF, "g");
   %}
 %}
 
 // Comparison Code, unsigned compare.  Used by FP also, with
 // C2 (unordered) turned into GT or LT already.  The other bits

@@ -5664,16 +5779,51 @@
 operand cmpOpU() %{
   match(Bool);
 
   format %{ "" %}
   interface(COND_INTER) %{
-    equal(0x4);
-    not_equal(0x5);
-    less(0x2);
-    greater_equal(0x3);
-    less_equal(0x6);
-    greater(0x7);
+    equal(0x4, "e");
+    not_equal(0x5, "ne");
+    less(0x2, "b");
+    greater_equal(0x3, "nb");
+    less_equal(0x6, "be");
+    greater(0x7, "nbe");
+  %}
+%}
+
+// Floating comparisons that don't require any fixup for the unordered case
+operand cmpOpUCF() %{
+  match(Bool);
+  predicate(n->as_Bool()->_test._test == BoolTest::lt ||
+            n->as_Bool()->_test._test == BoolTest::ge ||
+            n->as_Bool()->_test._test == BoolTest::le ||
+            n->as_Bool()->_test._test == BoolTest::gt);
+  format %{ "" %}
+  interface(COND_INTER) %{
+    equal(0x4, "e");
+    not_equal(0x5, "ne");
+    less(0x2, "b");
+    greater_equal(0x3, "nb");
+    less_equal(0x6, "be");
+    greater(0x7, "nbe");
+  %}
+%}
+
+
+// Floating comparisons that can be fixed up with extra conditional jumps
+operand cmpOpUCF2() %{
+  match(Bool);
+  predicate(n->as_Bool()->_test._test == BoolTest::ne ||
+            n->as_Bool()->_test._test == BoolTest::eq);
+  format %{ "" %}
+  interface(COND_INTER) %{
+    equal(0x4, "e");
+    not_equal(0x5, "ne");
+    less(0x2, "b");
+    greater_equal(0x3, "nb");
+    less_equal(0x6, "be");
+    greater(0x7, "nbe");
   %}
 %}
 
 // Comparison Code for FP conditional move
 operand cmpOp_fcmov() %{

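The UCF operands can reuse the unsigned condition codes because UCOMISS/UCOMISD report results through ZF/PF/CF the way an unsigned integer compare would, with unordered setting all three. That is why lt/le/ge/gt need no fixup while eq/ne (cmpOpUCF2) still need an extra PF-sensitive branch. The flag settings and two representative condition tests, as a sketch:

    //   result     ZF PF CF
    //   unordered   1  1  1
    //   greater     0  0  0
    //   less        0  0  1
    //   equal       1  0  0
    enum FpCmp { FP_UNORDERED, FP_GREATER, FP_LESS, FP_EQUAL };

    // JB ("b", 0x2) tests CF=1: taken for less *and* unordered, so no
    // separate unordered fixup is needed for lt-style tests.
    bool jb_taken(FpCmp r) { return r == FP_LESS  || r == FP_UNORDERED; }

    // JE ("e", 0x4) tests ZF=1: taken for equal *and* unordered, which
    // is why eq/ne land in cmpOpUCF2 and get the extra branch.
    bool je_taken(FpCmp r) { return r == FP_EQUAL || r == FP_UNORDERED; }
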
@@ -5694,16 +5844,16 @@
 operand cmpOp_commute() %{
   match(Bool);
 
   format %{ "" %}
   interface(COND_INTER) %{
-    equal(0x4);
-    not_equal(0x5);
-    less(0xF);
-    greater_equal(0xE);
-    less_equal(0xD);
-    greater(0xC);
+    equal(0x4, "e");
+    not_equal(0x5, "ne");
+    less(0xF, "g");
+    greater_equal(0xE, "le");
+    less_equal(0xD, "ge");
+    greater(0xC, "l");
   %}
 %}
 
 //----------OPERAND CLASSES----------------------------------------------------
 // Operand Classes are groups of operands that are used to simplify

@@ -7255,20 +7405,29 @@
   opcode(0x0F,0x40);
   ins_encode( enc_cmov(cop), RegReg( dst, src ) );
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_regU( eRegI dst, eRegI src, eFlagsRegU cr, cmpOpU cop ) %{
+instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "CMOV$cop $dst,$src" %}
   opcode(0x0F,0x40);
   ins_encode( enc_cmov(cop), RegReg( dst, src ) );
   ins_pipe( pipe_cmov_reg );
 %}
 
+instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, eRegI src ) %{
+  predicate(VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  expand %{
+    cmovI_regU(cop, cr, dst, src);
+  %}
+%}
+
 // Conditional move
 instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);

@@ -7277,20 +7436,29 @@
   ins_encode( enc_cmov(cop), RegMem( dst, src ) );
   ins_pipe( pipe_cmov_mem );
 %}
 
 // Conditional move
-instruct cmovI_memu(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
+instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
   format %{ "CMOV$cop $dst,$src" %}
   opcode(0x0F,0x40);
   ins_encode( enc_cmov(cop), RegMem( dst, src ) );
   ins_pipe( pipe_cmov_mem );
 %}
 
+instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, memory src) %{
+  predicate(VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
+  ins_cost(250);
+  expand %{
+    cmovI_memU(cop, cr, dst, src);
+  %}
+%}
+
 // Conditional move
 instruct cmovP_reg(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
   ins_cost(200);

@@ -7314,20 +7482,29 @@
   ins_encode( enc_cmov_branch(cop, 0x2), OpcP, RegReg(dst, src));
   ins_pipe( pipe_cmov_reg );
 %}
 
 // Conditional move
-instruct cmovP_regU(eRegP dst, eRegP src, eFlagsRegU cr, cmpOpU cop ) %{
+instruct cmovP_regU(cmpOpU cop, eFlagsRegU cr, eRegP dst, eRegP src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "CMOV$cop $dst,$src\t# ptr" %}
   opcode(0x0F,0x40);
   ins_encode( enc_cmov(cop), RegReg( dst, src ) );
   ins_pipe( pipe_cmov_reg );
 %}
 
+instruct cmovP_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegP dst, eRegP src ) %{
+  predicate(VM_Version::supports_cmov() );
+  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  expand %{
+    cmovP_regU(cop, cr, dst, src);
+  %}
+%}
+
 // DISABLED: Requires the ADLC to emit a bottom_type call that
 // correctly meets the two pointer arguments; one is an incoming
 // register but the other is a memory operand.  ALSO appears to
 // be buggy with implicit null checks.
 //

@@ -7453,10 +7630,19 @@
     __ bind(skip);
   %}
   ins_pipe( pipe_slow );
 %}
 
+instruct fcmovX_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regX dst, regX src) %{
+  predicate (UseSSE>=1);
+  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  expand %{
+    fcmovX_regU(cop, cr, dst, src);
+  %}
+%}
+
 // unsigned version
 instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);

@@ -7471,10 +7657,19 @@
     __ bind(skip);
   %}
   ins_pipe( pipe_slow );
 %}
 
+instruct fcmovXD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regXD dst, regXD src) %{
+  predicate (UseSSE>=2);
+  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  expand %{
+    fcmovXD_regU(cop, cr, dst, src);
+  %}
+%}
+
 instruct cmovL_reg(cmpOp cop, eFlagsReg cr, eRegL dst, eRegL src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "CMOV$cop $dst.lo,$src.lo\n\t"

@@ -7493,10 +7688,19 @@
   opcode(0x0F,0x40);
   ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) ); 
   ins_pipe( pipe_cmov_reg_long );
 %}
 
+instruct cmovL_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegL dst, eRegL src) %{
+  predicate(VM_Version::supports_cmov() );
+  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  expand %{
+    cmovL_regU(cop, cr, dst, src);
+  %}
+%}
+
 //----------Arithmetic Instructions--------------------------------------------
 //----------Addition Instructions----------------------------------------------
 // Integer Addition Instructions
 instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));

@@ -7724,37 +7928,40 @@
   format %{ "CMPXCHG $heap_top_ptr,$newval\t# If EAX==$heap_top_ptr Then store $newval into $heap_top_ptr" %}
   ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval,heap_top_ptr) );
   ins_pipe( pipe_cmpxchg );
 %}
 
-// Conditional-store of a long value
-// Returns a boolean value (0/1) on success.  Implemented with a CMPXCHG8 on Intel.
-// mem_ptr can actually be in either ESI or EDI
-instruct storeLConditional( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
-  match(Set res (StoreLConditional mem_ptr (Binary oldval newval)));
-  effect(KILL cr);
-  // EDX:EAX is killed if there is contention, but then it's also unused.
-  // In the common case of no contention, EDX:EAX holds the new oop address.
-  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
-            "MOV    $res,0\n\t"
-            "JNE,s  fail\n\t"
-            "MOV    $res,1\n"
-          "fail:" %}
-  ins_encode( enc_cmpxchg8(mem_ptr),
-              enc_flags_ne_to_boolean(res) );
+// Conditional-store of an int value.
+// ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG on Intel.
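+// The matcher consumes ZF directly (Set cr ...), so no 0/1 boolean result
+// needs to be materialized in a register.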
+instruct storeIConditional( memory mem, eAXRegI oldval, eRegI newval, eFlagsReg cr ) %{
+  match(Set cr (StoreIConditional mem (Binary oldval newval)));
+  effect(KILL oldval);
+  format %{ "CMPXCHG $mem,$newval\t# If EAX==$mem Then store $newval into $mem" %}
+  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval, mem) );
   ins_pipe( pipe_cmpxchg );
 %}
 
-// Conditional-store of a long value
+// Conditional-store of a long value.
 // ZF flag is set on success, reset otherwise. Implemented with a CMPXCHG8 on Intel.
-// mem_ptr can actually be in either ESI or EDI
-instruct storeLConditional_flags( eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr, immI0 zero ) %{
-  match(Set cr (CmpI (StoreLConditional mem_ptr (Binary oldval newval)) zero));
-  // EDX:EAX is killed if there is contention, but then it's also unused.
-  // In the common case of no contention, EDX:EAX holds the new oop address.
-  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
-  ins_encode( enc_cmpxchg8(mem_ptr) );
+instruct storeLConditional( memory mem, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
+  match(Set cr (StoreLConditional mem (Binary oldval newval)));
+  effect(KILL oldval);
+  format %{ "XCHG   EBX,ECX\t# correct order for CMPXCHG8 instruction\n\t"
+            "CMPXCHG8 $mem,ECX:EBX\t# If EDX:EAX==$mem Then store ECX:EBX into $mem\n\t"
+            "XCHG   EBX,ECX"
+  %}
+  ins_encode %{
+    // Note: we need to swap rbx and rcx before and after the
+    //       cmpxchg8 instruction because the instruction uses
+    //       rcx as the high order word of the new value to store but
+    //       our register encoding uses rbx.
+    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
+    if( os::is_MP() )
+      __ lock();
+    __ cmpxchg8(Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp));
+    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
+  %}
   ins_pipe( pipe_cmpxchg );
 %}
 
 // No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
 

@@ -8217,10 +8424,11 @@
   opcode(0xC1, 0x5);  /* C1 /5 ib */
   ins_encode( RegOpcImm( dst, shift) );
   ins_pipe( ialu_reg );
 %}
 
+
 // Logical Shift Right by 24, followed by Arithmetic Shift Left by 24.
 // This idiom is used by the compiler for the i2b bytecode.
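+// For example, the Java source "byte b = (byte) i" compiles to an i2b, which
+// the compiler parses as the (RShiftI (LShiftI i 24) 24) tree matched here.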
 instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour, eFlagsReg cr) %{
   match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));
   effect(KILL cr);

@@ -8334,10 +8542,22 @@
   opcode(0x0B);
   ins_encode( OpcP, RegReg( dst, src) );
   ins_pipe( ialu_reg_reg );
 %}
 
+instruct orI_eReg_castP2X(eRegI dst, eRegP src, eFlagsReg cr) %{
+  match(Set dst (OrI dst (CastP2X src)));
+  effect(KILL cr);
+
+  size(2);
+  format %{ "OR     $dst,$src" %}
+  opcode(0x0B);
+  ins_encode( OpcP, RegReg( dst, src) );
+  ins_pipe( ialu_reg_reg );
+%}
+
 // Or Register with Immediate
 instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (OrI dst src));
   effect(KILL cr);
 

@@ -8529,10 +8749,22 @@
   opcode(0x33);
   ins_encode( OpcP, RegReg( dst, src) );
   ins_pipe( ialu_reg_reg );
 %}
 
+// Xor Register with Immediate -1
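+// x ^ -1 is ~x, so this encodes as a two-byte NOT; since NOT leaves EFLAGS
+// untouched, the rule can also omit the usual KILL of the flags register.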
+instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+  match(Set dst (XorI dst imm));  
+
+  size(2);
+  format %{ "NOT    $dst" %}  
+  ins_encode %{
+    __ notl($dst$$Register);
+  %}
+  ins_pipe( ialu_reg );
+%}
+
 // Xor Register with Immediate
 instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
   effect(KILL cr);
 

@@ -8846,10 +9078,22 @@
   opcode(0x33,0x33);
   ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
   ins_pipe( ialu_reg_reg_long );
 %}
 
+// Xor Long Register with Immediate -1
+instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
+  match(Set dst (XorL dst imm));  
+  format %{ "NOT    $dst.lo\n\t"
+            "NOT    $dst.hi" %}
+  ins_encode %{
+    __ notl($dst$$Register);
+    __ notl(HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
 // Xor Long Register with Immediate
 instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
   match(Set dst (XorL dst src));
   effect(KILL cr);
   format %{ "XOR    $dst.lo,$src.lo\n\t"

@@ -8869,10 +9113,67 @@
   opcode(0x33,0x33);
   ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
   ins_pipe( ialu_reg_long_mem );
 %}
 
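+// Small constant shifts of a long are cheapest as repeated doubling: ADD
+// doubles the low word, and ADC doubles the high word while folding in the
+// carry out of the low word. Each ADD/ADC pair below is one shift-by-one step.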
+// Shift Left Long by 1
+instruct shlL_eReg_1(eRegL dst, immI_1 cnt, eFlagsReg cr) %{
+  predicate(UseNewLongLShift);
+  match(Set dst (LShiftL dst cnt));
+  effect(KILL cr);
+  ins_cost(100);
+  format %{ "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi" %}
+  ins_encode %{
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
+// Shift Left Long by 2
+instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{
+  predicate(UseNewLongLShift);
+  match(Set dst (LShiftL dst cnt));
+  effect(KILL cr);
+  ins_cost(100);
+  format %{ "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi" %}
+  ins_encode %{
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
+// Shift Left Long by 3
+instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{
+  predicate(UseNewLongLShift);
+  match(Set dst (LShiftL dst cnt));
+  effect(KILL cr);
+  ins_cost(100);
+  format %{ "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADD    $dst.lo,$dst.lo\n\t"
+            "ADC    $dst.hi,$dst.hi" %}
+  ins_encode %{
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+    __ addl($dst$$Register,$dst$$Register);
+    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+  %}
+  ins_pipe( ialu_reg_long );
+%}
+
 // Shift Left Long by 1-31
 instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
   match(Set dst (LShiftL dst cnt));
   effect(KILL cr);
   ins_cost(200);

@@ -9017,10 +9318,22 @@
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
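+// Same compare for consumers that tolerate a raw unordered result
+// (eFlagsRegUCF), so the cmpF_P6_fixup step can be omitted.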
+instruct cmpD_cc_P6CF(eFlagsRegUCF cr, regD src1, regD src2) %{
+  predicate(VM_Version::supports_cmov() && UseSSE <= 1);
+  match(Set cr (CmpD src1 src2));
+  ins_cost(150);
+  format %{ "FLD    $src1\n\t"
+            "FUCOMIP ST,$src2  // P6 instruction" %}
+  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
+  ins_encode( Push_Reg_D(src1),
+              OpcP, RegOpc(src2));
+  ins_pipe( pipe_slow );
+%}
+
 // Compare & branch
 instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
   predicate(UseSSE<=1);
   match(Set cr (CmpD src1 src2));
   effect(KILL rax);

@@ -9081,10 +9394,20 @@
   opcode(0x66, 0x0F, 0x2F);
   ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
   ins_pipe( pipe_slow );
 %}
 
+instruct cmpXD_ccCF(eFlagsRegUCF cr, regXD dst, regXD src) %{
+  predicate(UseSSE>=2);
+  match(Set cr (CmpD dst src));
+  ins_cost(100);
+  format %{ "COMISD $dst,$src" %}
+  opcode(0x66, 0x0F, 0x2F);
+  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_pipe( pipe_slow );
+%}
+
 // float compare and set condition codes in EFLAGS by XMM regs
 instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI rax) %{
   predicate(UseSSE>=2);
   match(Set cr (CmpD dst (LoadD src)));
   effect(KILL rax);

@@ -9097,10 +9420,20 @@
   opcode(0x66, 0x0F, 0x2F);
   ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
   ins_pipe( pipe_slow );
 %}
 
+instruct cmpXD_ccmemCF(eFlagsRegUCF cr, regXD dst, memory src) %{
+  predicate(UseSSE>=2);
+  match(Set cr (CmpD dst (LoadD src)));
+  ins_cost(100);
+  format %{ "COMISD $dst,$src" %}
+  opcode(0x66, 0x0F, 0x2F);
+  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src));
+  ins_pipe( pipe_slow );
+%}
+
 // Compare into -1,0,1 in XMM
 instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
   predicate(UseSSE>=2);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr);

@@ -9984,10 +10317,22 @@
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
+instruct cmpF_cc_P6CF(eFlagsRegUCF cr, regF src1, regF src2) %{
+  predicate(VM_Version::supports_cmov() && UseSSE == 0);
+  match(Set cr (CmpF src1 src2));
+  ins_cost(100);
+  format %{ "FLD    $src1\n\t"
+            "FUCOMIP ST,$src2  // P6 instruction" %}
+  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
+  ins_encode( Push_Reg_D(src1),
+              OpcP, RegOpc(src2));
+  ins_pipe( pipe_slow );
+%}
+
 
 // Compare & branch
 instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
   predicate(UseSSE == 0);
   match(Set cr (CmpF src1 src2));

@@ -10049,10 +10394,20 @@
   opcode(0x0F, 0x2F);
   ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
   ins_pipe( pipe_slow );
 %}
 
+instruct cmpX_ccCF(eFlagsRegUCF cr, regX dst, regX src) %{
+  predicate(UseSSE>=1);
+  match(Set cr (CmpF dst src));
+  ins_cost(100);
+  format %{ "COMISS $dst,$src" %}
+  opcode(0x0F, 0x2F);
+  ins_encode(OpcP, OpcS, RegReg(dst, src));
+  ins_pipe( pipe_slow );
+%}
+
 // float compare and set condition codes in EFLAGS by XMM regs
 instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI rax) %{
   predicate(UseSSE>=1);
   match(Set cr (CmpF dst (LoadF src)));
   effect(KILL rax);

@@ -10065,10 +10420,20 @@
   opcode(0x0F, 0x2F);
   ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
   ins_pipe( pipe_slow );
 %}
 
+instruct cmpX_ccmemCF(eFlagsRegUCF cr, regX dst, memory src) %{
+  predicate(UseSSE>=1);
+  match(Set cr (CmpF dst (LoadF src)));
+  ins_cost(100);
+  format %{ "COMISS $dst,$src" %}
+  opcode(0x0F, 0x2F);
+  ins_encode(OpcP, OpcS, RegMem(dst, src));
+  ins_pipe( pipe_slow );
+%}
+
 // Compare into -1,0,1 in XMM
 instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
   predicate(UseSSE>=1);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr);

@@ -10968,11 +11333,11 @@
   ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 instruct convI2XD_reg(regXD dst, eRegI src) %{
-  predicate( UseSSE>=2 );
+  predicate( UseSSE>=2 && !UseXmmI2D );
   match(Set dst (ConvI2D src));
   format %{ "CVTSI2SD $dst,$src" %}
   opcode(0xF2, 0x0F, 0x2A);  
   ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
   ins_pipe( pipe_slow );

@@ -10985,10 +11350,24 @@
   opcode(0xF2, 0x0F, 0x2A);  
   ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
   ins_pipe( pipe_slow );
 %}
 
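+// Convert an int to a double in xmm via MOVD + packed CVTDQ2PD. Presumably
+// preferred under UseXmmI2D because CVTSI2SD merges into only the low half
+// of $dst (a dependency on its old contents), while this sequence rewrites
+// the whole register.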
+instruct convXI2XD_reg(regXD dst, eRegI src) %{
+  predicate( UseSSE>=2 && UseXmmI2D );
+  match(Set dst (ConvI2D src));
+
+  format %{ "MOVD  $dst,$src\n\t"
+            "CVTDQ2PD $dst,$dst\t# i2d" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ cvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow); // XXX
+%}
+
 instruct convI2D_mem(regD dst, memory mem) %{
   predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2D (LoadI mem)));
   format %{ "FILD   $mem\n\t"
             "FSTP   $dst" %}

@@ -11060,19 +11439,33 @@
   ins_pipe( fpu_reg_mem );
 %}
 
 // Convert an int to a float in xmm; no rounding step needed.
 instruct convI2X_reg(regX dst, eRegI src) %{
-  predicate(UseSSE>=1);
+  predicate( UseSSE==1 || (UseSSE>=2 && !UseXmmI2F) );
   match(Set dst (ConvI2F src));
   format %{ "CVTSI2SS $dst, $src" %}
 
   opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
   ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
   ins_pipe( pipe_slow );
 %}
 
+instruct convXI2X_reg(regX dst, eRegI src) %{
+  predicate( UseSSE>=2 && UseXmmI2F );
+  match(Set dst (ConvI2F src));
+
+  format %{ "MOVD  $dst,$src\n\t"
+            "CVTDQ2PS $dst,$dst\t# i2f" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ cvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow); // XXX
+%}
+
 instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
   match(Set dst (ConvI2L src));
   effect(KILL cr);
   format %{ "MOV    $dst.lo,$src\n\t"
             "MOV    $dst.hi,$src\n\t"

@@ -11535,10 +11928,21 @@
   format %{ "String Compare $str1,$str2 -> $result    // KILL EAX, EBX" %}
   ins_encode( enc_String_Compare() );
   ins_pipe( pipe_slow );
 %}
 
+// fast array equals
+instruct array_equals(eDIRegP ary1, eSIRegP ary2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result, eFlagsReg cr) %{
+  match(Set result (AryEq ary1 ary2));
+  effect(USE_KILL ary1, USE_KILL ary2, KILL tmp1, KILL tmp2, KILL cr);
+
+  format %{ "Array Equals $ary1,$ary2 -> $result    // KILL EAX, EBX" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result) );
+  ins_pipe( pipe_slow );
+%}
+
 //----------Control Flow Instructions------------------------------------------
 // Signed compare Instructions
 instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
   match(Set cr (CmpI op1 op2));
   effect( DEF cr, USE op1, USE op2 );

@@ -11877,21 +12281,89 @@
   ins_encode( Jcc( cop, labl) );
   ins_pipe( pipe_jcc );
   ins_pc_relative(1);
 %}
 
+instruct jmpLoopEndUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(200);
+  format %{ "J$cop,u  $labl\t# Loop end" %}
+  size(6);
+  opcode(0x0F, 0x80);
+  ins_encode( Jcc( cop, labl) );
+  ins_pipe( pipe_jcc );
+  ins_pc_relative(1);
+%}
+
 // Jump Direct Conditional - using unsigned comparison
 instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
   match(If cop cmp);
   effect(USE labl);
 
   ins_cost(300);
   format %{ "J$cop,u  $labl" %}
   size(6);
   opcode(0x0F, 0x80);
-  ins_encode( Jcc( cop, labl) );
-  ins_pipe( pipe_jcc );
+  ins_encode(Jcc(cop, labl));
+  ins_pipe(pipe_jcc);
+  ins_pc_relative(1);
+%}
+
+instruct jmpConUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  match(If cop cmp);
+  effect(USE labl);
+
+  ins_cost(200);
+  format %{ "J$cop,u  $labl" %}
+  size(6);
+  opcode(0x0F, 0x80);
+  ins_encode(Jcc(cop, labl));
+  ins_pipe(pipe_jcc);
+  ins_pc_relative(1);
+%}
+
+instruct jmpConUCF2(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
+  match(If cop cmp);
+  effect(USE labl);
+
+  ins_cost(200);
+  format %{ $$template
+    if ($cop$$cmpcode == Assembler::notEqual) {
+      $$emit$$"JP,u   $labl\n\t"
+      $$emit$$"J$cop,u   $labl"
+    } else {
+      $$emit$$"JP,u   done\n\t"
+      $$emit$$"J$cop,u   $labl\n\t"
+      $$emit$$"done:"
+    }
+  %}
+  size(12);
+  opcode(0x0F, 0x80);
+  ins_encode %{
+    Label* l = $labl$$label;
+    $$$emit8$primary;
+    emit_cc(cbuf, $secondary, Assembler::parity);
+    int parity_disp = -1;
+    if ($cop$$cmpcode == Assembler::notEqual) {
+       // Both jumps target $labl; the instructions sit 6 bytes apart, so their
+       // displacements differ by the same 6 bytes, and each is computed here
+       // relative to the end of its own instruction.
+       parity_disp = l ? (l->loc_pos() - (cbuf.code_size() + 4)) : 0;
+    } else if ($cop$$cmpcode == Assembler::equal) {
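+       // Skip the 6-byte Jcc that follows (two opcode bytes plus rel32).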
+       parity_disp = 6;
+    } else {
+       ShouldNotReachHere();
+    }
+    emit_d32(cbuf, parity_disp);
+    $$$emit8$primary;
+    emit_cc(cbuf, $secondary, $cop$$cmpcode);
+    int disp = l ? (l->loc_pos() - (cbuf.code_size() + 4)) : 0;
+    emit_d32(cbuf, disp);
+  %}
+  ins_pipe(pipe_jcc);
   ins_pc_relative(1);
 %}
 
 // ============================================================================
 // The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass

@@ -11986,11 +12458,11 @@
 instruct jmpLoopEnd_short(cmpOp cop, eFlagsReg cr, label labl) %{
   match(CountedLoopEnd cop cr);
   effect(USE labl);
 
   ins_cost(300);
-  format %{ "J$cop,s  $labl" %}
+  format %{ "J$cop,s  $labl\t# Loop end" %}
   size(2);
   opcode(0x70);
   ins_encode( JccShort( cop, labl) );
   ins_pipe( pipe_jcc );
   ins_pc_relative(1);

@@ -12001,11 +12473,25 @@
 instruct jmpLoopEndU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
   match(CountedLoopEnd cop cmp);
   effect(USE labl);
 
   ins_cost(300);
-  format %{ "J$cop,us $labl" %}
+  format %{ "J$cop,us $labl\t# Loop end" %}
+  size(2);
+  opcode(0x70);
+  ins_encode( JccShort( cop, labl) );
+  ins_pipe( pipe_jcc );
+  ins_pc_relative(1);
+  ins_short_branch(1);
+%}
+
+instruct jmpLoopEndUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  match(CountedLoopEnd cop cmp);
+  effect(USE labl);
+
+  ins_cost(300);
+  format %{ "J$cop,us $labl\t# Loop end" %}
   size(2);
   opcode(0x70);
   ins_encode( JccShort( cop, labl) );
   ins_pipe( pipe_jcc );
   ins_pc_relative(1);

@@ -12025,10 +12511,64 @@
   ins_pipe( pipe_jcc );
   ins_pc_relative(1);
   ins_short_branch(1);
 %}
 
+instruct jmpConUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
+  match(If cop cmp);
+  effect(USE labl);
+
+  ins_cost(300);
+  format %{ "J$cop,us $labl" %}
+  size(2);
+  opcode(0x70);
+  ins_encode( JccShort( cop, labl) );
+  ins_pipe( pipe_jcc );
+  ins_pc_relative(1);
+  ins_short_branch(1);
+%}
+
+instruct jmpConUCF2_short(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
+  match(If cop cmp);
+  effect(USE labl);
+
+  ins_cost(300);
+  format %{ $$template
+    if ($cop$$cmpcode == Assembler::notEqual) {
+      $$emit$$"JP,u,s   $labl\n\t"
+      $$emit$$"J$cop,u,s   $labl"
+    } else {
+      $$emit$$"JP,u,s   done\n\t"
+      $$emit$$"J$cop,u,s  $labl\n\t"
+      $$emit$$"done:"
+    }
+  %}
+  size(4);
+  opcode(0x70);
+  ins_encode %{
+    Label* l = $labl$$label;
+    emit_cc(cbuf, $primary, Assembler::parity);
+    int parity_disp = -1;
+    if ($cop$$cmpcode == Assembler::notEqual) {
+      parity_disp = l ? (l->loc_pos() - (cbuf.code_size() + 1)) : 0;
+    } else if ($cop$$cmpcode == Assembler::equal) {
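+      // Skip the 2-byte short Jcc that follows (one opcode byte plus rel8).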
+      parity_disp = 2;
+    } else {
+      ShouldNotReachHere();
+    }
+    emit_d8(cbuf, parity_disp);
+    emit_cc(cbuf, $primary, $cop$$cmpcode);
+    int disp = l ? (l->loc_pos() - (cbuf.code_size() + 1)) : 0;
+    emit_d8(cbuf, disp);
+    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
+    assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
+  %}
+  ins_pipe(pipe_jcc);
+  ins_pc_relative(1);
+  ins_short_branch(1);
+%}
+
 // ============================================================================
 // Long Compare
 //
 // Currently we hold longs in 2 registers.  Comparing such values efficiently
 // is tricky.  The flavor of compare used depends on whether we are testing

@@ -12065,22 +12605,22 @@
             "JMP,s  done\n"
     "m_one:\tDEC    $dst\n"
      "done:" %}
   ins_encode %{
     Label p_one, m_one, done;
-    __ xorl($dst$$Register, $dst$$Register);
+    __ xorptr($dst$$Register, $dst$$Register);
     __ cmpl(HIGH_FROM_LOW($src1$$Register), HIGH_FROM_LOW($src2$$Register));
     __ jccb(Assembler::less,    m_one);
     __ jccb(Assembler::greater, p_one);
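+    // The high halves were compared signed above; the low halves compare
+    // unsigned, which together implements a full signed 64-bit compare.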
     __ cmpl($src1$$Register, $src2$$Register);
     __ jccb(Assembler::below,   m_one);
     __ jccb(Assembler::equal,   done);
     __ bind(p_one);
-    __ increment($dst$$Register);
+    __ incrementl($dst$$Register);
     __ jmpb(done);
     __ bind(m_one);
-    __ decrement($dst$$Register);
+    __ decrementl($dst$$Register);
     __ bind(done);
   %}
   ins_pipe( pipe_slow );
 %}
 

@@ -12774,8 +13314,5 @@
 %}
 
 //----------SMARTSPILL RULES---------------------------------------------------
 // These must follow all instruction definitions as they use the names
 // defined in the instructions definitions.
-
-
-