--- old/src/cpu/sparc/vm/sparc.ad Thu May 20 16:56:53 2010 +++ new/src/cpu/sparc/vm/sparc.ad Thu May 20 16:56:53 2010 @@ -1750,6 +1750,12 @@ // registers? True for Intel but false for most RISCs const bool Matcher::clone_shift_expressions = false; +bool Matcher::narrow_oop_use_complex_address() { + NOT_LP64(ShouldNotCallThis()); + assert(UseCompressedOops, "only for compressed oops code"); + return false; +} + + // Is it better to copy float constants, or load them directly from memory? // Intel can load a float constant from a direct address, requiring no // extra registers. Most RISCs will have to materialize an address into a --- old/src/cpu/sparc/vm/vm_version_sparc.cpp Thu May 20 16:56:54 2010 +++ new/src/cpu/sparc/vm/vm_version_sparc.cpp Thu May 20 16:56:54 2010 @@ -65,13 +65,6 @@ FLAG_SET_DEFAULT(UseInlineCaches, false); } #ifdef _LP64 - // Single issue niagara1 is slower for CompressedOops - // but niagaras after that it's fine. - if (!is_niagara1_plus()) { - if (FLAG_IS_DEFAULT(UseCompressedOops)) { - FLAG_SET_ERGO(bool, UseCompressedOops, false); - } - } // 32-bit oops don't make sense for the 64-bit VM on sparc // since the 32-bit VM has the same registers and smaller objects. Universe::set_narrow_oop_shift(LogMinObjAlignmentInBytes); --- old/src/cpu/x86/vm/x86_32.ad Thu May 20 16:56:55 2010 +++ new/src/cpu/x86/vm/x86_32.ad Thu May 20 16:56:55 2010 @@ -1377,6 +1377,12 @@ // registers? True for Intel but false for most RISCs const bool Matcher::clone_shift_expressions = true; +bool Matcher::narrow_oop_use_complex_address() { + ShouldNotCallThis(); + return true; +} + + // Is it better to copy float constants, or load them directly from memory? // Intel can load a float constant from a direct address, requiring no // extra registers. Most RISCs will have to materialize an address into a --- old/src/cpu/x86/vm/x86_64.ad Thu May 20 16:56:55 2010 +++ new/src/cpu/x86/vm/x86_64.ad Thu May 20 16:56:55 2010 @@ -2054,6 +2054,11 @@ // into registers? 
True for Intel but false for most RISCs const bool Matcher::clone_shift_expressions = true; +bool Matcher::narrow_oop_use_complex_address() { + assert(UseCompressedOops, "only for compressed oops code"); + return (LogMinObjAlignmentInBytes <= 3); +} + // Is it better to copy float constants, or load them directly from // memory? Intel can load a float constant from a direct address, // requiring no extra registers. Most RISCs will have to materialize --- old/src/share/vm/opto/compile.cpp Thu May 20 16:56:56 2010 +++ new/src/share/vm/opto/compile.cpp Thu May 20 16:56:56 2010 @@ -2183,7 +2183,7 @@ Node* new_in1 = in1->clone(); new_in1->as_DecodeN()->set_type(t); - if (!Matcher::clone_shift_expressions) { + if (!Matcher::narrow_oop_use_complex_address()) { // // x86, ARM and friends can handle 2 adds in addressing mode // and Matcher can fold a DecodeN node into address by using @@ -2291,7 +2291,7 @@ assert(!n->in(1)->is_EncodeP(), "should be optimized out"); // DecodeN could be pinned on Sparc where it can't be fold into // an address expression, see the code for Op_CastPP above. - assert(n->in(0) == NULL || !Matcher::clone_shift_expressions, "no control except on sparc"); + assert(n->in(0) == NULL || !Matcher::narrow_oop_use_complex_address(), "no control except on sparc"); break; case Op_EncodeP: { --- old/src/share/vm/opto/lcm.cpp Thu May 20 16:56:57 2010 +++ new/src/share/vm/opto/lcm.cpp Thu May 20 16:56:57 2010 @@ -32,7 +32,8 @@ // with suitable memory ops nearby. Use the memory op to do the NULL check. // I can generate a memory op if there is not one nearby. // The proj is the control projection for the not-null case. -// The val is the pointer being checked for nullness. +// The val is the pointer being checked for nullness or +// decodeHeapOop_not_null node if it did not fold into address. 
void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowed_reasons) { // Assume if null check need for 0 offset then always needed // Intel solaris doesn't support any null checks yet and no @@ -96,6 +97,10 @@ } } + // Check for decodeHeapOop_not_null node which did not fold into address. + bool is_decoden = val->is_Mach() && + (val->as_Mach()->ideal_Opcode() == Op_DecodeN); + // Search the successor block for a load or store who's base value is also // the tested value. There may be several. Node_List *out = new Node_List(Thread::current()->resource_area()); @@ -148,7 +153,8 @@ if( !mach->needs_anti_dependence_check() ) continue; // Not an memory op; skip it { - // Check that value is used in memory address. + // Check that value is used in memory address in + // instructions with embedded load (CmpP val1,(val2+off)). Node* base; Node* index; const MachOper* oper = mach->memory_inputs(base, index); @@ -213,7 +219,11 @@ uint vidx = 0; // Capture index of value into memop uint j; for( j = mach->req()-1; j > 0; j-- ) { - if( mach->in(j) == val ) vidx = j; + if( mach->in(j) == val ) { + vidx = j; + // Ignore DecodeN val since it could be hoisted to where needed. + if( is_decoden ) continue; + } // Block of memory-op input Block *inb = cfg->_bbs[mach->in(j)->_idx]; Block *b = this; // Start from nul check @@ -271,6 +281,15 @@ implicit_null_checks++; // Hoist the memory candidate up to the end of the test block. + if( is_decoden ) { + // Check if we need to hoist DecodeN val first. 
+ Block *valb = cfg->_bbs[val->_idx]; + if( this != valb && this->_dom_depth < valb->_dom_depth ) { + valb->find_remove(val); + this->add_inst(val); + cfg->_bbs.map(val->_idx,this); + } + } Block *old_block = cfg->_bbs[best->_idx]; old_block->find_remove(best); add_inst(best); --- old/src/share/vm/opto/matcher.cpp Thu May 20 16:56:57 2010 +++ new/src/share/vm/opto/matcher.cpp Thu May 20 16:56:57 2010 @@ -1334,7 +1334,7 @@ if( j == max_scan ) // No post-domination before scan end? return true; // Then break the match tree up } - if (m->is_DecodeN() && Matcher::clone_shift_expressions) { + if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) { // These are commonly used in address expressions and can // efficiently fold into them on X64 in some cases. return false; @@ -2110,8 +2110,8 @@ _null_check_tests.push(proj); Node* val = cmp->in(1); #ifdef _LP64 - if (UseCompressedOops && !Matcher::clone_shift_expressions && - val->bottom_type()->isa_narrowoop()) { + if (val->bottom_type()->isa_narrowoop() && + !Matcher::narrow_oop_use_complex_address()) { // // Look for DecodeN node which should be pinned to orig_proj. // On platforms (Sparc) which can not handle 2 adds --- old/src/share/vm/opto/matcher.hpp Thu May 20 16:56:58 2010 +++ new/src/share/vm/opto/matcher.hpp Thu May 20 16:56:58 2010 @@ -352,6 +352,8 @@ // registers? True for Intel but false for most RISCs static const bool clone_shift_expressions; + static bool narrow_oop_use_complex_address(); + // Is it better to copy float constants, or load them directly from memory? // Intel can load a float constant from a direct address, requiring no // extra registers. Most RISCs will have to materialize an address into a