--- old/src/cpu/aarch64/vm/c2_init_aarch64.cpp	2015-05-01 18:06:41.349650500 -0700
+++ new/src/cpu/aarch64/vm/c2_init_aarch64.cpp	2015-05-01 18:06:41.164650500 -0700
@@ -33,4 +33,6 @@
   guarantee(CodeEntryAlignment >= InteriorEntryAlignment, "" );
   // QQQ presumably all 64bit cpu's support this. Seems like the ifdef could
   // simply be left out.
+
+  SuperWordLoopUnrollAnalysis = false;
 }
--- old/src/cpu/ppc/vm/c2_init_ppc.cpp	2015-05-01 18:06:45.115650500 -0700
+++ new/src/cpu/ppc/vm/c2_init_ppc.cpp	2015-05-01 18:06:44.931650500 -0700
@@ -45,4 +45,6 @@
       FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
     }
   }
+
+  SuperWordLoopUnrollAnalysis = false;
 }
--- old/src/cpu/sparc/vm/c2_init_sparc.cpp	2015-05-01 18:06:48.818650500 -0700
+++ new/src/cpu/sparc/vm/c2_init_sparc.cpp	2015-05-01 18:06:48.635650500 -0700
@@ -30,4 +30,6 @@
 
 void Compile::pd_compiler2_init() {
   guarantee(CodeEntryAlignment >= InteriorEntryAlignment, "" );
+  // The slp unroll analysis is enabled target by target; it stays off here
+  SuperWordLoopUnrollAnalysis = false;
 }
--- old/src/share/vm/opto/c2_globals.hpp	2015-05-01 18:06:52.614650500 -0700
+++ new/src/share/vm/opto/c2_globals.hpp	2015-05-01 18:06:52.416650500 -0700
@@ -191,6 +191,12 @@
   product(intx,  LoopMaxUnroll, 16,                                         \
           "Maximum number of unrolls for main loop")                        \
                                                                             \
+  product(bool,  SuperWordLoopUnrollAnalysis, true,                         \
+          "Map number of unrolls for main loop via slp analysis")           \
+                                                                            \
+  notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
+          "Trace what slp analysis applies")                                \
+                                                                            \
   product(intx,  LoopUnrollMin, 4,                                          \
           "Minimum number of unroll loop bodies before checking progress"   \
           "of rounds of unroll,optimize,..")                                \
--- old/src/share/vm/opto/loopTransform.cpp	2015-05-01 18:06:56.436650500 -0700
+++ new/src/share/vm/opto/loopTransform.cpp	2015-05-01 18:06:56.250650500 -0700
@@ -38,6 +38,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/subnode.hpp"
+#include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
 
 //------------------------------is_loop_exit-----------------------------------
@@ -640,7 +641,7 @@
 //------------------------------policy_unroll----------------------------------
 // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
 // the loop is a CountedLoop and the body is small enough.
-bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
+bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) {
 
   CountedLoopNode *cl = _head->as_CountedLoop();
   assert(cl->is_normal_loop() || cl->is_main_loop(), "");
@@ -652,9 +653,46 @@
   // After split at least one iteration will be executed in pre-loop.
   if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
 
+  _local_loop_unroll_limit = LoopUnrollLimit;
+  _local_loop_unroll_factor = 4;
   int future_unroll_ct = cl->unrolled_count() * 2;
   if (future_unroll_ct > LoopMaxUnroll) return false;
 
+  if (UseSuperWord) {
+    if (cl->is_reduction_loop() == false) phase->mark_reductions(this);
+
+    // Only attempt slp analysis when user controls do not prohibit it
+    if (LoopMaxUnroll > _local_loop_unroll_factor) {
+      // Once policy_slp_analysis succeeds, mark the loop with the
+      // maximal unroll factor so that we minimize analysis passes
+      if (cl->has_passed_slp() == false) {
+        if (policy_slp_analysis(cl, phase)) {
+          if (_local_loop_unroll_factor > 4) {
+            cl->mark_passed_slp();
+            cl->set_slp_max_unroll(_local_loop_unroll_factor);
+          }
+        }
+      }
+
+      if (cl->has_passed_slp()) {
+        int slp_max_unroll_factor = cl->slp_max_unroll();
+        if ((slp_max_unroll_factor > 4) &&
+            (slp_max_unroll_factor >= future_unroll_ct)) {
+          int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+          if (new_limit > LoopUnrollLimit) {
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis is applying unroll limit  %d, the original limit was %d\n",
+                            new_limit, _local_loop_unroll_limit);
+            }
+#endif
+            _local_loop_unroll_limit = new_limit;
+          }
+        }
+      }
+    }
+  }
+
   // Check for initial stride being a small enough constant
   if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
 
@@ -748,7 +786,7 @@
   }
 
   // Check for being too big
-  if (body_size > (uint)LoopUnrollLimit) {
+  if (body_size > (uint)_local_loop_unroll_limit) {
     if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
     // Normal case: loop too big
     return false;
@@ -758,6 +796,172 @@
   return true;
 }
 
+//------------------------------policy_slp_analysis----------------------------
+// Estimate the unroll factor superword (SLP) vectorization could use for 'cl'.
+// Nodes such as loop control, reductions and address computation are ignored.
+// The smallest implemented maximal vector size of the remaining nodes, bounded
+// by that of T_INT, is stored in _local_loop_unroll_factor.  Returns FALSE when
+// the body is rejected ('cl' is then marked so the analysis is not repeated),
+// TRUE otherwise, also when the analysis is off or SuperWord set-up bailed out.
+bool IdealLoopTree::policy_slp_analysis( CountedLoopNode *cl, PhaseIdealLoop *phase ) {
+  bool not_slp = false;
+
+  // Enable this functionality target by target as needed
+  if (SuperWordLoopUnrollAnalysis) {
+    SuperWord sw(phase);
+    sw.transform_loop(this, false);
+
+    // If the loop is slp canonical analyze it
+    if (sw.early_return() == false) {
+      Arena *a = Thread::current()->resource_area();
+      int max_vector = Matcher::max_vector_size(T_INT);
+      // NOTE(review): slots are pointer-sized, presumably for Amalloc_D alignment - confirm
+      size_t ignored_size = _body.size()*sizeof(int*);
+      int *ignored_loop_nodes = (int*)a->Amalloc_D(ignored_size);
+      // Start the trace stack with one entry per body node, not one per byte
+      Node_Stack nstack((int)_body.size());
+      Node *cl_exit = cl->loopexit();
+
+      // First clear the entries, -1 marks a node that still has to be analyzed
+      for (uint i = 0; i < _body.size(); i++) {
+        ignored_loop_nodes[i] = -1;
+      }
+
+      // Process the loop, some/all of the stack entries will not be in order, ergo
+      // need to preprocess the ignored initial state before we process the loop
+      for (uint i = 0; i < _body.size(); i++) {
+        Node* n = _body.at(i);
+        if (n == cl->incr() || n->is_reduction() || n->is_AddP() ||
+            n->is_Cmp() || n->is_IfTrue() || n->is_CountedLoop() ||
+            (n == cl_exit)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        if (n->is_If()) {
+          IfNode *iff = n->as_If();
+          if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN && is_loop_exit(iff)) {
+            ignored_loop_nodes[i] = n->_idx;
+            continue;
+          }
+        }
+
+        // Reject a memory phi whose backedge value is neither its entry value nor a Mem node
+        if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+          Node* n_tail = n->in(LoopNode::LoopBackControl);
+          if (n_tail != n->in(LoopNode::EntryControl) && !n_tail->is_Mem()) {
+            not_slp = true;
+            break;
+          }
+        }
+
+        // This must happen after check of phi/if
+        if (n->is_Phi() || n->is_If()) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        if (n->is_LoadStore() || n->is_MergeMem() ||
+            (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+          not_slp = true;
+          break;
+        }
+
+        if (n->is_Mem()) {
+          Node* adr = n->in(MemNode::Address);
+          Node* n_ctrl = phase->get_ctrl(adr);
+
+          // save a queue of post process nodes
+          if (n_ctrl != NULL && is_member(phase->get_loop(n_ctrl))) {
+            MemNode* current = n->as_Mem();
+            BasicType bt = current->memory_type();
+            if (is_java_primitive(bt) == false) {
+              ignored_loop_nodes[i] = n->_idx;
+              continue;
+            }
+
+            // Process the memory expression
+            int stack_idx = 0;
+            bool have_side_effects = true;
+            if (adr->is_AddP() == false) {
+              nstack.push(adr, stack_idx++);
+            } else {
+              // Mark the components of the memory operation in nstack
+              SWPointer p1(current, &sw, &nstack, true);
+              have_side_effects = p1.node_stack()->is_nonempty();
+            }
+
+            // Process the pointer stack
+            while (have_side_effects) {
+              Node* pointer_node = nstack.node();
+              for (uint j = 0; j < _body.size(); j++) {
+                Node* cur_node = _body.at(j);
+                if (cur_node == pointer_node) {
+                  ignored_loop_nodes[j] = cur_node->_idx;
+                  break;
+                }
+              }
+              nstack.pop();
+              have_side_effects = nstack.is_nonempty();
+            }
+
+            // Cleanup
+            nstack.clear();
+          }
+        }
+      }
+
+      if (not_slp == false) {
+        // Now we try to find the maximum supported consistent vector which the machine
+        // description can use
+        for (uint i = 0; i < _body.size(); i++) {
+          if (ignored_loop_nodes[i] != -1) continue;
+
+          BasicType bt;
+          Node* n = _body.at(i);
+          if (n->is_Store()) {
+            bt = n->as_Mem()->memory_type();
+          } else {
+            bt = n->bottom_type()->basic_type();
+          }
+
+          int cur_max_vector = Matcher::max_vector_size(bt);
+
+          // If a max vector exists which is not larger than _local_loop_unroll_factor
+          // stop looking, we already have the max vector to map to.
+          if (cur_max_vector <= _local_loop_unroll_factor) {
+            not_slp = true;
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
+            }
+#endif
+            break;
+          }
+
+          // Map the maximal common vector
+          if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt) &&
+              cur_max_vector < max_vector) {
+            max_vector = cur_max_vector;
+          }
+        }
+        if (not_slp == false) _local_loop_unroll_factor = max_vector;
+      }
+
+      if (not_slp) {
+        // Mark the loop as processed so that we do not try again
+        cl->mark_passed_slp();
+        cl->set_slp_max_unroll(_local_loop_unroll_factor);
+      }
+
+      // Now clean things up
+      a->Afree(ignored_loop_nodes, ignored_size);
+    }
+  }
+
+  return (not_slp == false);
+}
+
 //------------------------------policy_align-----------------------------------
 // Return TRUE or FALSE if the loop should be cache-line aligned.  Gather the
 // expression that does the alignment.  Note that only one array base can be
@@ -1551,6 +1755,7 @@
               for (unsigned j = 1; j < def_node->req(); j++) {
                 Node* in = def_node->in(j);
                 if (in == phi) {
+                  loop_head->mark_has_reductions();
                   def_node->add_flag(Node::Flag_is_reduction);
                   break;
                 }
@@ -2401,7 +2606,6 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
-      phase->mark_reductions(this);
       phase->do_unroll(this, old_new, true);
     }
 
--- old/src/share/vm/opto/loopnode.cpp	2015-05-01 18:07:00.247650500 -0700
+++ new/src/share/vm/opto/loopnode.cpp	2015-05-01 18:07:00.062650500 -0700
@@ -2408,7 +2408,7 @@
     for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
       IdealLoopTree* lpt = iter.current();
       if (lpt->is_counted()) {
-        sw.transform_loop(lpt);
+        sw.transform_loop(lpt, true);
       }
     }
   }
--- old/src/share/vm/opto/loopnode.hpp	2015-05-01 18:07:04.106650500 -0700
+++ new/src/share/vm/opto/loopnode.hpp	2015-05-01 18:07:03.911650500 -0700
@@ -62,7 +62,9 @@
          HasExactTripCount=8,
          InnerLoop=16,
          PartialPeelLoop=32,
-         PartialPeelFailed=64 };
+         PartialPeelFailed=64,
+         HasReductions=128,
+         PassedSlpAnalysis=256 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
 
@@ -77,6 +79,8 @@
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
   int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
+  void mark_has_reductions() { _loop_flags |= HasReductions; } // loop contains a node flagged as reduction
+  void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; } // slp unroll analysis need not be repeated
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -155,6 +159,10 @@
   // unroll,optimize,unroll,optimize,... is making progress
   int _node_count_before_unroll;
 
+  // If slp analysis is performed we record the maximum
+  // vector mapped unroll factor here
+  int slp_maximum_unroll_factor;
+
 public:
   CountedLoopNode( Node *entry, Node *backedge )
     : LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
@@ -199,10 +207,12 @@
 
   // A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
   // Aligned, may be missing it's pre-loop.
-  int is_normal_loop() const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
-  int is_pre_loop   () const { return (_loop_flags&PreMainPostFlagsMask) == Pre;    }
-  int is_main_loop  () const { return (_loop_flags&PreMainPostFlagsMask) == Main;   }
-  int is_post_loop  () const { return (_loop_flags&PreMainPostFlagsMask) == Post;   }
+  int is_normal_loop   () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
+  int is_pre_loop      () const { return (_loop_flags&PreMainPostFlagsMask) == Pre;    }
+  int is_main_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Main;   }
+  int is_post_loop     () const { return (_loop_flags&PreMainPostFlagsMask) == Post;   }
+  int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; } // set by mark_has_reductions()
+  int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; } // set by mark_passed_slp()
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
@@ -232,8 +242,10 @@
   void double_unrolled_count() { _unrolled_count_log2++; }
   int  unrolled_count()        { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
 
-  void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
-  int  node_count_before_unroll()           { return _node_count_before_unroll; }
+  void set_node_count_before_unroll(int ct)  { _node_count_before_unroll = ct; }
+  int  node_count_before_unroll()            { return _node_count_before_unroll; }
+  void set_slp_max_unroll(int unroll_factor) { slp_maximum_unroll_factor = unroll_factor; } // unroll factor recorded by the slp analysis
+  int  slp_max_unroll()                      { return slp_maximum_unroll_factor; } // policy_unroll reads it only once has_passed_slp() holds
 
 #ifndef PRODUCT
   virtual void dump_spec(outputStream *st) const;
@@ -336,6 +348,8 @@
   Node *_tail;                  // Tail of loop
   inline Node *tail();          // Handle lazy update of _tail field
   PhaseIdealLoop* _phase;
+  int _local_loop_unroll_limit;  // Body size limit checked by policy_unroll
+  int _local_loop_unroll_factor; // Unroll factor, updated by policy_slp_analysis
 
   Node_List _body;              // Loop body for inner loops
 
@@ -356,7 +370,8 @@
       _safepts(NULL),
       _required_safept(NULL),
       _allow_optimizations(true),
-      _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0)
+      _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0),
+      _local_loop_unroll_limit(0), _local_loop_unroll_factor(0)
   { }
 
   // Is 'l' a member of 'this'?
@@ -444,7 +459,11 @@
 
   // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
   // the loop is a CountedLoop and the body is small enough.
-  bool policy_unroll( PhaseIdealLoop *phase ) const;
+  bool policy_unroll( PhaseIdealLoop *phase );
+
+  // Return TRUE unless the slp analysis rejects the loop.  A vector mapped
+  // unroll factor, if one was found, is left in _local_loop_unroll_factor.
+  bool policy_slp_analysis( CountedLoopNode *cl, PhaseIdealLoop *phase );
 
   // Return TRUE or FALSE if the loop should be range-check-eliminated.
   // Gather a list of IF tests that are dominated by iteration splitting;
--- old/src/share/vm/opto/superword.cpp	2015-05-01 18:07:07.885650500 -0700
+++ new/src/share/vm/opto/superword.cpp	2015-05-01 18:07:07.689650500 -0700
@@ -66,11 +66,12 @@
   _lp(NULL),                              // LoopNode
   _bb(NULL),                              // basic block
   _iv(NULL),                              // induction var
-  _race_possible(false)                   // cases where SDMU is true
+  _race_possible(false),                  // cases where SDMU is true
+  _early_return(true)
 {}
 
 //------------------------------transform_loop---------------------------
-void SuperWord::transform_loop(IdealLoopTree* lpt) {
+void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
   assert(UseSuperWord, "should be");
   // Do vectors exist on this architecture?
   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
@@ -105,8 +106,10 @@
   // For now, define one block which is the entire loop body
   set_bb(cl);
 
-  assert(_packset.length() == 0, "packset must be empty");
-  SLP_extract();
+  if (do_optimization) {
+    assert(_packset.length() == 0, "packset must be empty");
+    SLP_extract();
+  }
 }
 
 //------------------------------SLP_extract---------------------------
@@ -210,12 +213,12 @@
       best_iv_adjustment = iv_adjustment;
     }
 
-    SWPointer align_to_ref_p(mem_ref, this);
+    SWPointer align_to_ref_p(mem_ref, this, NULL, false);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
       if (isomorphic(s, mem_ref)) {
-        SWPointer p2(s, this);
+        SWPointer p2(s, this, NULL, false);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
           set_alignment(s, align);
@@ -344,7 +347,7 @@
   // Count number of comparable memory ops
   for (uint i = 0; i < memops.size(); i++) {
     MemNode* s1 = memops.at(i)->as_Mem();
-    SWPointer p1(s1, this);
+    SWPointer p1(s1, this, NULL, false);
     // Discard if pre loop can't align this reference
     if (!ref_is_alignable(p1)) {
       *cmp_ct.adr_at(i) = 0;
@@ -353,7 +356,7 @@
     for (uint j = i+1; j < memops.size(); j++) {
       MemNode* s2 = memops.at(j)->as_Mem();
       if (isomorphic(s1, s2)) {
-        SWPointer p2(s2, this);
+        SWPointer p2(s2, this, NULL, false);
         if (p1.comparable(p2)) {
           (*cmp_ct.adr_at(i))++;
           (*cmp_ct.adr_at(j))++;
@@ -374,7 +377,7 @@
     if (s->is_Store()) {
       int vw = vector_width_in_bytes(s);
       assert(vw > 1, "sanity");
-      SWPointer p(s, this);
+      SWPointer p(s, this, NULL, false);
       if (cmp_ct.at(j) >  max_ct ||
           cmp_ct.at(j) == max_ct &&
             (vw >  max_vw ||
@@ -397,7 +400,7 @@
       if (s->is_Load()) {
         int vw = vector_width_in_bytes(s);
         assert(vw > 1, "sanity");
-        SWPointer p(s, this);
+        SWPointer p(s, this, NULL, false);
         if (cmp_ct.at(j) >  max_ct ||
             cmp_ct.at(j) == max_ct &&
               (vw >  max_vw ||
@@ -482,7 +485,7 @@
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  SWPointer align_to_ref_p(mem_ref, this);
+  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
   int vw       = vector_width_in_bytes(mem_ref);
@@ -542,13 +545,13 @@
       if (_dg.dep(s1)->in_cnt() == 0) {
         _dg.make_edge(slice, s1);
       }
-      SWPointer p1(s1->as_Mem(), this);
+      SWPointer p1(s1->as_Mem(), this, NULL, false);
       bool sink_dependent = true;
       for (int k = j - 1; k >= 0; k--) {
         Node* s2 = _nlist.at(k);
         if (s1->is_Load() && s2->is_Load())
           continue;
-        SWPointer p2(s2->as_Mem(), this);
+        SWPointer p2(s2->as_Mem(), this, NULL, false);
 
         int cmp = p1.cmp(p2);
         if (SuperWordRTDepCheck &&
@@ -688,8 +691,8 @@
   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
     return false;
-  SWPointer p1(s1->as_Mem(), this);
-  SWPointer p2(s2->as_Mem(), this);
+  SWPointer p1(s1->as_Mem(), this, NULL, false);
+  SWPointer p2(s2->as_Mem(), this, NULL, false);
   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
   return diff == data_size(s1);
@@ -1497,13 +1500,13 @@
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
-        SWPointer p1(n->as_Mem(), this);
+        SWPointer p1(n->as_Mem(), this, NULL, false);
         // Identify the memory dependency for the new loadVector node by
         // walking up through memory chain.
         // This is done to give flexibility to the new loadVector node so that
         // it can move above independent storeVector nodes.
         while (mem->is_StoreVector()) {
-          SWPointer p2(mem->as_Mem(), this);
+          SWPointer p2(mem->as_Mem(), this, NULL, false);
           int cmp = p1.cmp(p2);
           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
             mem = mem->in(MemNode::Memory);
@@ -2020,7 +2023,7 @@
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
-  SWPointer p(s, this);
+  SWPointer p(s, this, NULL, false);
   if (!p.valid()) {
     return bottom_align;
   }
@@ -2184,7 +2187,7 @@
   Node *orig_limit = pre_opaq->original_loop_limit();
   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 
-  SWPointer align_to_ref_p(align_to_ref, this);
+  SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
   assert(align_to_ref_p.valid(), "sanity");
 
   // Given:
@@ -2355,6 +2358,7 @@
   _lp = NULL;
   _bb = NULL;
   _iv = NULL;
+  _early_return = false;
 }
 
 //------------------------------print_packset---------------------------
@@ -2411,9 +2415,11 @@
 //==============================SWPointer===========================
 
 //----------------------------SWPointer------------------------
-SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
+SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(nstack), _analyze_only(analyze_only),
+  _stack_idx(0) {
 
   Node* adr = mem->in(MemNode::Address);
   if (!adr->is_AddP()) {
@@ -2446,7 +2452,9 @@
 // the pattern match of an address expression.
 SWPointer::SWPointer(SWPointer* p) :
   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
+  _stack_idx(p->_stack_idx) {}
 
 //------------------------scaled_iv_plus_offset--------------------
 // Match: k*iv + offset
@@ -2489,6 +2497,9 @@
     _scale = 1;
     return true;
   }
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   int opc = n->Opcode();
   if (opc == Op_MulI) {
     if (n->in(1) == iv() && n->in(2)->is_Con()) {
@@ -2546,6 +2557,9 @@
     return false;
   }
   if (_invar != NULL) return false; // already have an invariant
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   if (opc == Op_AddI) {
     if (n->in(2)->is_Con() && invariant(n->in(1))) {
       _negate_invar = negate;
--- old/src/share/vm/opto/superword.hpp	2015-05-01 18:07:11.616650500 -0700
+++ new/src/share/vm/opto/superword.hpp	2015-05-01 18:07:11.432650500 -0700
@@ -237,12 +237,13 @@
  public:
   SuperWord(PhaseIdealLoop* phase);
 
-  void transform_loop(IdealLoopTree* lpt);
+  void transform_loop(IdealLoopTree* lpt, bool do_optimization);
 
   // Accessors for SWPointer
   PhaseIdealLoop* phase()          { return _phase; }
   IdealLoopTree* lpt()             { return _lpt; }
   PhiNode* iv()                    { return _iv; }
+  bool early_return()              { return _early_return; }
 
  private:
   IdealLoopTree* _lpt;             // Current loop tree node
@@ -250,6 +251,7 @@
   Node*          _bb;              // Current basic block
   PhiNode*       _iv;              // Induction var
   bool           _race_possible;   // In cases where SDMU is true
+  bool           _early_return;    // True if we do not initialize
 
   // Accessors
   Arena* arena()                   { return _arena; }
@@ -434,15 +436,18 @@
 // Information about an address for dependence checking and vector alignment
 class SWPointer VALUE_OBJ_CLASS_SPEC {
  protected:
-  MemNode*   _mem;     // My memory reference node
-  SuperWord* _slp;     // SuperWord class
+  MemNode*    _mem;          // My memory reference node
+  SuperWord*  _slp;          // SuperWord class
 
-  Node* _base;         // NULL if unsafe nonheap reference
-  Node* _adr;          // address pointer
-  jint  _scale;        // multipler for iv (in bytes), 0 if no loop iv
-  jint  _offset;       // constant offset (in bytes)
-  Node* _invar;        // invariant offset (in bytes), NULL if none
-  bool  _negate_invar; // if true then use: (0 - _invar)
+  Node*       _base;         // NULL if unsafe nonheap reference
+  Node*       _adr;          // address pointer
+  jint        _scale;        // multiplier for iv (in bytes), 0 if no loop iv
+  jint        _offset;       // constant offset (in bytes)
+  Node*       _invar;        // invariant offset (in bytes), NULL if none
+  bool        _negate_invar; // if true then use: (0 - _invar)
+  Node_Stack* _nstack;       // stack used to record a swpointer trace of variants
+  bool        _analyze_only; // Used in loop unrolling only for swpointer trace
+  uint        _stack_idx;    // Used in loop unrolling only for swpointer trace
 
   PhaseIdealLoop* phase() { return _slp->phase(); }
   IdealLoopTree*  lpt()   { return _slp->lpt(); }
@@ -469,7 +474,7 @@
     NotComparable = (Less | Greater | Equal)
   };
 
-  SWPointer(MemNode* mem, SuperWord* slp);
+  SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only);
   // Following is used to create a temporary object during
   // the pattern match of an address expression.
   SWPointer(SWPointer* p);
@@ -477,14 +482,15 @@
   bool valid()  { return _adr != NULL; }
   bool has_iv() { return _scale != 0; }
 
-  Node* base()            { return _base; }
-  Node* adr()             { return _adr; }
-  MemNode* mem()          { return _mem; }
-  int   scale_in_bytes()  { return _scale; }
-  Node* invar()           { return _invar; }
-  bool  negate_invar()    { return _negate_invar; }
-  int   offset_in_bytes() { return _offset; }
-  int   memory_size()     { return _mem->memory_size(); }
+  Node* base()             { return _base; }
+  Node* adr()              { return _adr; }
+  MemNode* mem()           { return _mem; }
+  int   scale_in_bytes()   { return _scale; }
+  Node* invar()            { return _invar; }
+  bool  negate_invar()     { return _negate_invar; }
+  int   offset_in_bytes()  { return _offset; }
+  int   memory_size()      { return _mem->memory_size(); }
+  Node_Stack* node_stack() { return _nstack; }
 
   // Comparable?
   int cmp(SWPointer& q) {