--- old/src/share/vm/opto/loopTransform.cpp	2015-05-01 18:06:56.436650500 -0700
+++ new/src/share/vm/opto/loopTransform.cpp	2015-05-01 18:06:56.250650500 -0700
@@ -38,6 +38,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/subnode.hpp"
+#include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
 
 //------------------------------is_loop_exit-----------------------------------
@@ -640,7 +641,7 @@
 //------------------------------policy_unroll----------------------------------
 // Return TRUE or FALSE if the loop should be unrolled or not.  Unroll if
 // the loop is a CountedLoop and the body is small enough.
-bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
+bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) {
 
   CountedLoopNode *cl = _head->as_CountedLoop();
   assert(cl->is_normal_loop() || cl->is_main_loop(), "");
@@ -652,9 +653,46 @@
   // After split at least one iteration will be executed in pre-loop.
   if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
 
+  _local_loop_unroll_limit = LoopUnrollLimit;
+  _local_loop_unroll_factor = 4;
   int future_unroll_ct = cl->unrolled_count() * 2;
   if (future_unroll_ct > LoopMaxUnroll) return false;
 
+  if (UseSuperWord) {
+    if (cl->is_reduction_loop() == false) phase->mark_reductions(this);
+
+    // Only attempt slp analysis when user controls do not prohibit it
+    if (LoopMaxUnroll > _local_loop_unroll_factor) {
+      // Once policy_slp_analysis succeeds, mark the loop with the
+      // maximal unroll factor so that we minimize analysis passes
+      if (cl->has_passed_slp() == false) {
+        if (policy_slp_analysis(cl, phase)) {
+          if (_local_loop_unroll_factor > 4) {
+            cl->mark_passed_slp();
+            cl->set_slp_max_unroll(_local_loop_unroll_factor);
+          }
+        }
+      }
+
+      if (cl->has_passed_slp()) {
+        int slp_max_unroll_factor = cl->slp_max_unroll();
+        if ((slp_max_unroll_factor > 4) &&
+            (slp_max_unroll_factor >= future_unroll_ct)) {
+          int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+          if (new_limit > LoopUnrollLimit) {
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis is applying unroll limit  %d, the original limit was %d\n",
+                            new_limit, _local_loop_unroll_limit);
+            }
+#endif
+            _local_loop_unroll_limit = new_limit;
+          }
+        }
+      }
+    }
+  }
+
   // Check for initial stride being a small enough constant
   if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
 
@@ -748,7 +786,7 @@
   }
 
   // Check for being too big
-  if (body_size > (uint)LoopUnrollLimit) {
+  if (body_size > (uint)_local_loop_unroll_limit) {
     if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
     // Normal case: loop too big
     return false;
@@ -758,6 +796,188 @@
   return true;
 }
 
+//------------------------------policy_slp_analysis----------------------------
+// Analyze the body of counted loop 'cl' to pick an unroll factor that suits
+// SuperWord (slp) vectorization.  Body nodes that do not take part in vector
+// sizing are set aside first; the remaining nodes then narrow max_vector down
+// to the smallest implemented vector size, which becomes
+// _local_loop_unroll_factor.  Returns false if the loop was rejected (it is
+// then marked as analyzed so the analysis is not repeated) and true otherwise,
+// including the cases where no analysis was run at all.
+bool IdealLoopTree::policy_slp_analysis( CountedLoopNode *cl, PhaseIdealLoop *phase ) {
+  // Set once the loop is found unsuitable for slp driven unrolling
+  bool not_slp = false;
+
+  // Enable this functionality target by target as needed
+  if (SuperWordLoopUnrollAnalysis) {
+    SuperWord sw(phase);
+    sw.transform_loop(this, false);
+
+    // If the loop is slp canonical analyze it
+    if (sw.early_return() == false) {
+      Arena *a = Thread::current()->resource_area();
+      int max_vector = Matcher::max_vector_size(T_INT);
+      // ignored_loop_nodes[i] holds _body.at(i)->_idx for ignored nodes, else -1
+      // NOTE(review): sized with sizeof(int*) although the elements are int - confirm intent
+      size_t ignored_size = _body.size()*sizeof(int*);
+      int *ignored_loop_nodes = (int*)a->Amalloc_D(ignored_size);
+      Node_Stack nstack((int)ignored_size);
+      Node *cl_exit = cl->loopexit();
+
+      // First clear the entries
+      for (uint i = 0; i < _body.size(); i++) {
+        ignored_loop_nodes[i] = -1;
+      }
+
+      // Process the loop, some/all of the stack entries will not be in order, ergo
+      // need to preprocess the ignored initial state before we process the loop
+      for (uint i = 0; i < _body.size(); i++) {
+        Node* n = _body.at(i);
+        if (n == cl->incr() ||
+            n->is_reduction() ||
+            n->is_AddP() ||
+            n->is_Cmp() ||
+            n->is_IfTrue() ||
+            n->is_CountedLoop() ||
+            (n == cl_exit)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        // An If with known count and probability that exits the loop is ignored
+        if (n->is_If()) {
+          IfNode *iff = n->as_If();
+          if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
+            if (is_loop_exit(iff)) {
+              ignored_loop_nodes[i] = n->_idx;
+              continue;
+            }
+          }
+        }
+
+        // A memory phi whose loop-back input differs from its entry input and is not a memory operation rules out slp
+        if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+          Node* n_tail = n->in(LoopNode::LoopBackControl);
+          if (n_tail != n->in(LoopNode::EntryControl)) {
+            if (!n_tail->is_Mem()) {
+              not_slp = true;
+              break;
+            }
+          }
+        }
+
+        // This must happen after check of phi/if
+        if (n->is_Phi() || n->is_If()) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        // LoadStore, MergeMem and non-CFG projections rule out slp
+        if (n->is_LoadStore() || n->is_MergeMem() ||
+            (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+          not_slp = true;
+          break;
+        }
+
+        if (n->is_Mem()) {
+          Node* adr = n->in(MemNode::Address);
+          Node* n_ctrl = phase->get_ctrl(adr);
+
+          // save a queue of post process nodes
+          if (n_ctrl != NULL && is_member(phase->get_loop(n_ctrl))) {
+            MemNode* current = n->as_Mem();
+            BasicType bt = current->memory_type();
+            if (is_java_primitive(bt) == false) {
+              ignored_loop_nodes[i] = n->_idx;
+              continue;
+            }
+
+            // Process the memory expression
+            int stack_idx = 0;
+            bool have_side_effects = true;
+            if (adr->is_AddP() == false) {
+              nstack.push(adr, stack_idx++);
+            } else {
+              // Mark the components of the memory operation in nstack
+              SWPointer p1(current, &sw, &nstack, true);
+              have_side_effects = p1.node_stack()->is_nonempty();
+            }
+
+            // Process the pointer stack
+            while (have_side_effects) {
+              Node* pointer_node = nstack.node();
+              for (uint j = 0; j < _body.size(); j++) {
+                Node* cur_node = _body.at(j);
+                if (cur_node == pointer_node) {
+                  ignored_loop_nodes[j] = cur_node->_idx;
+                  break;
+                }
+              }
+              nstack.pop();
+              have_side_effects = nstack.is_nonempty();
+            }
+
+            // Cleanup
+            nstack.clear();
+          }
+        }
+      }
+
+      if (not_slp == false) {
+        // Now we try to find the maximum supported consistent vector which the machine
+        // description can use
+        for (uint i = 0; i < _body.size(); i++) {
+          if (ignored_loop_nodes[i] != -1) continue;
+
+          BasicType bt;
+          Node* n = _body.at(i);
+          if (n->is_Store()) {
+            bt = n->as_Mem()->memory_type();
+          } else {
+            bt = n->bottom_type()->basic_type();
+          }
+          // Vectors are only formed from primitive element types, so skip anything
+          // else before asking the matcher for a vector size.
+          if (is_java_primitive(bt) == false) continue;
+
+          int cur_max_vector = Matcher::max_vector_size(bt);
+
+          // If a max vector exists which is not larger than _local_loop_unroll_factor
+          // stop looking, we already have the max vector to map to.
+          if (cur_max_vector <= _local_loop_unroll_factor) {
+            not_slp = true;
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
+            }
+#endif
+            break;
+          }
+
+          // Map the maximal common vector
+          if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
+            if (cur_max_vector < max_vector) {
+              max_vector = cur_max_vector;
+            }
+          }
+        }
+        if (not_slp == false) _local_loop_unroll_factor = max_vector;
+      }
+
+      if (not_slp) {
+        // Mark the loop as processed so that we do not try again
+        cl->mark_passed_slp();
+        cl->set_slp_max_unroll(_local_loop_unroll_factor);
+      }
+
+      // Now clean things up
+      a->Afree(ignored_loop_nodes, ignored_size);
+    }
+  }
+
+  return (not_slp == false);
+}
+
 //------------------------------policy_align-----------------------------------
 // Return TRUE or FALSE if the loop should be cache-line aligned.  Gather the
 // expression that does the alignment.  Note that only one array base can be
@@ -1551,6 +1755,7 @@
               for (unsigned j = 1; j < def_node->req(); j++) {
                 Node* in = def_node->in(j);
                 if (in == phi) {
+                  loop_head->mark_has_reductions();
                   def_node->add_flag(Node::Flag_is_reduction);
                   break;
                 }
@@ -2401,7 +2606,6 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
-      phase->mark_reductions(this);
       phase->do_unroll(this, old_new, true);
     }