--- old/src/share/vm/opto/loopTransform.cpp 2015-05-01 18:06:56.436650500 -0700
+++ new/src/share/vm/opto/loopTransform.cpp 2015-05-01 18:06:56.250650500 -0700
@@ -38,6 +38,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/subnode.hpp"
+#include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
 
 //------------------------------is_loop_exit-----------------------------------
@@ -640,7 +641,7 @@
 //------------------------------policy_unroll----------------------------------
 // Return TRUE or FALSE if the loop should be unrolled or not. Unroll if
 // the loop is a CountedLoop and the body is small enough.
-bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
+bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) {
 
   CountedLoopNode *cl = _head->as_CountedLoop();
   assert(cl->is_normal_loop() || cl->is_main_loop(), "");
@@ -652,9 +653,46 @@
   // After split at least one iteration will be executed in pre-loop.
   if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
 
+  _local_loop_unroll_limit = LoopUnrollLimit;
+  _local_loop_unroll_factor = 4;
   int future_unroll_ct = cl->unrolled_count() * 2;
   if (future_unroll_ct > LoopMaxUnroll) return false;
 
+  if (UseSuperWord) {
+    if (cl->is_reduction_loop() == false) phase->mark_reductions(this);
+
+    // Only attempt slp analysis when user controls do not prohibit it
+    if (LoopMaxUnroll > _local_loop_unroll_factor) {
+      // Once policy_slp_analysis succeeds, mark the loop with the
+      // maximal unroll factor so that we minimize analysis passes
+      if (cl->has_passed_slp() == false) {
+        if (policy_slp_analysis(cl, phase)) {
+          if (_local_loop_unroll_factor > 4) {
+            cl->mark_passed_slp();
+            cl->set_slp_max_unroll(_local_loop_unroll_factor);
+          }
+        }
+      }
+
+      if (cl->has_passed_slp()) {
+        int slp_max_unroll_factor = cl->slp_max_unroll();
+        if ((slp_max_unroll_factor > 4) &&
+            (slp_max_unroll_factor >= future_unroll_ct)) {
+          int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+          if (new_limit > LoopUnrollLimit) {
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis is applying unroll limit %d, the original limit was %d\n",
+                            new_limit, _local_loop_unroll_limit);
+            }
+#endif
+            _local_loop_unroll_limit = new_limit;
+          }
+        }
+      }
+    }
+  }
+
   // Check for initial stride being a small enough constant
   if (abs(cl->stride_con()) > (1<<2)*future_unroll_ct) return false;
 
@@ -748,7 +786,7 @@
   }
 
   // Check for being too big
-  if (body_size > (uint)LoopUnrollLimit) {
+  if (body_size > (uint)_local_loop_unroll_limit) {
     if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
     // Normal case: loop too big
     return false;
@@ -758,6 +796,183 @@
   return true;
 }
 
+//------------------------------policy_slp_analysis----------------------------
+// Return TRUE unless the loop was found unsuitable for slp. Walks the loop
+// body, records the nodes to skip in ignored_loop_nodes, and sets
+// _local_loop_unroll_factor to the smallest Matcher::max_vector_size() among
+// T_INT and the remaining nodes whose vector form is implemented. A rejected
+// loop is marked via mark_passed_slp() so that it is not analyzed again. Also
+// returns TRUE, leaving the factor untouched, when SuperWordLoopUnrollAnalysis
+// is off or SuperWord reports an early return.
+bool IdealLoopTree::policy_slp_analysis( CountedLoopNode *cl, PhaseIdealLoop *phase ) {
+  // SLP analysis
+  bool not_slp = false;
+
+  // Enable this functionality target by target as needed
+  if (SuperWordLoopUnrollAnalysis) {
+    SuperWord sw(phase);
+    sw.transform_loop(this, false);
+
+    // If the loop is slp canonical analyze it
+    if (sw.early_return() == false) {
+      Arena *a = Thread::current()->resource_area();
+      int max_vector = Matcher::max_vector_size(T_INT);
+      // NOTE(review): one pointer-sized slot per int entry, presumably to keep the
+      // size word aligned for Amalloc_D - confirm before shrinking to sizeof(int)
+      size_t ignored_size = _body.size()*sizeof(int*);
+      int *ignored_loop_nodes = (int*)a->Amalloc_D(ignored_size);
+      // Capacity is a node count, not the byte size of the ignored array
+      Node_Stack nstack((int)_body.size());
+      Node *cl_exit = cl->loopexit();
+
+      // First clear the entries
+      for (uint i = 0; i < _body.size(); i++) {
+        ignored_loop_nodes[i] = -1;
+      }
+
+      // Process the loop, some/all of the stack entries will not be in order, ergo
+      // need to preprocess the ignored initial state before we process the loop
+      for (uint i = 0; i < _body.size(); i++) {
+        Node* n = _body.at(i);
+        if (n == cl->incr() ||
+            n->is_reduction() ||
+            n->is_AddP() ||
+            n->is_Cmp() ||
+            n->is_IfTrue() ||
+            n->is_CountedLoop() ||
+            (n == cl_exit)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        if (n->is_If()) {
+          IfNode *iff = n->as_If();
+          if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
+            if (is_loop_exit(iff)) {
+              ignored_loop_nodes[i] = n->_idx;
+              continue;
+            }
+          }
+        }
+
+        if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+          Node* n_tail = n->in(LoopNode::LoopBackControl);
+          if (n_tail != n->in(LoopNode::EntryControl)) {
+            if (!n_tail->is_Mem()) {
+              not_slp = true;
+              break;
+            }
+          }
+        }
+
+        // This must happen after check of phi/if
+        if (n->is_Phi() || n->is_If()) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        if (n->is_LoadStore() || n->is_MergeMem() ||
+            (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+          not_slp = true;
+          break;
+        }
+
+        if (n->is_Mem()) {
+          Node* adr = n->in(MemNode::Address);
+          Node* n_ctrl = phase->get_ctrl(adr);
+
+          // save a queue of post process nodes
+          if (n_ctrl != NULL && is_member(phase->get_loop(n_ctrl))) {
+            MemNode* current = n->as_Mem();
+            BasicType bt = current->memory_type();
+            if (is_java_primitive(bt) == false) {
+              ignored_loop_nodes[i] = n->_idx;
+              continue;
+            }
+
+            // Process the memory expression
+            int stack_idx = 0;
+            bool have_side_effects = true;
+            if (adr->is_AddP() == false) {
+              nstack.push(adr, stack_idx++);
+            } else {
+              // Mark the components of the memory operation in nstack
+              SWPointer p1(current, &sw, &nstack, true);
+              have_side_effects = p1.node_stack()->is_nonempty();
+            }
+
+            // Process the pointer stack
+            while (have_side_effects) {
+              Node* pointer_node = nstack.node();
+              for (uint j = 0; j < _body.size(); j++) {
+                Node* cur_node = _body.at(j);
+                if (cur_node == pointer_node) {
+                  ignored_loop_nodes[j] = cur_node->_idx;
+                  break;
+                }
+              }
+              nstack.pop();
+              have_side_effects = nstack.is_nonempty();
+            }
+
+            // Cleanup
+            nstack.clear();
+          }
+        }
+      }
+
+      if (not_slp == false) {
+        // Now we try to find the maximum supported consistent vector which the machine
+        // description can use
+        for (uint i = 0; i < _body.size(); i++) {
+          if (ignored_loop_nodes[i] != -1) continue;
+
+          BasicType bt;
+          Node* n = _body.at(i);
+          if (n->is_Store()) {
+            bt = n->as_Mem()->memory_type();
+          } else {
+            bt = n->bottom_type()->basic_type();
+          }
+
+          int cur_max_vector = Matcher::max_vector_size(bt);
+
+          // If a max vector exists which is not larger than _local_loop_unroll_factor
+          // stop looking, we already have the max vector to map to.
+          if (cur_max_vector <= _local_loop_unroll_factor) {
+            not_slp = true;
+#ifndef PRODUCT
+            if (TraceSuperWordLoopUnrollAnalysis) {
+              tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
+            }
+#endif
+            break;
+          }
+
+          // Map the maximal common vector
+          if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
+            if (cur_max_vector < max_vector) {
+              max_vector = cur_max_vector;
+            }
+          }
+        }
+        if (not_slp == false) _local_loop_unroll_factor = max_vector;
+      }
+
+      if (not_slp) {
+        // Mark the loop as processed so that we do not try again
+        cl->mark_passed_slp();
+        cl->set_slp_max_unroll(_local_loop_unroll_factor);
+      }
+
+      // Now clean things up
+      a->Afree(ignored_loop_nodes, ignored_size);
+    }
+  }
+
+  return (not_slp == false);
+}
+
 //------------------------------policy_align-----------------------------------
 // Return TRUE or FALSE if the loop should be cache-line aligned. Gather the
 // expression that does the alignment. Note that only one array base can be
@@ -1551,6 +1766,7 @@
               for (unsigned j = 1; j < def_node->req(); j++) {
                 Node* in = def_node->in(j);
                 if (in == phi) {
+                  loop_head->mark_has_reductions();
                   def_node->add_flag(Node::Flag_is_reduction);
                   break;
                 }
@@ -2401,7 +2617,6 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
-      phase->mark_reductions(this);
       phase->do_unroll(this, old_new, true);
     }
 