< prev index next >

src/share/vm/opto/superword.cpp

Print this page




  34 #include "opto/memnode.hpp"
  35 #include "opto/mulnode.hpp"
  36 #include "opto/opcodes.hpp"
  37 #include "opto/opaquenode.hpp"
  38 #include "opto/superword.hpp"
  39 #include "opto/vectornode.hpp"
  40 #include "opto/movenode.hpp"
  41 
  42 //
  43 //                  S U P E R W O R D   T R A N S F O R M
  44 //=============================================================================
  45 
  46 //------------------------------SuperWord---------------------------
  47 SuperWord::SuperWord(PhaseIdealLoop* phase) :
  48   _phase(phase),
  49   _igvn(phase->_igvn),
  50   _arena(phase->C->comp_arena()),
  51   _packset(arena(), 8,  0, NULL),         // packs for the current block
  52   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  53   _block(arena(), 8,  0, NULL),           // nodes in current block

  54   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  55   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  56   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  57   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  58   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  59   _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
  60   _align_to_ref(NULL),                    // memory reference to align vectors to
  61   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  62   _dg(_arena),                            // dependence graph
  63   _visited(arena()),                      // visited node set
  64   _post_visited(arena()),                 // post visited node set
  65   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  66   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  67   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  68   _lpt(NULL),                             // loop tree node
  69   _lp(NULL),                              // LoopNode
  70   _bb(NULL),                              // basic block
  71   _iv(NULL),                              // induction var
  72   _race_possible(false),                  // cases where SDMU is true
  73   _early_return(true),                    // analysis evaluations routine


  82 #ifndef PRODUCT
  83   _vector_loop_debug = 0;
  84   if (_phase->C->method() != NULL) {
  85     _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  86   }
  87 
  88 #endif
  89 }
  90 
  91 //------------------------------transform_loop---------------------------
  92 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  93   assert(UseSuperWord, "should be");
  94   // Do vectors exist on this architecture?
  95   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  96 
  97   assert(lpt->_head->is_CountedLoop(), "must be");
  98   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
  99 
 100   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 101 
 102   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops




















 103   // Check for no control flow in body (other than exit)
 104   Node *cl_exit = cl->loopexit();
 105   if (cl_exit->in(0) != lpt->_head) {
 106     #ifndef PRODUCT
 107       if (TraceSuperWord) {
 108         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
 109         tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
 110         tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
 111         tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
 112         lpt->dump_head();
 113       }
 114     #endif
 115     return;
 116   }
 117 
 118   // Make sure the are no extra control users of the loop backedge
 119   if (cl->back_control()->outcnt() != 1) {
 120     return;
 121   }
 122 
 123   // We only re-enter slp when we vector mapped a queried loop and we want to
 124   // continue unrolling, in this case, slp is not subsequently done.
 125   if (cl->do_unroll_only()) return;
 126 

 127   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 128   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 129   if (pre_end == NULL) return;
 130   Node *pre_opaq1 = pre_end->limit();
 131   if (pre_opaq1->Opcode() != Op_Opaque1) return;

 132 
 133   init(); // initialize data structures
 134 
 135   set_lpt(lpt);
 136   set_lp(cl);
 137 
 138   // For now, define one block which is the entire loop body
 139   set_bb(cl);
 140 
 141   if (do_optimization) {
 142     assert(_packset.length() == 0, "packset must be empty");
 143     SLP_extract();













 144   }
 145 }
 146 
 147 //------------------------------early unrolling analysis------------------------------
 148 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
 149   bool is_slp = true;
 150   ResourceMark rm;
 151   size_t ignored_size = lpt()->_body.size();
 152   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 153   Node_Stack nstack((int)ignored_size);
 154   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 155   Node *cl_exit = cl->loopexit();



 156 
 157   // First clear the entries
 158   for (uint i = 0; i < lpt()->_body.size(); i++) {
 159     ignored_loop_nodes[i] = -1;
 160   }
 161 
 162   int max_vector = Matcher::max_vector_size(T_INT);

 163 
 164   // Process the loop, some/all of the stack entries will not be in order, ergo
 165   // need to preprocess the ignored initial state before we process the loop
 166   for (uint i = 0; i < lpt()->_body.size(); i++) {
 167     Node* n = lpt()->_body.at(i);
 168     if (n == cl->incr() ||
 169       n->is_reduction() ||
 170       n->is_AddP() ||
 171       n->is_Cmp() ||
 172       n->is_IfTrue() ||
 173       n->is_CountedLoop() ||
 174       (n == cl_exit)) {
 175       ignored_loop_nodes[i] = n->_idx;
 176       continue;
 177     }
 178 
 179     if (n->is_If()) {
 180       IfNode *iff = n->as_If();
 181       if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
 182         if (lpt()->is_loop_exit(iff)) {


 241         // Process the pointer stack
 242         while (have_side_effects) {
 243           Node* pointer_node = nstack.node();
 244           for (uint j = 0; j < lpt()->_body.size(); j++) {
 245             Node* cur_node = lpt()->_body.at(j);
 246             if (cur_node == pointer_node) {
 247               ignored_loop_nodes[j] = cur_node->_idx;
 248               break;
 249             }
 250           }
 251           nstack.pop();
 252           have_side_effects = nstack.is_nonempty();
 253         }
 254       }
 255     }
 256   }
 257 
 258   if (is_slp) {
 259     // Now we try to find the maximum supported consistent vector which the machine
 260     // description can use

 261     for (uint i = 0; i < lpt()->_body.size(); i++) {
 262       if (ignored_loop_nodes[i] != -1) continue;
 263 
 264       BasicType bt;
 265       Node* n = lpt()->_body.at(i);
 266       if (n->is_Mem()) {
 267         bt = n->as_Mem()->memory_type();
 268       } else {
 269         bt = n->bottom_type()->basic_type();
 270       }


















 271       if (is_java_primitive(bt) == false) continue;
 272 
 273       int cur_max_vector = Matcher::max_vector_size(bt);
 274 
 275       // If a max vector exists which is not larger than _local_loop_unroll_factor
 276       // stop looking, we already have the max vector to map to.
 277       if (cur_max_vector < local_loop_unroll_factor) {
 278         is_slp = false;
 279         if (TraceSuperWordLoopUnrollAnalysis) {
 280           tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
 281         }
 282         break;
 283       }
 284 
 285       // Map the maximal common vector
 286       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 287         if (cur_max_vector < max_vector) {
 288           max_vector = cur_max_vector;
 289         }






 290       }
 291     }
 292     if (is_slp) {
 293       local_loop_unroll_factor = max_vector;
 294       cl->mark_passed_slp();
 295     }
 296     cl->mark_was_slp();





 297     cl->set_slp_max_unroll(local_loop_unroll_factor);
 298   }


 299 }
 300 
 301 //------------------------------SLP_extract---------------------------
 302 // Extract the superword level parallelism
 303 //
 304 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 305 //    this list from first to last, all definitions are visited before their uses.
 306 //
 307 // 2) A point-to-point dependence graph is constructed between memory references.
 308 //    This simplies the upcoming "independence" checker.
 309 //
 310 // 3) The maximum depth in the node graph from the beginning of the block
 311 //    to each node is computed.  This is used to prune the graph search
 312 //    in the independence checker.
 313 //
 314 // 4) For integer types, the necessary bit width is propagated backwards
 315 //    from stores to allow packed operations on byte, char, and short
 316 //    integers.  This reverses the promotion to type "int" that javac
 317 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 318 //


 332 //    inserting scalar promotion, vector creation from multiple scalars, and
 333 //    extraction of scalar values from vectors.
 334 //
 335 void SuperWord::SLP_extract() {
 336 
 337 #ifndef PRODUCT
 338   if (_do_vector_loop && TraceSuperWord) {
 339     tty->print("SuperWord::SLP_extract\n");
 340     tty->print("input loop\n");
 341     _lpt->dump_head();
 342     _lpt->dump();
 343     for (uint i = 0; i < _lpt->_body.size(); i++) {
 344       _lpt->_body.at(i)->dump();
 345     }
 346   }
 347 #endif
 348   // Ready the block
 349   if (!construct_bb()) {
 350     return; // Exit if no interesting nodes or complex graph.
 351   }

 352   // build    _dg, _disjoint_ptrs
 353   dependence_graph();
 354 
 355   // compute function depth(Node*)
 356   compute_max_depth();
 357 



 358   if (_do_vector_loop) {
 359     if (mark_generations() != -1) {
 360       hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
 361 
 362       if (!construct_bb()) {
 363         return; // Exit if no interesting nodes or complex graph.
 364       }
 365       dependence_graph();
 366       compute_max_depth();
 367     }
 368 
 369 #ifndef PRODUCT
 370     if (TraceSuperWord) {
 371       tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
 372       _lpt->dump_head();
 373       for (int j = 0; j < _block.length(); j++) {
 374         Node* n = _block.at(j);
 375         int d = depth(n);
 376         for (int i = 0;  i < d; i++) tty->print("%s", "  ");
 377         tty->print("%d :", d);


 392   if (_do_vector_loop) {
 393     if (_packset.length() == 0) {
 394       if (TraceSuperWord) {
 395         tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
 396       }
 397       pack_parallel();
 398     }
 399   }
 400 
 401   combine_packs();
 402 
 403   construct_my_pack_map();
 404 
 405   if (_do_vector_loop) {
 406     merge_packs_to_cmovd();
 407   }
 408 
 409   filter_packs();
 410 
 411   schedule();

































 412 
 413   output();
 414 }
 415 
 416 //------------------------------find_adjacent_refs---------------------------
 417 // Find the adjacent memory references and create pack pairs for them.
 418 // This is the initial set of packs that will then be extended by
 419 // following use->def and def->use links.  The align positions are
 420 // assigned relative to the reference "align_to_ref"
 421 void SuperWord::find_adjacent_refs() {
 422   // Get list of memory operations
 423   Node_List memops;
 424   for (int i = 0; i < _block.length(); i++) {
 425     Node* n = _block.at(i);
 426     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 427         is_java_primitive(n->as_Mem()->memory_type())) {
 428       int align = memory_alignment(n->as_Mem(), 0);
 429       if (align != bottom_align) {
 430         memops.push(n);
 431       }


 793   } else {
 794     // This memory op is not dependent on iv (scale == 0)
 795     iv_adjustment = 0;
 796   }
 797 
 798 #ifndef PRODUCT
 799   if (TraceSuperWord) {
 800     tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
 801       mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 802     mem_ref->dump();
 803   }
 804 #endif
 805   return iv_adjustment;
 806 }
 807 
 808 //---------------------------dependence_graph---------------------------
 809 // Construct dependency graph.
 810 // Add dependence edges to load/store nodes for memory dependence
 811 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 812 void SuperWord::dependence_graph() {

 813   // First, assign a dependence node to each memory node
 814   for (int i = 0; i < _block.length(); i++ ) {
 815     Node *n = _block.at(i);
 816     if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
 817       _dg.make_node(n);
 818     }
 819   }
 820 
 821   // For each memory slice, create the dependences
 822   for (int i = 0; i < _mem_slice_head.length(); i++) {
 823     Node* n      = _mem_slice_head.at(i);
 824     Node* n_tail = _mem_slice_tail.at(i);
 825 
 826     // Get slice in predecessor order (last is first)

 827     mem_slice_preds(n_tail, n, _nlist);

 828 
 829 #ifndef PRODUCT
 830     if(TraceSuperWord && Verbose) {
 831       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
 832       for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 833         _nlist.at(j)->dump();
 834       }
 835     }
 836 #endif
 837     // Make the slice dependent on the root
 838     DepMem* slice = _dg.dep(n);
 839     _dg.make_edge(_dg.root(), slice);
 840 
 841     // Create a sink for the slice
 842     DepMem* slice_sink = _dg.make_node(NULL);
 843     _dg.make_edge(slice_sink, _dg.tail());
 844 
 845     // Now visit each pair of memory ops, creating the edges
 846     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 847       Node* s1 = _nlist.at(j);


2011   if(whole) {
2012     tty->print_cr("\n Whole loop tree");
2013     _phase->dump();
2014     tty->print_cr(" End of whole loop tree\n");
2015   }
2016 }
2017 #endif
2018 
2019 //------------------------------output---------------------------
2020 // Convert packs into vector node operations
2021 void SuperWord::output() {
2022   if (_packset.length() == 0) return;
2023 
2024 #ifndef PRODUCT
2025   if (TraceLoopOpts) {
2026     tty->print("SuperWord::output    ");
2027     lpt()->dump_head();
2028   }
2029 #endif
2030 


2031   // MUST ENSURE main loop's initial value is properly aligned:
2032   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2033 
2034   align_initial_loop_index(align_to_ref());
2035 
2036   // Insert extract (unpack) operations for scalar uses
2037   for (int i = 0; i < _packset.length(); i++) {
2038     insert_extracts(_packset.at(i));
2039   }

2040 
2041   Compile* C = _phase->C;
2042   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2043   uint max_vlen_in_bytes = 0;
2044   uint max_vlen = 0;

2045 
2046   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2047 
2048   CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2049 
2050   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2051 
2052   if (do_reserve_copy() && !make_reversable.has_reserved()) {
2053     NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
2054     return;
2055   }
2056 
2057   for (int i = 0; i < _block.length(); i++) {
2058     Node* n = _block.at(i);
2059     Node_List* p = my_pack(n);
2060     if (p && n == executed_last(p)) {
2061       uint vlen = p->size();
2062       uint vlen_in_bytes = 0;
2063       Node* vn = NULL;
2064       Node* low_adr = p->at(0);
2065       Node* first   = executed_first(p);




2066       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2067       int   opc = n->Opcode();
2068       if (n->is_Load()) {
2069         Node* ctl = n->in(MemNode::Control);
2070         Node* mem = first->in(MemNode::Memory);
2071         SWPointer p1(n->as_Mem(), this, NULL, false);
2072         // Identify the memory dependency for the new loadVector node by
2073         // walking up through memory chain.
2074         // This is done to give flexibility to the new loadVector node so that
2075         // it can move above independent storeVector nodes.
2076         while (mem->is_StoreVector()) {
2077           SWPointer p2(mem->as_Mem(), this, NULL, false);
2078           int cmp = p1.cmp(p2);
2079           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
2080             mem = mem->in(MemNode::Memory);
2081           } else {
2082             break; // dependent memory
2083           }
2084         }
2085         Node* adr = low_adr->in(MemNode::Address);


2135           in2 = tmp;
2136         }
2137         if (node_isa_reduction) {
2138           const Type *arith_type = n->bottom_type();
2139           vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2140           if (in2->is_Load()) {
2141             vlen_in_bytes = in2->as_LoadVector()->memory_size();
2142           } else {
2143             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2144           }
2145         } else {
2146           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2147           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2148         }
2149       } else if (opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
2150         // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
2151         Node* in = vector_opd(p, 1);
2152         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2153         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2154       } else if (is_cmov_pack(p)) {




2155         if (!n->is_CMove()) {
2156           continue;
2157         }
2158         // place here CMoveVDNode
2159         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
2160         Node* bol = n->in(CMoveNode::Condition);
2161         if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
2162           NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
2163           bol = bol->in(1); //may be ExtractNode
2164         }
2165 
2166         assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2167         if (!bol->is_Bool()) {
2168           if (do_reserve_copy()) {
2169             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
2170             return; //and reverse to backup IG
2171           }
2172           ShouldNotReachHere();
2173         }
2174 


2199         const TypeVect* vt = TypeVect::make(bt, vlen);
2200         vn = new CMoveVDNode(cc, src1, src2, vt);
2201         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2202       } else {
2203         if (do_reserve_copy()) {
2204           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
2205           return; //and reverse to backup IG
2206         }
2207         ShouldNotReachHere();
2208       }
2209 
2210       assert(vn != NULL, "sanity");
2211       if (vn == NULL) {
2212         if (do_reserve_copy()){
2213           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
2214           return; //and reverse to backup IG
2215         }
2216         ShouldNotReachHere();
2217       }
2218 

2219       _igvn.register_new_node_with_optimizer(vn);
2220       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
2221       for (uint j = 0; j < p->size(); j++) {
2222         Node* pm = p->at(j);
2223         _igvn.replace_node(pm, vn);
2224       }
2225       _igvn._worklist.push(vn);
2226 








2227       if (vlen_in_bytes > max_vlen_in_bytes) {
2228         max_vlen = vlen;
2229         max_vlen_in_bytes = vlen_in_bytes;
2230       }
2231 #ifdef ASSERT
2232       if (TraceNewVectors) {
2233         tty->print("new Vector node: ");
2234         vn->dump();
2235       }
2236 #endif
2237     }
2238   }//for (int i = 0; i < _block.length(); i++)
2239 
2240   C->set_max_vector_size(max_vlen_in_bytes);
2241 
2242   if (SuperWordLoopUnrollAnalysis) {
2243     if (cl->has_passed_slp()) {
2244       uint slp_max_unroll_factor = cl->slp_max_unroll();
2245       if (slp_max_unroll_factor == max_vlen) {
2246         if (TraceSuperWordLoopUnrollAnalysis) {
2247           tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
2248         }
2249         // For atomic unrolled loops which are vector mapped, instigate more unrolling.

2250         cl->set_notpassed_slp();
2251         // if vector resources are limited, do not allow additional unrolling



2252         if (FLOATPRESSURE > 8) {
2253           C->set_major_progress();
2254         }
2255         cl->mark_do_unroll_only();



2256         if (do_reserve_copy()) {
2257           cl->mark_loop_vectorized();
















2258         }
2259       }
2260     }
2261   }
2262 
2263   if (do_reserve_copy()) {
2264     make_reversable.use_new();
2265   }
2266   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2267   return;
2268 }
2269 
2270 //------------------------------vector_opd---------------------------
2271 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
2272 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
2273   Node* p0 = p->at(0);
2274   uint vlen = p->size();
2275   Node* opd = p0->in(opd_idx);






2276 
2277   if (same_inputs(p, opd_idx)) {
2278     if (opd->is_Vector() || opd->is_LoadVector()) {
2279       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
2280       if (opd_idx == 2 && VectorNode::is_shift(p0)) {
2281         NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
2282         return NULL;
2283       }
2284       return opd; // input is matching vector
2285     }
2286     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
2287       Compile* C = _phase->C;
2288       Node* cnt = opd;
2289       // Vector instructions do not mask shift count, do it here.
2290       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
2291       const TypeInt* t = opd->find_int_type();
2292       if (t != NULL && t->is_con()) {
2293         juint shift = t->get_con();
2294         if (shift > mask) { // Unsigned cmp
2295           cnt = ConNode::make(TypeInt::make(shift & mask));


3072 }
3073 
3074 //----------------------------get_pre_loop_end---------------------------
3075 // Find pre loop end from main loop.  Returns null if none.
3076 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
3077   // The loop cannot be optimized if the graph shape at
3078   // the loop entry is inappropriate.
3079   if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3080     return NULL;
3081   }
3082 
3083   Node* p_f = cl->in(LoopNode::EntryControl)->in(0)->in(0);
3084   if (!p_f->is_IfFalse()) return NULL;
3085   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3086   CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3087   CountedLoopNode* loop_node = pre_end->loopnode();
3088   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3089   return pre_end;
3090 }
3091 
3092 
3093 //------------------------------init---------------------------
3094 void SuperWord::init() {
3095   _dg.init();
3096   _packset.clear();
3097   _disjoint_ptrs.clear();
3098   _block.clear();

3099   _data_entry.clear();
3100   _mem_slice_head.clear();
3101   _mem_slice_tail.clear();
3102   _iteration_first.clear();
3103   _iteration_last.clear();
3104   _node_info.clear();
3105   _align_to_ref = NULL;
3106   _lpt = NULL;
3107   _lp = NULL;
3108   _bb = NULL;
3109   _iv = NULL;
3110   _race_possible = 0;
3111   _early_return = false;
3112   _num_work_vecs = 0;
3113   _num_reductions = 0;
3114 }
3115 
3116 //------------------------------restart---------------------------
3117 void SuperWord::restart() {
3118   _dg.init();
3119   _packset.clear();
3120   _disjoint_ptrs.clear();
3121   _block.clear();

3122   _data_entry.clear();
3123   _mem_slice_head.clear();
3124   _mem_slice_tail.clear();
3125   _node_info.clear();
3126 }
3127 
3128 //------------------------------print_packset---------------------------
3129 void SuperWord::print_packset() {
3130 #ifndef PRODUCT
3131   tty->print_cr("packset");
3132   for (int i = 0; i < _packset.length(); i++) {
3133     tty->print_cr("Pack: %d", i);
3134     Node_List* p = _packset.at(i);
3135     print_pack(p);
3136   }
3137 #endif
3138 }
3139 
3140 //------------------------------print_pack---------------------------
3141 void SuperWord::print_pack(Node_List* p) {




  34 #include "opto/memnode.hpp"
  35 #include "opto/mulnode.hpp"
  36 #include "opto/opcodes.hpp"
  37 #include "opto/opaquenode.hpp"
  38 #include "opto/superword.hpp"
  39 #include "opto/vectornode.hpp"
  40 #include "opto/movenode.hpp"
  41 
  42 //
  43 //                  S U P E R W O R D   T R A N S F O R M
  44 //=============================================================================
  45 
  46 //------------------------------SuperWord---------------------------
  47 SuperWord::SuperWord(PhaseIdealLoop* phase) :
  48   _phase(phase),
  49   _igvn(phase->_igvn),
  50   _arena(phase->C->comp_arena()),
  51   _packset(arena(), 8,  0, NULL),         // packs for the current block
  52   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  53   _block(arena(), 8,  0, NULL),           // nodes in current block
  54   _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
  55   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  56   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  57   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  58   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  59   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  60   _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
  61   _align_to_ref(NULL),                    // memory reference to align vectors to
  62   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  63   _dg(_arena),                            // dependence graph
  64   _visited(arena()),                      // visited node set
  65   _post_visited(arena()),                 // post visited node set
  66   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  67   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  68   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  69   _lpt(NULL),                             // loop tree node
  70   _lp(NULL),                              // LoopNode
  71   _bb(NULL),                              // basic block
  72   _iv(NULL),                              // induction var
  73   _race_possible(false),                  // cases where SDMU is true
  74   _early_return(true),                    // analysis evaluations routine


  83 #ifndef PRODUCT
  84   _vector_loop_debug = 0;
  85   if (_phase->C->method() != NULL) {
  86     _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  87   }
  88 
  89 #endif
  90 }
  91 
  92 //------------------------------transform_loop---------------------------
  93 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  94   assert(UseSuperWord, "should be");
  95   // Do vectors exist on this architecture?
  96   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  97 
  98   assert(lpt->_head->is_CountedLoop(), "must be");
  99   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 100 
 101   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 102 
 103   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 104   if (post_loop_allowed) {
 105     if (cl->is_reduction_loop()) return; // no predication mapping
 106     Node *limit = cl->limit();
 107     if (limit->is_Con()) return; // non constant limits only
 108     // Now check the limit for expressions we do not handle
 109     if (limit->is_Add()) {
 110       Node *in2 = limit->in(2);
 111       if (in2->is_Con()) {
 112         int val = in2->get_int();
 113         // should not try to program these cases
 114         if (val < 0) return;
 115       }
 116     }
 117   }
 118 
 119   // skip any loop that has not been assigned max unroll by analysis
 120   if (do_optimization) {
 121     if (cl->slp_max_unroll() == 0) return;
 122   }
 123 
 124   // Check for no control flow in body (other than exit)
 125   Node *cl_exit = cl->loopexit();
 126   if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
 127     #ifndef PRODUCT
 128       if (TraceSuperWord) {
 129         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
 130         tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
 131         tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
 132         tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
 133         lpt->dump_head();
 134       }
 135     #endif
 136     return;
 137   }
 138 
 139   // Make sure the are no extra control users of the loop backedge
 140   if (cl->back_control()->outcnt() != 1) {
 141     return;
 142   }
 143 
 144   // Skip any loops already optimized by slp
 145   if (cl->is_vectorized_loop()) return;

 146 
 147   if (cl->is_main_loop()) {
 148     // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 149     CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 150     if (pre_end == NULL) return;
 151     Node *pre_opaq1 = pre_end->limit();
 152     if (pre_opaq1->Opcode() != Op_Opaque1) return;
 153   }
 154 
 155   init(); // initialize data structures
 156 
 157   set_lpt(lpt);
 158   set_lp(cl);
 159 
 160   // For now, define one block which is the entire loop body
 161   set_bb(cl);
 162 
 163   if (do_optimization) {
 164     assert(_packset.length() == 0, "packset must be empty");
 165     SLP_extract();
 166     if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
 167       if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
 168         IdealLoopTree *lpt_next = lpt->_next;
 169         CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
 170         _phase->has_range_checks(lpt_next);
 171         if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
 172           if (!cl_next->is_vectorized_loop()) {
 173             int slp_max_unroll_factor = cl->slp_max_unroll();
 174             cl_next->set_slp_max_unroll(slp_max_unroll_factor);
 175           }
 176         }
 177       }
 178     }
 179   }
 180 }
 181 
 182 //------------------------------early unrolling analysis------------------------------
 183 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
 184   bool is_slp = true;
 185   ResourceMark rm;
 186   size_t ignored_size = lpt()->_body.size();
 187   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 188   Node_Stack nstack((int)ignored_size);
 189   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 190   Node *cl_exit = cl->loopexit();
 191   int rpo_idx = _post_block.length();
 192 
 193   assert(rpo_idx == 0, "post loop block is empty");
 194 
 195   // First clear the entries
 196   for (uint i = 0; i < lpt()->_body.size(); i++) {
 197     ignored_loop_nodes[i] = -1;
 198   }
 199 
 200   int max_vector = Matcher::max_vector_size(T_INT);
 201   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 202 
 203   // Process the loop, some/all of the stack entries will not be in order, ergo
 204   // need to preprocess the ignored initial state before we process the loop
 205   for (uint i = 0; i < lpt()->_body.size(); i++) {
 206     Node* n = lpt()->_body.at(i);
 207     if (n == cl->incr() ||
 208       n->is_reduction() ||
 209       n->is_AddP() ||
 210       n->is_Cmp() ||
 211       n->is_IfTrue() ||
 212       n->is_CountedLoop() ||
 213       (n == cl_exit)) {
 214       ignored_loop_nodes[i] = n->_idx;
 215       continue;
 216     }
 217 
 218     if (n->is_If()) {
 219       IfNode *iff = n->as_If();
 220       if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
 221         if (lpt()->is_loop_exit(iff)) {


 280         // Process the pointer stack
 281         while (have_side_effects) {
 282           Node* pointer_node = nstack.node();
 283           for (uint j = 0; j < lpt()->_body.size(); j++) {
 284             Node* cur_node = lpt()->_body.at(j);
 285             if (cur_node == pointer_node) {
 286               ignored_loop_nodes[j] = cur_node->_idx;
 287               break;
 288             }
 289           }
 290           nstack.pop();
 291           have_side_effects = nstack.is_nonempty();
 292         }
 293       }
 294     }
 295   }
 296 
 297   if (is_slp) {
 298     // Now we try to find the maximum supported consistent vector which the machine
 299     // description can use
 300     bool small_basic_type = false;
 301     for (uint i = 0; i < lpt()->_body.size(); i++) {
 302       if (ignored_loop_nodes[i] != -1) continue;
 303 
 304       BasicType bt;
 305       Node* n = lpt()->_body.at(i);
 306       if (n->is_Mem()) {
 307         bt = n->as_Mem()->memory_type();
 308       } else {
 309         bt = n->bottom_type()->basic_type();
 310       }
 311 
 312       if (post_loop_allowed) {
 313         if (!small_basic_type) {
 314           switch (bt) {
 315           case T_CHAR:
 316           case T_BYTE:
 317           case T_SHORT:
 318             small_basic_type = true;
 319             break;
 320           case T_LONG:
 321             // TODO: Remove when support completed for mask context with LONG.
 322             //       The ad files need to be augmented.
 323             small_basic_type = true;
 324             break;
 325           }
 326         }
 327       }
 328 
 329       if (is_java_primitive(bt) == false) continue;
 330 
 331       int cur_max_vector = Matcher::max_vector_size(bt);
 332 
 333       // If a max vector exists which is not larger than _local_loop_unroll_factor
 334       // stop looking, we already have the max vector to map to.
 335       if (cur_max_vector < local_loop_unroll_factor) {
 336         is_slp = false;
 337         if (TraceSuperWordLoopUnrollAnalysis) {
 338           tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
 339         }
 340         break;
 341       }
 342 
 343       // Map the maximal common vector
 344       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 345         if (cur_max_vector < max_vector) {
 346           max_vector = cur_max_vector;
 347         }
 348 
 349         // We only process post loops on predicated targets where we want to
 350         // mask map the loop to a single iteration
 351         if (post_loop_allowed) {
 352           _post_block.at_put_grow(rpo_idx++, n);
 353         }
 354       }
 355     }
 356     if (is_slp) {
 357       local_loop_unroll_factor = max_vector;
 358       cl->mark_passed_slp();
 359     }
 360     cl->mark_was_slp();
 361     if (cl->is_main_loop()) {
 362       cl->set_slp_max_unroll(local_loop_unroll_factor);
 363     } else if (post_loop_allowed) {
 364       if (!small_basic_type) {
 365         // avoid replication context for small basic types in programmable masked loops
 366         cl->set_slp_max_unroll(local_loop_unroll_factor);
 367       }
 368     }
 369   }
 370 }
 371 
 372 //------------------------------SLP_extract---------------------------
 373 // Extract the superword level parallelism
 374 //
 375 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 376 //    this list from first to last, all definitions are visited before their uses.
 377 //
 378 // 2) A point-to-point dependence graph is constructed between memory references.
 379 //    This simplies the upcoming "independence" checker.
 380 //
 381 // 3) The maximum depth in the node graph from the beginning of the block
 382 //    to each node is computed.  This is used to prune the graph search
 383 //    in the independence checker.
 384 //
 385 // 4) For integer types, the necessary bit width is propagated backwards
 386 //    from stores to allow packed operations on byte, char, and short
 387 //    integers.  This reverses the promotion to type "int" that javac
 388 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 389 //


 403 //    inserting scalar promotion, vector creation from multiple scalars, and
 404 //    extraction of scalar values from vectors.
 405 //
 406 void SuperWord::SLP_extract() {
 407 
 408 #ifndef PRODUCT
 409   if (_do_vector_loop && TraceSuperWord) {
 410     tty->print("SuperWord::SLP_extract\n");
 411     tty->print("input loop\n");
 412     _lpt->dump_head();
 413     _lpt->dump();
 414     for (uint i = 0; i < _lpt->_body.size(); i++) {
 415       _lpt->_body.at(i)->dump();
 416     }
 417   }
 418 #endif
 419   // Ready the block
 420   if (!construct_bb()) {
 421     return; // Exit if no interesting nodes or complex graph.
 422   }
 423 
 424   // build    _dg, _disjoint_ptrs
 425   dependence_graph();
 426 
 427   // compute function depth(Node*)
 428   compute_max_depth();
 429 
 430   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 431   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 432   if (cl->is_main_loop()) {
 433     if (_do_vector_loop) {
 434       if (mark_generations() != -1) {
 435         hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
 436 
 437         if (!construct_bb()) {
 438           return; // Exit if no interesting nodes or complex graph.
 439         }
 440         dependence_graph();
 441         compute_max_depth();
 442       }
 443 
 444 #ifndef PRODUCT
 445       if (TraceSuperWord) {
 446         tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
 447         _lpt->dump_head();
 448         for (int j = 0; j < _block.length(); j++) {
 449           Node* n = _block.at(j);
 450           int d = depth(n);
 451           for (int i = 0; i < d; i++) tty->print("%s", "  ");
 452           tty->print("%d :", d);


 467     if (_do_vector_loop) {
 468       if (_packset.length() == 0) {
 469         if (TraceSuperWord) {
 470           tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
 471         }
 472         pack_parallel();
 473       }
 474     }
 475 
 476     combine_packs();
 477 
 478     construct_my_pack_map();
 479 
 480     if (_do_vector_loop) {
 481       merge_packs_to_cmovd();
 482     }
 483 
 484     filter_packs();
 485 
 486     schedule();
 487   } else if (post_loop_allowed) {
 488     int saved_mapped_unroll_factor = cl->slp_max_unroll();
 489     if (saved_mapped_unroll_factor) {
 490       int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
 491 
 492       // now reset the slp_unroll_factor so that we can check the analysis mapped
 493       // what the vector loop was mapped to
 494       cl->set_slp_max_unroll(0);
 495 
 496       // do the analysis on the post loop
 497       unrolling_analysis(vector_mapped_unroll_factor);
 498 
 499       // if our analyzed loop is a canonical fit, start processing it
 500       if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
 501         // now add the vector nodes to packsets
 502         for (int i = 0; i < _post_block.length(); i++) {
 503           Node* n = _post_block.at(i);
 504           Node_List* singleton = new Node_List();
 505           singleton->push(n);
 506           _packset.append(singleton);
 507           set_my_pack(n, singleton);
 508         }
 509 
 510         // map base types for vector usage
 511         compute_vector_element_type();
 512       } else {
 513         return;
 514       }
 515     } else {
 516       // for some reason we could not map the slp analysis state of the vectorized loop
 517       return;
 518     }
 519   }
 520 
 521   output();
 522 }
 523 
 524 //------------------------------find_adjacent_refs---------------------------
 525 // Find the adjacent memory references and create pack pairs for them.
 526 // This is the initial set of packs that will then be extended by
 527 // following use->def and def->use links.  The align positions are
 528 // assigned relative to the reference "align_to_ref"
 529 void SuperWord::find_adjacent_refs() {
 530   // Get list of memory operations
 531   Node_List memops;
 532   for (int i = 0; i < _block.length(); i++) {
 533     Node* n = _block.at(i);
 534     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 535         is_java_primitive(n->as_Mem()->memory_type())) {
 536       int align = memory_alignment(n->as_Mem(), 0);
 537       if (align != bottom_align) {
 538         memops.push(n);
 539       }


 901   } else {
 902     // This memory op is not dependent on iv (scale == 0)
 903     iv_adjustment = 0;
 904   }
 905 
 906 #ifndef PRODUCT
 907   if (TraceSuperWord) {
 908     tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
 909       mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 910     mem_ref->dump();
 911   }
 912 #endif
 913   return iv_adjustment;
 914 }
 915 
 916 //---------------------------dependence_graph---------------------------
 917 // Construct dependency graph.
 918 // Add dependence edges to load/store nodes for memory dependence
 919 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 920 void SuperWord::dependence_graph() {
 921   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 922   // First, assign a dependence node to each memory node
 923   for (int i = 0; i < _block.length(); i++ ) {
 924     Node *n = _block.at(i);
 925     if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
 926       _dg.make_node(n);
 927     }
 928   }
 929 
 930   // For each memory slice, create the dependences
 931   for (int i = 0; i < _mem_slice_head.length(); i++) {
 932     Node* n      = _mem_slice_head.at(i);
 933     Node* n_tail = _mem_slice_tail.at(i);
 934 
 935     // Get slice in predecessor order (last is first)
 936     if (cl->is_main_loop()) {
 937       mem_slice_preds(n_tail, n, _nlist);
 938     }
 939 
 940 #ifndef PRODUCT
 941     if(TraceSuperWord && Verbose) {
 942       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
 943       for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 944         _nlist.at(j)->dump();
 945       }
 946     }
 947 #endif
 948     // Make the slice dependent on the root
 949     DepMem* slice = _dg.dep(n);
 950     _dg.make_edge(_dg.root(), slice);
 951 
 952     // Create a sink for the slice
 953     DepMem* slice_sink = _dg.make_node(NULL);
 954     _dg.make_edge(slice_sink, _dg.tail());
 955 
 956     // Now visit each pair of memory ops, creating the edges
 957     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 958       Node* s1 = _nlist.at(j);


2122   if(whole) {
2123     tty->print_cr("\n Whole loop tree");
2124     _phase->dump();
2125     tty->print_cr(" End of whole loop tree\n");
2126   }
2127 }
2128 #endif
2129 
2130 //------------------------------output---------------------------
2131 // Convert packs into vector node operations
2132 void SuperWord::output() {
2133   if (_packset.length() == 0) return;
2134 
2135 #ifndef PRODUCT
2136   if (TraceLoopOpts) {
2137     tty->print("SuperWord::output    ");
2138     lpt()->dump_head();
2139   }
2140 #endif
2141 
2142   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2143   if (cl->is_main_loop()) {
2144     // MUST ENSURE main loop's initial value is properly aligned:
2145     //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2146 
2147     align_initial_loop_index(align_to_ref());
2148 
2149     // Insert extract (unpack) operations for scalar uses
2150     for (int i = 0; i < _packset.length(); i++) {
2151       insert_extracts(_packset.at(i));
2152     }
2153   }
2154 
2155   Compile* C = _phase->C;

2156   uint max_vlen_in_bytes = 0;
2157   uint max_vlen = 0;
2158   bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
2159 
2160   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2161 
2162   CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2163 
2164   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2165 
2166   if (do_reserve_copy() && !make_reversable.has_reserved()) {
2167     NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
2168     return;
2169   }
2170 
2171   for (int i = 0; i < _block.length(); i++) {
2172     Node* n = _block.at(i);
2173     Node_List* p = my_pack(n);
2174     if (p && n == executed_last(p)) {
2175       uint vlen = p->size();
2176       uint vlen_in_bytes = 0;
2177       Node* vn = NULL;
2178       Node* low_adr = p->at(0);
2179       Node* first   = executed_first(p);
2180       if (can_process_post_loop) {
2181         // override vlen with the main loops vector length
2182         vlen = cl->slp_max_unroll();
2183       }
2184       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2185       int   opc = n->Opcode();
2186       if (n->is_Load()) {
2187         Node* ctl = n->in(MemNode::Control);
2188         Node* mem = first->in(MemNode::Memory);
2189         SWPointer p1(n->as_Mem(), this, NULL, false);
2190         // Identify the memory dependency for the new loadVector node by
2191         // walking up through memory chain.
2192         // This is done to give flexibility to the new loadVector node so that
2193         // it can move above independent storeVector nodes.
2194         while (mem->is_StoreVector()) {
2195           SWPointer p2(mem->as_Mem(), this, NULL, false);
2196           int cmp = p1.cmp(p2);
2197           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
2198             mem = mem->in(MemNode::Memory);
2199           } else {
2200             break; // dependent memory
2201           }
2202         }
2203         Node* adr = low_adr->in(MemNode::Address);


2253           in2 = tmp;
2254         }
2255         if (node_isa_reduction) {
2256           const Type *arith_type = n->bottom_type();
2257           vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2258           if (in2->is_Load()) {
2259             vlen_in_bytes = in2->as_LoadVector()->memory_size();
2260           } else {
2261             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2262           }
2263         } else {
2264           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2265           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2266         }
2267       } else if (opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
2268         // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
2269         Node* in = vector_opd(p, 1);
2270         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2271         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2272       } else if (is_cmov_pack(p)) {
2273         if (can_process_post_loop) {
2274           // do not refactor of flow in post loop context
2275           return;
2276         }
2277         if (!n->is_CMove()) {
2278           continue;
2279         }
2280         // place here CMoveVDNode
2281         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
2282         Node* bol = n->in(CMoveNode::Condition);
2283         if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
2284           NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
2285           bol = bol->in(1); //may be ExtractNode
2286         }
2287 
2288         assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2289         if (!bol->is_Bool()) {
2290           if (do_reserve_copy()) {
2291             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
2292             return; //and reverse to backup IG
2293           }
2294           ShouldNotReachHere();
2295         }
2296 


2321         const TypeVect* vt = TypeVect::make(bt, vlen);
2322         vn = new CMoveVDNode(cc, src1, src2, vt);
2323         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2324       } else {
2325         if (do_reserve_copy()) {
2326           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
2327           return; //and reverse to backup IG
2328         }
2329         ShouldNotReachHere();
2330       }
2331 
2332       assert(vn != NULL, "sanity");
2333       if (vn == NULL) {
2334         if (do_reserve_copy()){
2335           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
2336           return; //and reverse to backup IG
2337         }
2338         ShouldNotReachHere();
2339       }
2340 
2341       _block.at_put(i, vn);
2342       _igvn.register_new_node_with_optimizer(vn);
2343       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
2344       for (uint j = 0; j < p->size(); j++) {
2345         Node* pm = p->at(j);
2346         _igvn.replace_node(pm, vn);
2347       }
2348       _igvn._worklist.push(vn);
2349 
2350       if (can_process_post_loop) {
2351         // first check if the vector size if the maximum vector which we can use on the machine,
2352         // other vector size have reduced values for predicated data mapping.
2353         if (vlen_in_bytes != (uint)MaxVectorSize) {
2354           return;
2355         }
2356       }
2357 
2358       if (vlen_in_bytes > max_vlen_in_bytes) {
2359         max_vlen = vlen;
2360         max_vlen_in_bytes = vlen_in_bytes;
2361       }
2362 #ifdef ASSERT
2363       if (TraceNewVectors) {
2364         tty->print("new Vector node: ");
2365         vn->dump();
2366       }
2367 #endif
2368     }
2369   }//for (int i = 0; i < _block.length(); i++)
2370 
2371   C->set_max_vector_size(max_vlen_in_bytes);
2372 
2373   if (SuperWordLoopUnrollAnalysis) {
2374     if (cl->has_passed_slp()) {
2375       uint slp_max_unroll_factor = cl->slp_max_unroll();
2376       if (slp_max_unroll_factor == max_vlen) {
2377         if (TraceSuperWordLoopUnrollAnalysis) {
2378           tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
2379         }
2380 
2381         // For atomic unrolled loops which are vector mapped, instigate more unrolling
2382         cl->set_notpassed_slp();
2383         if (cl->is_main_loop()) {
2384           // if vector resources are limited, do not allow additional unrolling, also
2385           // do not unroll more on pure vector loops which were not reduced so that we can
2386           // program the post loop to single iteration execution.
2387           if (FLOATPRESSURE > 8) {
2388             C->set_major_progress();

2389             cl->mark_do_unroll_only();
2390           }
2391         }
2392 
2393         if (do_reserve_copy()) {
2394           cl->mark_loop_vectorized();
2395           if (can_process_post_loop) {
2396             // Now create the difference of trip and limit and use it as our mask index.
2397             // Note: We limited the unroll of the vectorized loop so that
2398             //       only vlen-1 size iterations can remain to be mask programmed.
2399             Node *incr = cl->incr();
2400             SubINode *index = new SubINode(cl->limit(), cl->init_trip());
2401             _igvn.register_new_node_with_optimizer(index);
2402             MaskCreateINode *mask = new MaskCreateINode(_phase->get_ctrl(cl->init_trip()), index);
2403             _igvn.register_new_node_with_optimizer(mask);
2404             // make this a single iteration loop
2405             AddINode *new_incr = new AddINode(incr->in(1), mask);
2406             _igvn.register_new_node_with_optimizer(new_incr);
2407             _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
2408             _igvn.replace_node(incr, new_incr);
2409             cl->mark_is_multiversioned();
2410           }
2411         }
2412       }
2413     }
2414   }
2415 
2416   if (do_reserve_copy()) {
2417     make_reversable.use_new();
2418   }
2419   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2420   return;
2421 }
2422 
2423 //------------------------------vector_opd---------------------------
2424 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
2425 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
2426   Node* p0 = p->at(0);
2427   uint vlen = p->size();
2428   Node* opd = p0->in(opd_idx);
2429   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2430 
2431   if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
2432     // override vlen with the main loops vector length
2433     vlen = cl->slp_max_unroll();
2434   }
2435 
2436   if (same_inputs(p, opd_idx)) {
2437     if (opd->is_Vector() || opd->is_LoadVector()) {
2438       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
2439       if (opd_idx == 2 && VectorNode::is_shift(p0)) {
2440         NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
2441         return NULL;
2442       }
2443       return opd; // input is matching vector
2444     }
2445     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
2446       Compile* C = _phase->C;
2447       Node* cnt = opd;
2448       // Vector instructions do not mask shift count, do it here.
2449       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
2450       const TypeInt* t = opd->find_int_type();
2451       if (t != NULL && t->is_con()) {
2452         juint shift = t->get_con();
2453         if (shift > mask) { // Unsigned cmp
2454           cnt = ConNode::make(TypeInt::make(shift & mask));


3231 }
3232 
3233 //----------------------------get_pre_loop_end---------------------------
3234 // Find pre loop end from main loop.  Returns null if none.
3235 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
3236   // The loop cannot be optimized if the graph shape at
3237   // the loop entry is inappropriate.
3238   if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3239     return NULL;
3240   }
3241 
3242   Node* p_f = cl->in(LoopNode::EntryControl)->in(0)->in(0);
3243   if (!p_f->is_IfFalse()) return NULL;
3244   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3245   CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3246   CountedLoopNode* loop_node = pre_end->loopnode();
3247   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3248   return pre_end;
3249 }
3250 

3251 //------------------------------init---------------------------
3252 void SuperWord::init() {
3253   _dg.init();
3254   _packset.clear();
3255   _disjoint_ptrs.clear();
3256   _block.clear();
3257   _post_block.clear();
3258   _data_entry.clear();
3259   _mem_slice_head.clear();
3260   _mem_slice_tail.clear();
3261   _iteration_first.clear();
3262   _iteration_last.clear();
3263   _node_info.clear();
3264   _align_to_ref = NULL;
3265   _lpt = NULL;
3266   _lp = NULL;
3267   _bb = NULL;
3268   _iv = NULL;
3269   _race_possible = 0;
3270   _early_return = false;
3271   _num_work_vecs = 0;
3272   _num_reductions = 0;
3273 }
3274 
3275 //------------------------------restart---------------------------
3276 void SuperWord::restart() {
3277   _dg.init();
3278   _packset.clear();
3279   _disjoint_ptrs.clear();
3280   _block.clear();
3281   _post_block.clear();
3282   _data_entry.clear();
3283   _mem_slice_head.clear();
3284   _mem_slice_tail.clear();
3285   _node_info.clear();
3286 }
3287 
3288 //------------------------------print_packset---------------------------
3289 void SuperWord::print_packset() {
3290 #ifndef PRODUCT
3291   tty->print_cr("packset");
3292   for (int i = 0; i < _packset.length(); i++) {
3293     tty->print_cr("Pack: %d", i);
3294     Node_List* p = _packset.at(i);
3295     print_pack(p);
3296   }
3297 #endif
3298 }
3299 
3300 //------------------------------print_pack---------------------------
3301 void SuperWord::print_pack(Node_List* p) {


< prev index next >