< prev index next >

src/share/vm/opto/superword.cpp

Print this page




  51   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  52   _block(arena(), 8,  0, NULL),           // nodes in current block
  53   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  54   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  55   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  56   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  57   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  58   _align_to_ref(NULL),                    // memory reference to align vectors to
  59   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  60   _dg(_arena),                            // dependence graph
  61   _visited(arena()),                      // visited node set
  62   _post_visited(arena()),                 // post visited node set
  63   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  64   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  65   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  66   _lpt(NULL),                             // loop tree node
  67   _lp(NULL),                              // LoopNode
  68   _bb(NULL),                              // basic block
  69   _iv(NULL),                              // induction var
  70   _race_possible(false),                  // cases where SDMU is true

  71   _num_work_vecs(0),                      // amount of vector work we have
  72   _num_reductions(0),                     // amount of reduction work we have
  73   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  74   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  75   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  76   _ii_order(arena(), 8, 0, 0),
  77   _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
  78 {}
  79 
  80 //------------------------------transform_loop---------------------------
// Entry point for superword (SLP) vectorization of one counted loop.
// Performs a series of cheap legality bail-outs, then initializes the
// per-loop analysis state and runs the SLP extraction pass.
void SuperWord::transform_loop(IdealLoopTree* lpt) {
  assert(UseSuperWord, "should be");
  // Do vectors exist on this architecture?
  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop

  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl_exit->in(0) != lpt->_head) return;

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
  // The Opaque1 must still be present so the pre-loop limit can later be
  // adjusted for alignment (see align_initial_loop_index).
  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
  if (pre_end == NULL) return;
  Node *pre_opaq1 = pre_end->limit();
  if (pre_opaq1->Opcode() != Op_Opaque1) return;

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  assert(_packset.length() == 0, "packset must be empty");
  SLP_extract();
}
 119 
 120 //------------------------------SLP_extract---------------------------
 121 // Extract the superword level parallelism
 122 //
 123 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 124 //    this list from first to last, all definitions are visited before their uses.
 125 //
 126 // 2) A point-to-point dependence graph is constructed between memory references.
 127 //    This simplifies the upcoming "independence" checker.
 128 //
 129 // 3) The maximum depth in the node graph from the beginning of the block
 130 //    to each node is computed.  This is used to prune the graph search
 131 //    in the independence checker.
 132 //
 133 // 4) For integer types, the necessary bit width is propagated backwards
 134 //    from stores to allow packed operations on byte, char, and short
 135 //    integers.  This reverses the promotion to type "int" that javac
 136 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 137 //


 251 
 252   Node_List align_to_refs;
 253   int best_iv_adjustment = 0;
 254   MemNode* best_align_to_mem_ref = NULL;
 255 
 256   while (memops.size() != 0) {
 257     // Find a memory reference to align to.
 258     MemNode* mem_ref = find_align_to_ref(memops);
 259     if (mem_ref == NULL) break;
 260     align_to_refs.push(mem_ref);
 261     int iv_adjustment = get_iv_adjustment(mem_ref);
 262 
 263     if (best_align_to_mem_ref == NULL) {
 264       // Set memory reference which is the best from all memory operations
 265       // to be used for alignment. The pre-loop trip count is modified to align
 266       // this reference to a vector-aligned address.
 267       best_align_to_mem_ref = mem_ref;
 268       best_iv_adjustment = iv_adjustment;
 269     }
 270 
 271     SWPointer align_to_ref_p(mem_ref, this);
 272     // Set alignment relative to "align_to_ref" for all related memory operations.
 273     for (int i = memops.size() - 1; i >= 0; i--) {
 274       MemNode* s = memops.at(i)->as_Mem();
 275       if (isomorphic(s, mem_ref)) {
 276         SWPointer p2(s, this);
 277         if (p2.comparable(align_to_ref_p)) {
 278           int align = memory_alignment(s, iv_adjustment);
 279           set_alignment(s, align);
 280         }
 281       }
 282     }
 283 
 284     // Create initial pack pairs of memory operations for which
 285     // alignment is set and vectors will be aligned.
 286     bool create_pack = true;
 287     if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
 288       if (!Matcher::misaligned_vectors_ok()) {
 289         int vw = vector_width(mem_ref);
 290         int vw_best = vector_width(best_align_to_mem_ref);
 291         if (vw > vw_best) {
 292           // Do not vectorize a memory access with more elements per vector
 293           // if unaligned memory access is not allowed because number of
 294           // iterations in pre-loop will be not enough to align it.
 295           create_pack = false;
 296         } else {
 297           SWPointer p2(best_align_to_mem_ref, this);
 298           if (align_to_ref_p.invar() != p2.invar()) {
 299             // Do not vectorize memory accesses with different invariants
 300             // if unaligned memory accesses are not allowed.
 301             create_pack = false;
 302           }
 303         }
 304       }
 305     } else {
 306       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 307         // Can't allow vectorization of unaligned memory accesses with the
 308         // same type since it could be overlapped accesses to the same array.
 309         create_pack = false;
 310       } else {
 311         // Allow independent (different type) unaligned memory operations
 312         // if HW supports them.
 313         if (!Matcher::misaligned_vectors_ok()) {
 314           create_pack = false;
 315         } else {
 316           // Check if packs of the same memory type but
 317           // with a different alignment were created before.


 394   set_align_to_ref(best_align_to_mem_ref);
 395 
 396 #ifndef PRODUCT
 397   if (TraceSuperWord) {
 398     tty->print_cr("\nAfter find_adjacent_refs");
 399     print_packset();
 400   }
 401 #endif
 402 }
 403 
 404 //------------------------------find_align_to_ref---------------------------
 405 // Find a memory reference to align the loop induction variable to.
 406 // Looks first at stores then at loads, looking for a memory reference
 407 // with the largest number of references similar to it.
 408 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 409   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 410 
 411   // Count number of comparable memory ops
 412   for (uint i = 0; i < memops.size(); i++) {
 413     MemNode* s1 = memops.at(i)->as_Mem();
 414     SWPointer p1(s1, this);
 415     // Discard if pre loop can't align this reference
 416     if (!ref_is_alignable(p1)) {
 417       *cmp_ct.adr_at(i) = 0;
 418       continue;
 419     }
 420     for (uint j = i+1; j < memops.size(); j++) {
 421       MemNode* s2 = memops.at(j)->as_Mem();
 422       if (isomorphic(s1, s2)) {
 423         SWPointer p2(s2, this);
 424         if (p1.comparable(p2)) {
 425           (*cmp_ct.adr_at(i))++;
 426           (*cmp_ct.adr_at(j))++;
 427         }
 428       }
 429     }
 430   }
 431 
 432   // Find Store (or Load) with the greatest number of "comparable" references,
 433   // biggest vector size, smallest data size and smallest iv offset.
 434   int max_ct        = 0;
 435   int max_vw        = 0;
 436   int max_idx       = -1;
 437   int min_size      = max_jint;
 438   int min_iv_offset = max_jint;
 439   for (uint j = 0; j < memops.size(); j++) {
 440     MemNode* s = memops.at(j)->as_Mem();
 441     if (s->is_Store()) {
 442       int vw = vector_width_in_bytes(s);
 443       assert(vw > 1, "sanity");
 444       SWPointer p(s, this);
 445       if (cmp_ct.at(j) >  max_ct ||
 446           cmp_ct.at(j) == max_ct &&
 447             (vw >  max_vw ||
 448              vw == max_vw &&
 449               (data_size(s) <  min_size ||
 450                data_size(s) == min_size &&
 451                  (p.offset_in_bytes() < min_iv_offset)))) {
 452         max_ct = cmp_ct.at(j);
 453         max_vw = vw;
 454         max_idx = j;
 455         min_size = data_size(s);
 456         min_iv_offset = p.offset_in_bytes();
 457       }
 458     }
 459   }
 460   // If no stores, look at loads
 461   if (max_ct == 0) {
 462     for (uint j = 0; j < memops.size(); j++) {
 463       MemNode* s = memops.at(j)->as_Mem();
 464       if (s->is_Load()) {
 465         int vw = vector_width_in_bytes(s);
 466         assert(vw > 1, "sanity");
 467         SWPointer p(s, this);
 468         if (cmp_ct.at(j) >  max_ct ||
 469             cmp_ct.at(j) == max_ct &&
 470               (vw >  max_vw ||
 471                vw == max_vw &&
 472                 (data_size(s) <  min_size ||
 473                  data_size(s) == min_size &&
 474                    (p.offset_in_bytes() < min_iv_offset)))) {
 475           max_ct = cmp_ct.at(j);
 476           max_vw = vw;
 477           max_idx = j;
 478           min_size = data_size(s);
 479           min_iv_offset = p.offset_in_bytes();
 480         }
 481       }
 482     }
 483   }
 484 
 485 #ifdef ASSERT
 486   if (TraceSuperWord && Verbose) {
 487     tty->print_cr("\nVector memops after find_align_to_ref");


 558     } else if (span % vw == 0) {
 559       // If span is a multiple of vw, we can simplify formula (1) to:
 560       //   (init_offset + i * span) % vw == 0
 561       //     =>
 562       //   (init_offset % vw) + ((i * span) % vw) == 0
 563       //     =>
 564       //   init_offset % vw == 0
 565       //
 566       // Because we add a multiple of vw to the initial offset, the final
 567       // offset is a multiple of vw if and only if init_offset is a multiple.
 568       //
 569       return (init_offset % vw) == 0;
 570     }
 571   }
 572   return false;
 573 }
 574 
 575 //---------------------------get_iv_adjustment---------------------------
2576 // Calculate the loop's iv adjustment for the given memory op.
 577 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
 578   SWPointer align_to_ref_p(mem_ref, this);
 579   int offset = align_to_ref_p.offset_in_bytes();
 580   int scale  = align_to_ref_p.scale_in_bytes();
 581   int elt_size = align_to_ref_p.memory_size();
 582   int vw       = vector_width_in_bytes(mem_ref);
 583   assert(vw > 1, "sanity");
 584   int iv_adjustment;
 585   if (scale != 0) {
 586     int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
 587     // At least one iteration is executed in pre-loop by default. As result
 588     // several iterations are needed to align memory operations in main-loop even
 589     // if offset is 0.
 590     int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
 591     assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
 592            err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
 593     iv_adjustment = iv_adjustment_in_bytes/elt_size;
 594   } else {
 595     // This memory op is not dependent on iv (scale == 0)
 596     iv_adjustment = 0;
 597   }
 598 


 632         _nlist.at(j)->dump();
 633       }
 634     }
 635 #endif
 636     // Make the slice dependent on the root
 637     DepMem* slice = _dg.dep(n);
 638     _dg.make_edge(_dg.root(), slice);
 639 
 640     // Create a sink for the slice
 641     DepMem* slice_sink = _dg.make_node(NULL);
 642     _dg.make_edge(slice_sink, _dg.tail());
 643 
 644     // Now visit each pair of memory ops, creating the edges
 645     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 646       Node* s1 = _nlist.at(j);
 647 
 648       // If no dependency yet, use slice
 649       if (_dg.dep(s1)->in_cnt() == 0) {
 650         _dg.make_edge(slice, s1);
 651       }
 652       SWPointer p1(s1->as_Mem(), this);
 653       bool sink_dependent = true;
 654       for (int k = j - 1; k >= 0; k--) {
 655         Node* s2 = _nlist.at(k);
 656         if (s1->is_Load() && s2->is_Load())
 657           continue;
 658         SWPointer p2(s2->as_Mem(), this);
 659 
 660         int cmp = p1.cmp(p2);
 661         if (SuperWordRTDepCheck &&
 662             p1.base() != p2.base() && p1.valid() && p2.valid()) {
 663           // Create a runtime check to disambiguate
 664           OrderedPair pp(p1.base(), p2.base());
 665           _disjoint_ptrs.append_if_missing(pp);
 666         } else if (!SWPointer::not_equal(cmp)) {
 667           // Possibly same address
 668           _dg.make_edge(s1, s2);
 669           sink_dependent = false;
 670         }
 671       }
 672       if (sink_dependent) {
 673         _dg.make_edge(s1, slice_sink);
 674       }
 675     }
 676 #ifndef PRODUCT
 677     if (TraceSuperWord) {
 678       tty->print_cr("\nDependence graph for slice: %d", n->_idx);


 778   return false;
 779 }
 780 
 781 //------------------------------are_adjacent_refs---------------------------
 782 // Is s1 immediately before s2 in memory?
 783 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
 784   if (!s1->is_Mem() || !s2->is_Mem()) return false;
 785   if (!in_bb(s1)    || !in_bb(s2))    return false;
 786 
 787   // Do not use superword for non-primitives
 788   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
 789       !is_java_primitive(s2->as_Mem()->memory_type())) {
 790     return false;
 791   }
 792 
 793   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
 794   // only pack memops that are in the same alias set until that's fixed.
 795   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
 796       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
 797     return false;
 798   SWPointer p1(s1->as_Mem(), this);
 799   SWPointer p2(s2->as_Mem(), this);
 800   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
 801   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
 802   return diff == data_size(s1);
 803 }
 804 
 805 //------------------------------isomorphic---------------------------
 806 // Are s1 and s2 similar?
 807 bool SuperWord::isomorphic(Node* s1, Node* s2) {
 808   if (s1->Opcode() != s2->Opcode()) return false;
 809   if (s1->req() != s2->req()) return false;
 810   if (s1->in(0) != s2->in(0)) return false;
 811   if (!same_velt_type(s1, s2)) return false;
 812   return true;
 813 }
 814 
 815 //------------------------------independent---------------------------
 816 // Is there no data path from s1 to s2 or s2 to s1?
 817 bool SuperWord::independent(Node* s1, Node* s2) {
 818   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
 819   int d1 = depth(s1);


1598   // Insert extract (unpack) operations for scalar uses
1599   for (int i = 0; i < _packset.length(); i++) {
1600     insert_extracts(_packset.at(i));
1601   }
1602 
1603   Compile* C = _phase->C;
1604   uint max_vlen_in_bytes = 0;
1605   for (int i = 0; i < _block.length(); i++) {
1606     Node* n = _block.at(i);
1607     Node_List* p = my_pack(n);
1608     if (p && n == executed_last(p)) {
1609       uint vlen = p->size();
1610       uint vlen_in_bytes = 0;
1611       Node* vn = NULL;
1612       Node* low_adr = p->at(0);
1613       Node* first   = executed_first(p);
1614       int   opc = n->Opcode();
1615       if (n->is_Load()) {
1616         Node* ctl = n->in(MemNode::Control);
1617         Node* mem = first->in(MemNode::Memory);
1618         SWPointer p1(n->as_Mem(), this);
1619         // Identify the memory dependency for the new loadVector node by
1620         // walking up through memory chain.
1621         // This is done to give flexibility to the new loadVector node so that
1622         // it can move above independent storeVector nodes.
1623         while (mem->is_StoreVector()) {
1624           SWPointer p2(mem->as_Mem(), this);
1625           int cmp = p1.cmp(p2);
1626           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
1627             mem = mem->in(MemNode::Memory);
1628           } else {
1629             break; // dependent memory
1630           }
1631         }
1632         Node* adr = low_adr->in(MemNode::Address);
1633         const TypePtr* atyp = n->adr_type();
1634         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1635         vlen_in_bytes = vn->as_LoadVector()->memory_size();
1636       } else if (n->is_Store()) {
1637         // Promote value to be stored to vector
1638         Node* val = vector_opd(p, MemNode::ValueIn);
1639         Node* ctl = n->in(MemNode::Control);
1640         Node* mem = first->in(MemNode::Memory);
1641         Node* adr = low_adr->in(MemNode::Address);
1642         const TypePtr* atyp = n->adr_type();
1643         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
1644         vlen_in_bytes = vn->as_StoreVector()->memory_size();


2121           }
2122         }
2123       }
2124     }
2125   }
2126 #ifndef PRODUCT
2127   if (TraceSuperWord && Verbose) {
2128     for (int i = 0; i < _block.length(); i++) {
2129       Node* n = _block.at(i);
2130       velt_type(n)->dump();
2131       tty->print("\t");
2132       n->dump();
2133     }
2134   }
2135 #endif
2136 }
2137 
2138 //------------------------------memory_alignment---------------------------
2139 // Alignment within a vector memory reference
2140 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2141   SWPointer p(s, this);
2142   if (!p.valid()) {
2143     return bottom_align;
2144   }
2145   int vw = vector_width_in_bytes(s);
2146   if (vw < 2) {
2147     return bottom_align; // No vectors for this type
2148   }
2149   int offset  = p.offset_in_bytes();
2150   offset     += iv_adjust*p.memory_size();
2151   int off_rem = offset % vw;
2152   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2153   return off_mod;
2154 }
2155 
2156 //---------------------------container_type---------------------------
2157 // Smallest type containing range of values
2158 const Type* SuperWord::container_type(Node* n) {
2159   if (n->is_Mem()) {
2160     BasicType bt = n->as_Mem()->memory_type();
2161     if (n->is_Store() && (bt == T_CHAR)) {


2285 // to align_to_ref will be a position zero in the vector.
2286 //   (iv + k) mod vector_align == 0
2287 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2288   CountedLoopNode *main_head = lp()->as_CountedLoop();
2289   assert(main_head->is_main_loop(), "");
2290   CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2291   assert(pre_end != NULL, "we must have a correct pre-loop");
2292   Node *pre_opaq1 = pre_end->limit();
2293   assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2294   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2295   Node *lim0 = pre_opaq->in(1);
2296 
2297   // Where we put new limit calculations
2298   Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2299 
2300   // Ensure the original loop limit is available from the
2301   // pre-loop Opaque1 node.
2302   Node *orig_limit = pre_opaq->original_loop_limit();
2303   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2304 
2305   SWPointer align_to_ref_p(align_to_ref, this);
2306   assert(align_to_ref_p.valid(), "sanity");
2307 
2308   // Given:
2309   //     lim0 == original pre loop limit
2310   //     V == v_align (power of 2)
2311   //     invar == extra invariant piece of the address expression
2312   //     e == offset [ +/- invar ]
2313   //
2314   // When reassociating expressions involving '%' the basic rules are:
2315   //     (a - b) % k == 0   =>  a % k == b % k
2316   // and:
2317   //     (a + b) % k == 0   =>  a % k == (k - b) % k
2318   //
2319   // For stride > 0 && scale > 0,
2320   //   Derive the new pre-loop limit "lim" such that the two constraints:
2321   //     (1) lim = lim0 + N           (where N is some positive integer < V)
2322   //     (2) (e + lim) % V == 0
2323   //   are true.
2324   //
2325   //   Substituting (1) into (2),


2459 
2460 
2461 //------------------------------init---------------------------
2462 void SuperWord::init() {
2463   _dg.init();
2464   _packset.clear();
2465   _disjoint_ptrs.clear();
2466   _block.clear();
2467   _data_entry.clear();
2468   _mem_slice_head.clear();
2469   _mem_slice_tail.clear();
2470   _iteration_first.clear();
2471   _iteration_last.clear();
2472   _node_info.clear();
2473   _align_to_ref = NULL;
2474   _lpt = NULL;
2475   _lp = NULL;
2476   _bb = NULL;
2477   _iv = NULL;
2478   _race_possible = 0;

2479   _num_work_vecs = 0;
2480   _num_reductions = 0;
2481 }
2482 
//------------------------------restart---------------------------
// Reset the per-attempt analysis structures so the current loop can be
// re-analyzed. NOTE(review): unlike init(), the loop context fields
// (_lpt, _lp, _bb, _iv) and _align_to_ref are deliberately left intact here.
void SuperWord::restart() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}
2494 
2495 //------------------------------print_packset---------------------------
2496 void SuperWord::print_packset() {
2497 #ifndef PRODUCT
2498   tty->print_cr("packset");


// Debug-only helper: print a statement's computed alignment followed by a
// dump of the node itself. Compiled out in PRODUCT builds.
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
2535 
2536 //------------------------------blank---------------------------
2537 char* SuperWord::blank(uint depth) {
2538   static char blanks[101];
2539   assert(depth < 101, "too deep");
2540   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
2541   blanks[depth] = '\0';
2542   return blanks;
2543 }
2544 
2545 
2546 //==============================SWPointer===========================
2547 
2548 //----------------------------SWPointer------------------------
2549 SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
2550   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
2551   _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {


2552 
2553   Node* adr = mem->in(MemNode::Address);
2554   if (!adr->is_AddP()) {
2555     assert(!valid(), "too complex");
2556     return;
2557   }
2558   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
2559   Node* base = adr->in(AddPNode::Base);
2560   // The base address should be loop invariant
2561   if (!invariant(base)) {
2562     assert(!valid(), "base address is loop variant");
2563     return;
2564   }
2565   //unsafe reference could not be aligned appropriately without runtime checking
2566   if (base == NULL || base->bottom_type() == Type::TOP) {
2567     assert(!valid(), "unsafe access");
2568     return;
2569   }
2570   for (int i = 0; i < 3; i++) {
2571     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
2572       assert(!valid(), "too complex");
2573       return;
2574     }
2575     adr = adr->in(AddPNode::Address);
2576     if (base == adr || !adr->is_AddP()) {
2577       break; // stop looking at addp's
2578     }
2579   }
2580   _base = base;
2581   _adr  = adr;
2582   assert(valid(), "Usable");
2583 }
2584 
// Following is used to create a temporary object during
// the pattern match of an address expression. Only the memory op and
// SuperWord context are copied; the decomposed fields start out empty,
// so the temporary is !valid() until a match fills them in.
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}


2590 
2591 //------------------------scaled_iv_plus_offset--------------------
2592 // Match: k*iv + offset
2593 // where: k is a constant that maybe zero, and
2594 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
2595 bool SWPointer::scaled_iv_plus_offset(Node* n) {
2596   if (scaled_iv(n)) {
2597     return true;
2598   }
2599   if (offset_plus_k(n)) {
2600     return true;
2601   }
2602   int opc = n->Opcode();
2603   if (opc == Op_AddI) {
2604     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
2605       return true;
2606     }
2607     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2608       return true;
2609     }


2612       return true;
2613     }
2614     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2615       _scale *= -1;
2616       return true;
2617     }
2618   }
2619   return false;
2620 }
2621 
2622 //----------------------------scaled_iv------------------------
2623 // Match: k*iv where k is a constant that's not zero
2624 bool SWPointer::scaled_iv(Node* n) {
2625   if (_scale != 0) {
2626     return false;  // already found a scale
2627   }
2628   if (n == iv()) {
2629     _scale = 1;
2630     return true;
2631   }



2632   int opc = n->Opcode();
2633   if (opc == Op_MulI) {
2634     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2635       _scale = n->in(2)->get_int();
2636       return true;
2637     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
2638       _scale = n->in(1)->get_int();
2639       return true;
2640     }
2641   } else if (opc == Op_LShiftI) {
2642     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2643       _scale = 1 << n->in(2)->get_int();
2644       return true;
2645     }
2646   } else if (opc == Op_ConvI2L) {
2647     if (scaled_iv_plus_offset(n->in(1))) {
2648       return true;
2649     }
2650   } else if (opc == Op_LShiftL) {
2651     if (!has_iv() && _invar == NULL) {


2669 //----------------------------offset_plus_k------------------------
2670 // Match: offset is (k [+/- invariant])
2671 // where k maybe zero and invariant is optional, but not both.
2672 bool SWPointer::offset_plus_k(Node* n, bool negate) {
2673   int opc = n->Opcode();
2674   if (opc == Op_ConI) {
2675     _offset += negate ? -(n->get_int()) : n->get_int();
2676     return true;
2677   } else if (opc == Op_ConL) {
2678     // Okay if value fits into an int
2679     const TypeLong* t = n->find_long_type();
2680     if (t->higher_equal(TypeLong::INT)) {
2681       jlong loff = n->get_long();
2682       jint  off  = (jint)loff;
2683       _offset += negate ? -off : loff;
2684       return true;
2685     }
2686     return false;
2687   }
2688   if (_invar != NULL) return false; // already have an invariant



2689   if (opc == Op_AddI) {
2690     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2691       _negate_invar = negate;
2692       _invar = n->in(1);
2693       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2694       return true;
2695     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2696       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2697       _negate_invar = negate;
2698       _invar = n->in(2);
2699       return true;
2700     }
2701   }
2702   if (opc == Op_SubI) {
2703     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2704       _negate_invar = negate;
2705       _invar = n->in(1);
2706       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2707       return true;
2708     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {




  51   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  52   _block(arena(), 8,  0, NULL),           // nodes in current block
  53   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  54   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  55   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  56   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  57   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  58   _align_to_ref(NULL),                    // memory reference to align vectors to
  59   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  60   _dg(_arena),                            // dependence graph
  61   _visited(arena()),                      // visited node set
  62   _post_visited(arena()),                 // post visited node set
  63   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  64   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  65   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  66   _lpt(NULL),                             // loop tree node
  67   _lp(NULL),                              // LoopNode
  68   _bb(NULL),                              // basic block
  69   _iv(NULL),                              // induction var
  70   _race_possible(false),                  // cases where SDMU is true
  71   _early_return(true),                    // analysis evaluations routine
  72   _num_work_vecs(0),                      // amount of vector work we have
  73   _num_reductions(0),                     // amount of reduction work we have
  74   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  75   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  76   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  77   _ii_order(arena(), 8, 0, 0),
  78   _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
  79 {}
  80 
//------------------------------transform_loop---------------------------
// Entry point for superword (SLP) vectorization of one counted loop.
// Performs the legality bail-outs and per-loop setup; the actual packing
// pass (SLP_extract) runs only when do_optimization is true, allowing the
// setup to be reused for analysis-only callers.
void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  assert(UseSuperWord, "should be");
  // Do vectors exist on this architecture?
  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop

  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl_exit->in(0) != lpt->_head) return;

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
  // The Opaque1 must still be present so the pre-loop limit can later be
  // adjusted for alignment (see align_initial_loop_index).
  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
  if (pre_end == NULL) return;
  Node *pre_opaq1 = pre_end->limit();
  if (pre_opaq1->Opcode() != Op_Opaque1) return;

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  if (do_optimization) {
    assert(_packset.length() == 0, "packset must be empty");
    SLP_extract();
  }
}
 122 
//------------------------------early unrolling analysis------------------------------
// Pre-unrolling analysis: decide whether this counted loop looks SLP
// vectorizable and, if so, cap local_loop_unroll_factor (updated in place)
// at the largest vector size consistently supported for the types used in
// the loop body. When the body passes the structural scan, the loop is
// marked via mark_passed_slp() and the chosen factor is recorded with
// set_slp_max_unroll().
void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
  bool is_slp = true;
  ResourceMark rm;
  // ignored_loop_nodes[i] != -1 marks body node i as excluded from the
  // vector-size scan below (it holds the node's _idx once excluded).
  size_t ignored_size = lpt()->_body.size();
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
  Node *cl_exit = cl->loopexit();

  // First clear the entries
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

  // Start from the widest vector the machine supports for T_INT and
  // narrow it as more constrained types are found.
  int max_vector = Matcher::max_vector_size(T_INT);

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node* n = lpt()->_body.at(i);
    // Skip nodes that should not constrain the vector-size scan:
    // iv increment, reductions, address math, compares, loop control.
    if (n == cl->incr() ||
      n->is_reduction() ||
      n->is_AddP() ||
      n->is_Cmp() ||
      n->is_IfTrue() ||
      n->is_CountedLoop() ||
      (n == cl_exit)) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    // A profiled If that is a loop exit is also ignored.
    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
        if (lpt()->is_loop_exit(iff)) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
      }
    }

    // A memory Phi whose backedge value is not itself a memory node means
    // the memory slice is too complex for this analysis: give up.
    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          is_slp = false;
          break;
        }
      }
    }

    // This must happen after check of phi/if
    if (n->is_Phi() || n->is_If()) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    // Atomic load/stores, merge-mems and non-CFG projections are not
    // vectorizable: give up on the whole loop.
    if (n->is_LoadStore() || n->is_MergeMem() ||
      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
      is_slp = false;
      break;
    }

    if (n->is_Mem()) {
      Node* adr = n->in(MemNode::Address);
      Node* n_ctrl = _phase->get_ctrl(adr);

      // save a queue of post process nodes
      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
        MemNode* current = n->as_Mem();
        BasicType bt = current->memory_type();
        // Non-primitive memory ops are excluded rather than failing SLP.
        if (is_java_primitive(bt) == false) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }

        // Process the memory expression
        int stack_idx = 0;
        bool have_side_effects = true;
        if (adr->is_AddP() == false) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
          // (SWPointer in analyze_only mode pushes non-invariant
          // address components instead of failing the match).
          SWPointer p1(current, this, &nstack, true);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack: every body node that feeds the
        // address expression is excluded from the vector-size scan.
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
          for (uint j = 0; j < lpt()->_body.size(); j++) {
            Node* cur_node = lpt()->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
            }
          }
          nstack.pop();
          have_side_effects = nstack.is_nonempty();
        }
      }
    }
  }

  if (is_slp) {
    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    for (uint i = 0; i < lpt()->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
      Node* n = lpt()->_body.at(i);
      // Stores are sized by the memory type; everything else by its
      // value type.
      if (n->is_Store()) {
        bt = n->as_Mem()->memory_type();
      }
      else {
        bt = n->bottom_type()->basic_type();
      }

      int cur_max_vector = Matcher::max_vector_size(bt);

      // If a max vector exists which is not larger than _local_loop_unroll_factor
      // stop looking, we already have the max vector to map to.
      if (cur_max_vector <= local_loop_unroll_factor) {
        is_slp = false;
#ifndef PRODUCT
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
        }
#endif
        break;
      }

      // Map the maximal common vector
      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
        if (cur_max_vector < max_vector) {
          max_vector = cur_max_vector;
        }
      }
    }
    if (is_slp) {
      local_loop_unroll_factor = max_vector;
    }
    // NOTE(review): mark_passed_slp() runs even when the vector-size scan
    // above reset is_slp — confirm the loop should be marked in that case.
    cl->mark_passed_slp();
    cl->set_slp_max_unroll(local_loop_unroll_factor);
  }
}
 270 
 271 //------------------------------SLP_extract---------------------------
 272 // Extract the superword level parallelism
 273 //
 274 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 275 //    this list from first to last, all definitions are visited before their uses.
 276 //
 277 // 2) A point-to-point dependence graph is constructed between memory references.
 278 //    This simplifies the upcoming "independence" checker.
 279 //
 280 // 3) The maximum depth in the node graph from the beginning of the block
 281 //    to each node is computed.  This is used to prune the graph search
 282 //    in the independence checker.
 283 //
 284 // 4) For integer types, the necessary bit width is propagated backwards
 285 //    from stores to allow packed operations on byte, char, and short
 286 //    integers.  This reverses the promotion to type "int" that javac
 287 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 288 //


 402 
 403   Node_List align_to_refs;
 404   int best_iv_adjustment = 0;
 405   MemNode* best_align_to_mem_ref = NULL;
 406 
 407   while (memops.size() != 0) {
 408     // Find a memory reference to align to.
 409     MemNode* mem_ref = find_align_to_ref(memops);
 410     if (mem_ref == NULL) break;
 411     align_to_refs.push(mem_ref);
 412     int iv_adjustment = get_iv_adjustment(mem_ref);
 413 
 414     if (best_align_to_mem_ref == NULL) {
 415       // Set memory reference which is the best from all memory operations
 416       // to be used for alignment. The pre-loop trip count is modified to align
 417       // this reference to a vector-aligned address.
 418       best_align_to_mem_ref = mem_ref;
 419       best_iv_adjustment = iv_adjustment;
 420     }
 421 
 422     SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 423     // Set alignment relative to "align_to_ref" for all related memory operations.
 424     for (int i = memops.size() - 1; i >= 0; i--) {
 425       MemNode* s = memops.at(i)->as_Mem();
 426       if (isomorphic(s, mem_ref)) {
 427         SWPointer p2(s, this, NULL, false);
 428         if (p2.comparable(align_to_ref_p)) {
 429           int align = memory_alignment(s, iv_adjustment);
 430           set_alignment(s, align);
 431         }
 432       }
 433     }
 434 
 435     // Create initial pack pairs of memory operations for which
 436     // alignment is set and vectors will be aligned.
 437     bool create_pack = true;
 438     if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
 439       if (!Matcher::misaligned_vectors_ok()) {
 440         int vw = vector_width(mem_ref);
 441         int vw_best = vector_width(best_align_to_mem_ref);
 442         if (vw > vw_best) {
 443           // Do not vectorize a memory access with more elements per vector
 444           // if unaligned memory access is not allowed because number of
 445           // iterations in pre-loop will be not enough to align it.
 446           create_pack = false;
 447         } else {
 448           SWPointer p2(best_align_to_mem_ref, this, NULL, false);
 449           if (align_to_ref_p.invar() != p2.invar()) {
 450             // Do not vectorize memory accesses with different invariants
 451             // if unaligned memory accesses are not allowed.
 452             create_pack = false;
 453           }
 454         }
 455       }
 456     } else {
 457       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 458         // Can't allow vectorization of unaligned memory accesses with the
 459         // same type since it could be overlapped accesses to the same array.
 460         create_pack = false;
 461       } else {
 462         // Allow independent (different type) unaligned memory operations
 463         // if HW supports them.
 464         if (!Matcher::misaligned_vectors_ok()) {
 465           create_pack = false;
 466         } else {
 467           // Check if packs of the same memory type but
 468           // with a different alignment were created before.


 545   set_align_to_ref(best_align_to_mem_ref);
 546 
 547 #ifndef PRODUCT
 548   if (TraceSuperWord) {
 549     tty->print_cr("\nAfter find_adjacent_refs");
 550     print_packset();
 551   }
 552 #endif
 553 }
 554 
 555 //------------------------------find_align_to_ref---------------------------
 556 // Find a memory reference to align the loop induction variable to.
 557 // Looks first at stores then at loads, looking for a memory reference
 558 // with the largest number of references similar to it.
 559 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 560   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 561 
 562   // Count number of comparable memory ops
 563   for (uint i = 0; i < memops.size(); i++) {
 564     MemNode* s1 = memops.at(i)->as_Mem();
 565     SWPointer p1(s1, this, NULL, false);
 566     // Discard if pre loop can't align this reference
 567     if (!ref_is_alignable(p1)) {
 568       *cmp_ct.adr_at(i) = 0;
 569       continue;
 570     }
 571     for (uint j = i+1; j < memops.size(); j++) {
 572       MemNode* s2 = memops.at(j)->as_Mem();
 573       if (isomorphic(s1, s2)) {
 574         SWPointer p2(s2, this, NULL, false);
 575         if (p1.comparable(p2)) {
 576           (*cmp_ct.adr_at(i))++;
 577           (*cmp_ct.adr_at(j))++;
 578         }
 579       }
 580     }
 581   }
 582 
 583   // Find Store (or Load) with the greatest number of "comparable" references,
 584   // biggest vector size, smallest data size and smallest iv offset.
 585   int max_ct        = 0;
 586   int max_vw        = 0;
 587   int max_idx       = -1;
 588   int min_size      = max_jint;
 589   int min_iv_offset = max_jint;
 590   for (uint j = 0; j < memops.size(); j++) {
 591     MemNode* s = memops.at(j)->as_Mem();
 592     if (s->is_Store()) {
 593       int vw = vector_width_in_bytes(s);
 594       assert(vw > 1, "sanity");
 595       SWPointer p(s, this, NULL, false);
 596       if (cmp_ct.at(j) >  max_ct ||
 597           cmp_ct.at(j) == max_ct &&
 598             (vw >  max_vw ||
 599              vw == max_vw &&
 600               (data_size(s) <  min_size ||
 601                data_size(s) == min_size &&
 602                  (p.offset_in_bytes() < min_iv_offset)))) {
 603         max_ct = cmp_ct.at(j);
 604         max_vw = vw;
 605         max_idx = j;
 606         min_size = data_size(s);
 607         min_iv_offset = p.offset_in_bytes();
 608       }
 609     }
 610   }
 611   // If no stores, look at loads
 612   if (max_ct == 0) {
 613     for (uint j = 0; j < memops.size(); j++) {
 614       MemNode* s = memops.at(j)->as_Mem();
 615       if (s->is_Load()) {
 616         int vw = vector_width_in_bytes(s);
 617         assert(vw > 1, "sanity");
 618         SWPointer p(s, this, NULL, false);
 619         if (cmp_ct.at(j) >  max_ct ||
 620             cmp_ct.at(j) == max_ct &&
 621               (vw >  max_vw ||
 622                vw == max_vw &&
 623                 (data_size(s) <  min_size ||
 624                  data_size(s) == min_size &&
 625                    (p.offset_in_bytes() < min_iv_offset)))) {
 626           max_ct = cmp_ct.at(j);
 627           max_vw = vw;
 628           max_idx = j;
 629           min_size = data_size(s);
 630           min_iv_offset = p.offset_in_bytes();
 631         }
 632       }
 633     }
 634   }
 635 
 636 #ifdef ASSERT
 637   if (TraceSuperWord && Verbose) {
 638     tty->print_cr("\nVector memops after find_align_to_ref");


 709     } else if (span % vw == 0) {
 710       // If span is a multiple of vw, we can simplify formula (1) to:
 711       //   (init_offset + i * span) % vw == 0
 712       //     =>
 713       //   (init_offset % vw) + ((i * span) % vw) == 0
 714       //     =>
 715       //   init_offset % vw == 0
 716       //
 717       // Because we add a multiple of vw to the initial offset, the final
 718       // offset is a multiple of vw if and only if init_offset is a multiple.
 719       //
 720       return (init_offset % vw) == 0;
 721     }
 722   }
 723   return false;
 724 }
 725 
 726 //---------------------------get_iv_adjustment---------------------------
 727 // Calculate loop's iv adjustment for this memory ops.
 728 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
 729   SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 730   int offset = align_to_ref_p.offset_in_bytes();
 731   int scale  = align_to_ref_p.scale_in_bytes();
 732   int elt_size = align_to_ref_p.memory_size();
 733   int vw       = vector_width_in_bytes(mem_ref);
 734   assert(vw > 1, "sanity");
 735   int iv_adjustment;
 736   if (scale != 0) {
 737     int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
 738     // At least one iteration is executed in pre-loop by default. As result
 739     // several iterations are needed to align memory operations in main-loop even
 740     // if offset is 0.
 741     int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
 742     assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
 743            err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
 744     iv_adjustment = iv_adjustment_in_bytes/elt_size;
 745   } else {
 746     // This memory op is not dependent on iv (scale == 0)
 747     iv_adjustment = 0;
 748   }
 749 


 783         _nlist.at(j)->dump();
 784       }
 785     }
 786 #endif
 787     // Make the slice dependent on the root
 788     DepMem* slice = _dg.dep(n);
 789     _dg.make_edge(_dg.root(), slice);
 790 
 791     // Create a sink for the slice
 792     DepMem* slice_sink = _dg.make_node(NULL);
 793     _dg.make_edge(slice_sink, _dg.tail());
 794 
 795     // Now visit each pair of memory ops, creating the edges
 796     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 797       Node* s1 = _nlist.at(j);
 798 
 799       // If no dependency yet, use slice
 800       if (_dg.dep(s1)->in_cnt() == 0) {
 801         _dg.make_edge(slice, s1);
 802       }
 803       SWPointer p1(s1->as_Mem(), this, NULL, false);
 804       bool sink_dependent = true;
 805       for (int k = j - 1; k >= 0; k--) {
 806         Node* s2 = _nlist.at(k);
 807         if (s1->is_Load() && s2->is_Load())
 808           continue;
 809         SWPointer p2(s2->as_Mem(), this, NULL, false);
 810 
 811         int cmp = p1.cmp(p2);
 812         if (SuperWordRTDepCheck &&
 813             p1.base() != p2.base() && p1.valid() && p2.valid()) {
 814           // Create a runtime check to disambiguate
 815           OrderedPair pp(p1.base(), p2.base());
 816           _disjoint_ptrs.append_if_missing(pp);
 817         } else if (!SWPointer::not_equal(cmp)) {
 818           // Possibly same address
 819           _dg.make_edge(s1, s2);
 820           sink_dependent = false;
 821         }
 822       }
 823       if (sink_dependent) {
 824         _dg.make_edge(s1, slice_sink);
 825       }
 826     }
 827 #ifndef PRODUCT
 828     if (TraceSuperWord) {
 829       tty->print_cr("\nDependence graph for slice: %d", n->_idx);


 929   return false;
 930 }
 931 
 932 //------------------------------are_adjacent_refs---------------------------
 933 // Is s1 immediately before s2 in memory?
 934 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
 935   if (!s1->is_Mem() || !s2->is_Mem()) return false;
 936   if (!in_bb(s1)    || !in_bb(s2))    return false;
 937 
 938   // Do not use superword for non-primitives
 939   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
 940       !is_java_primitive(s2->as_Mem()->memory_type())) {
 941     return false;
 942   }
 943 
 944   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
 945   // only pack memops that are in the same alias set until that's fixed.
 946   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
 947       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
 948     return false;
 949   SWPointer p1(s1->as_Mem(), this, NULL, false);
 950   SWPointer p2(s2->as_Mem(), this, NULL, false);
 951   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
 952   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
 953   return diff == data_size(s1);
 954 }
 955 
 956 //------------------------------isomorphic---------------------------
 957 // Are s1 and s2 similar?
 958 bool SuperWord::isomorphic(Node* s1, Node* s2) {
 959   if (s1->Opcode() != s2->Opcode()) return false;
 960   if (s1->req() != s2->req()) return false;
 961   if (s1->in(0) != s2->in(0)) return false;
 962   if (!same_velt_type(s1, s2)) return false;
 963   return true;
 964 }
 965 
 966 //------------------------------independent---------------------------
 967 // Is there no data path from s1 to s2 or s2 to s1?
 968 bool SuperWord::independent(Node* s1, Node* s2) {
 969   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
 970   int d1 = depth(s1);


1749   // Insert extract (unpack) operations for scalar uses
1750   for (int i = 0; i < _packset.length(); i++) {
1751     insert_extracts(_packset.at(i));
1752   }
1753 
1754   Compile* C = _phase->C;
1755   uint max_vlen_in_bytes = 0;
1756   for (int i = 0; i < _block.length(); i++) {
1757     Node* n = _block.at(i);
1758     Node_List* p = my_pack(n);
1759     if (p && n == executed_last(p)) {
1760       uint vlen = p->size();
1761       uint vlen_in_bytes = 0;
1762       Node* vn = NULL;
1763       Node* low_adr = p->at(0);
1764       Node* first   = executed_first(p);
1765       int   opc = n->Opcode();
1766       if (n->is_Load()) {
1767         Node* ctl = n->in(MemNode::Control);
1768         Node* mem = first->in(MemNode::Memory);
1769         SWPointer p1(n->as_Mem(), this, NULL, false);
1770         // Identify the memory dependency for the new loadVector node by
1771         // walking up through memory chain.
1772         // This is done to give flexibility to the new loadVector node so that
1773         // it can move above independent storeVector nodes.
1774         while (mem->is_StoreVector()) {
1775           SWPointer p2(mem->as_Mem(), this, NULL, false);
1776           int cmp = p1.cmp(p2);
1777           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
1778             mem = mem->in(MemNode::Memory);
1779           } else {
1780             break; // dependent memory
1781           }
1782         }
1783         Node* adr = low_adr->in(MemNode::Address);
1784         const TypePtr* atyp = n->adr_type();
1785         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1786         vlen_in_bytes = vn->as_LoadVector()->memory_size();
1787       } else if (n->is_Store()) {
1788         // Promote value to be stored to vector
1789         Node* val = vector_opd(p, MemNode::ValueIn);
1790         Node* ctl = n->in(MemNode::Control);
1791         Node* mem = first->in(MemNode::Memory);
1792         Node* adr = low_adr->in(MemNode::Address);
1793         const TypePtr* atyp = n->adr_type();
1794         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
1795         vlen_in_bytes = vn->as_StoreVector()->memory_size();


2272           }
2273         }
2274       }
2275     }
2276   }
2277 #ifndef PRODUCT
2278   if (TraceSuperWord && Verbose) {
2279     for (int i = 0; i < _block.length(); i++) {
2280       Node* n = _block.at(i);
2281       velt_type(n)->dump();
2282       tty->print("\t");
2283       n->dump();
2284     }
2285   }
2286 #endif
2287 }
2288 
2289 //------------------------------memory_alignment---------------------------
2290 // Alignment within a vector memory reference
2291 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2292   SWPointer p(s, this, NULL, false);
2293   if (!p.valid()) {
2294     return bottom_align;
2295   }
2296   int vw = vector_width_in_bytes(s);
2297   if (vw < 2) {
2298     return bottom_align; // No vectors for this type
2299   }
2300   int offset  = p.offset_in_bytes();
2301   offset     += iv_adjust*p.memory_size();
2302   int off_rem = offset % vw;
2303   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2304   return off_mod;
2305 }
2306 
2307 //---------------------------container_type---------------------------
2308 // Smallest type containing range of values
2309 const Type* SuperWord::container_type(Node* n) {
2310   if (n->is_Mem()) {
2311     BasicType bt = n->as_Mem()->memory_type();
2312     if (n->is_Store() && (bt == T_CHAR)) {


2436 // to align_to_ref will be a position zero in the vector.
2437 //   (iv + k) mod vector_align == 0
2438 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2439   CountedLoopNode *main_head = lp()->as_CountedLoop();
2440   assert(main_head->is_main_loop(), "");
2441   CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2442   assert(pre_end != NULL, "we must have a correct pre-loop");
2443   Node *pre_opaq1 = pre_end->limit();
2444   assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2445   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2446   Node *lim0 = pre_opaq->in(1);
2447 
2448   // Where we put new limit calculations
2449   Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2450 
2451   // Ensure the original loop limit is available from the
2452   // pre-loop Opaque1 node.
2453   Node *orig_limit = pre_opaq->original_loop_limit();
2454   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2455 
2456   SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
2457   assert(align_to_ref_p.valid(), "sanity");
2458 
2459   // Given:
2460   //     lim0 == original pre loop limit
2461   //     V == v_align (power of 2)
2462   //     invar == extra invariant piece of the address expression
2463   //     e == offset [ +/- invar ]
2464   //
2465   // When reassociating expressions involving '%' the basic rules are:
2466   //     (a - b) % k == 0   =>  a % k == b % k
2467   // and:
2468   //     (a + b) % k == 0   =>  a % k == (k - b) % k
2469   //
2470   // For stride > 0 && scale > 0,
2471   //   Derive the new pre-loop limit "lim" such that the two constraints:
2472   //     (1) lim = lim0 + N           (where N is some positive integer < V)
2473   //     (2) (e + lim) % V == 0
2474   //   are true.
2475   //
2476   //   Substituting (1) into (2),


2610 
2611 
2612 //------------------------------init---------------------------
2613 void SuperWord::init() {
2614   _dg.init();
2615   _packset.clear();
2616   _disjoint_ptrs.clear();
2617   _block.clear();
2618   _data_entry.clear();
2619   _mem_slice_head.clear();
2620   _mem_slice_tail.clear();
2621   _iteration_first.clear();
2622   _iteration_last.clear();
2623   _node_info.clear();
2624   _align_to_ref = NULL;
2625   _lpt = NULL;
2626   _lp = NULL;
2627   _bb = NULL;
2628   _iv = NULL;
2629   _race_possible = 0;
2630   _early_return = false;
2631   _num_work_vecs = 0;
2632   _num_reductions = 0;
2633 }
2634 
//------------------------------restart---------------------------
// Partial reset: clear the structures that accumulate per-attempt
// results so analysis can be retried on the same loop. Unlike init(),
// this keeps the loop context (_lpt, _lp, _bb, _iv) and the counters.
void SuperWord::restart() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}
2646 
2647 //------------------------------print_packset---------------------------
2648 void SuperWord::print_packset() {
2649 #ifndef PRODUCT
2650   tty->print_cr("packset");


// Debug tracing: print one statement with its alignment, then dump the
// node itself. Compiled out in product builds.
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
2687 
2688 //------------------------------blank---------------------------
2689 char* SuperWord::blank(uint depth) {
2690   static char blanks[101];
2691   assert(depth < 101, "too deep");
2692   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
2693   blanks[depth] = '\0';
2694   return blanks;
2695 }
2696 
2697 
2698 //==============================SWPointer===========================
2699 
2700 //----------------------------SWPointer------------------------
2701 SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
2702   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
2703   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
2704   _nstack(nstack), _analyze_only(analyze_only),
2705   _stack_idx(0) {
2706 
2707   Node* adr = mem->in(MemNode::Address);
2708   if (!adr->is_AddP()) {
2709     assert(!valid(), "too complex");
2710     return;
2711   }
2712   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
2713   Node* base = adr->in(AddPNode::Base);
2714   // The base address should be loop invariant
2715   if (!invariant(base)) {
2716     assert(!valid(), "base address is loop variant");
2717     return;
2718   }
2719   //unsafe reference could not be aligned appropriately without runtime checking
2720   if (base == NULL || base->bottom_type() == Type::TOP) {
2721     assert(!valid(), "unsafe access");
2722     return;
2723   }
2724   for (int i = 0; i < 3; i++) {
2725     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
2726       assert(!valid(), "too complex");
2727       return;
2728     }
2729     adr = adr->in(AddPNode::Address);
2730     if (base == adr || !adr->is_AddP()) {
2731       break; // stop looking at addp's
2732     }
2733   }
2734   _base = base;
2735   _adr  = adr;
2736   assert(valid(), "Usable");
2737 }
2738 
// Following is used to create a temporary object during
// the pattern match of an address expression.
// It copies the matching context (memory node, SuperWord instance,
// node stack and analyze-only flag) but resets the match results
// (_base, _adr, _scale, _offset, _invar) to their initial state.
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
  _stack_idx(p->_stack_idx) {}
2746 
2747 //------------------------scaled_iv_plus_offset--------------------
2748 // Match: k*iv + offset
2749 // where: k is a constant that maybe zero, and
2750 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
2751 bool SWPointer::scaled_iv_plus_offset(Node* n) {
2752   if (scaled_iv(n)) {
2753     return true;
2754   }
2755   if (offset_plus_k(n)) {
2756     return true;
2757   }
2758   int opc = n->Opcode();
2759   if (opc == Op_AddI) {
2760     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
2761       return true;
2762     }
2763     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2764       return true;
2765     }


2768       return true;
2769     }
2770     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2771       _scale *= -1;
2772       return true;
2773     }
2774   }
2775   return false;
2776 }
2777 
2778 //----------------------------scaled_iv------------------------
2779 // Match: k*iv where k is a constant that's not zero
2780 bool SWPointer::scaled_iv(Node* n) {
2781   if (_scale != 0) {
2782     return false;  // already found a scale
2783   }
2784   if (n == iv()) {
2785     _scale = 1;
2786     return true;
2787   }
2788   if (_analyze_only && (invariant(n) == false)) {
2789     _nstack->push(n, _stack_idx++);
2790   }
2791   int opc = n->Opcode();
2792   if (opc == Op_MulI) {
2793     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2794       _scale = n->in(2)->get_int();
2795       return true;
2796     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
2797       _scale = n->in(1)->get_int();
2798       return true;
2799     }
2800   } else if (opc == Op_LShiftI) {
2801     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2802       _scale = 1 << n->in(2)->get_int();
2803       return true;
2804     }
2805   } else if (opc == Op_ConvI2L) {
2806     if (scaled_iv_plus_offset(n->in(1))) {
2807       return true;
2808     }
2809   } else if (opc == Op_LShiftL) {
2810     if (!has_iv() && _invar == NULL) {


2828 //----------------------------offset_plus_k------------------------
2829 // Match: offset is (k [+/- invariant])
2830 // where k maybe zero and invariant is optional, but not both.
2831 bool SWPointer::offset_plus_k(Node* n, bool negate) {
2832   int opc = n->Opcode();
2833   if (opc == Op_ConI) {
2834     _offset += negate ? -(n->get_int()) : n->get_int();
2835     return true;
2836   } else if (opc == Op_ConL) {
2837     // Okay if value fits into an int
2838     const TypeLong* t = n->find_long_type();
2839     if (t->higher_equal(TypeLong::INT)) {
2840       jlong loff = n->get_long();
2841       jint  off  = (jint)loff;
2842       _offset += negate ? -off : loff;
2843       return true;
2844     }
2845     return false;
2846   }
2847   if (_invar != NULL) return false; // already have an invariant
2848   if (_analyze_only && (invariant(n) == false)) {
2849     _nstack->push(n, _stack_idx++);
2850   }
2851   if (opc == Op_AddI) {
2852     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2853       _negate_invar = negate;
2854       _invar = n->in(1);
2855       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2856       return true;
2857     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2858       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2859       _negate_invar = negate;
2860       _invar = n->in(2);
2861       return true;
2862     }
2863   }
2864   if (opc == Op_SubI) {
2865     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2866       _negate_invar = negate;
2867       _invar = n->in(1);
2868       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2869       return true;
2870     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {


< prev index next >