New src/share/vm/opto/superword.cpp

   1 /*
   2  * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "compiler/compileLog.hpp"
  26 #include "libadt/vectset.hpp"
  27 #include "memory/allocation.inline.hpp"
  28 #include "opto/addnode.hpp"
  29 #include "opto/callnode.hpp"
  30 #include "opto/castnode.hpp"
  31 #include "opto/convertnode.hpp"
  32 #include "opto/divnode.hpp"
  33 #include "opto/matcher.hpp"
  34 #include "opto/memnode.hpp"
  35 #include "opto/mulnode.hpp"
  36 #include "opto/opcodes.hpp"
  37 #include "opto/opaquenode.hpp"
  38 #include "opto/superword.hpp"
  39 #include "opto/vectornode.hpp"
  40 
  41 //
  42 //                  S U P E R W O R D   T R A N S F O R M
  43 //=============================================================================
  44 
  45 //------------------------------SuperWord---------------------------
  46 SuperWord::SuperWord(PhaseIdealLoop* phase) :
  47   _phase(phase),
  48   _igvn(phase->_igvn),
  49   _arena(phase->C->comp_arena()),
  50   _packset(arena(), 8,  0, NULL),         // packs for the current block
  51   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  52   _block(arena(), 8,  0, NULL),           // nodes in current block
  53   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  54   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  55   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  56   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  57   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  58   _align_to_ref(NULL),                    // memory reference to align vectors to
  59   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  60   _dg(_arena),                            // dependence graph
  61   _visited(arena()),                      // visited node set
  62   _post_visited(arena()),                 // post visited node set
  63   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  64   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  65   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  66   _lpt(NULL),                             // loop tree node
  67   _lp(NULL),                              // LoopNode
  68   _bb(NULL),                              // basic block
  69   _iv(NULL),                              // induction var
  70   _race_possible(false),                  // cases where SDMU is true
  71   _early_return(true),                    // analysis evaluations routine
  72   _num_work_vecs(0),                      // amount of vector work we have
  73   _num_reductions(0),                     // amount of reduction work we have
  74   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  75   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  76   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  77   _ii_order(arena(), 8, 0, 0),
  78   _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
  79 {}
  80 
  81 //------------------------------transform_loop---------------------------
  82 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  83   assert(UseSuperWord, "should be");
  84   // Do vectors exist on this architecture?
  85   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  86 
  87   assert(lpt->_head->is_CountedLoop(), "must be");
  88   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
  89 
  90   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
  91 
  92   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
  93 
  94   // Check for no control flow in body (other than exit)
  95   Node *cl_exit = cl->loopexit();
  96   if (cl_exit->in(0) != lpt->_head) return;
  97 
  98   // Make sure the are no extra control users of the loop backedge
  99   if (cl->back_control()->outcnt() != 1) {
 100     return;
 101   }
 102 
 103   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 104   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 105   if (pre_end == NULL) return;
 106   Node *pre_opaq1 = pre_end->limit();
 107   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 108 
 109   init(); // initialize data structures
 110 
 111   set_lpt(lpt);
 112   set_lp(cl);
 113 
 114   // For now, define one block which is the entire loop body
 115   set_bb(cl);
 116 
 117   if (do_optimization) {
 118     assert(_packset.length() == 0, "packset must be empty");
 119     SLP_extract();
 120   }
 121 }
 122 
 123 //------------------------------early unrolling analysis------------------------------
 124 void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
 125   bool not_slp = false;
 126   ResourceMark rm;
 127   size_t ignored_size = lpt()->_body.size();
 128   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 129   Node_Stack nstack((int)ignored_size);
 130   Node *cl_exit = cl->loopexit();
 131 
 132   // First clear the entries
 133   for (uint i = 0; i < lpt()->_body.size(); i++) {
 134     ignored_loop_nodes[i] = -1;
 135   }
 136 
 137   int max_vector = Matcher::max_vector_size(T_INT);
 138 
 139   // Process the loop, some/all of the stack entries will not be in order, ergo
 140   // need to preprocess the ignored initial state before we process the loop
 141   for (uint i = 0; i < lpt()->_body.size(); i++) {
 142     Node* n = lpt()->_body.at(i);
 143     if (n == cl->incr() ||
 144       n->is_reduction() ||
 145       n->is_AddP() ||
 146       n->is_Cmp() ||
 147       n->is_IfTrue() ||
 148       n->is_CountedLoop() ||
 149       (n == cl_exit)) {
 150       ignored_loop_nodes[i] = n->_idx;
 151       continue;
 152     }
 153 
 154     if (n->is_If()) {
 155       IfNode *iff = n->as_If();
 156       if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
 157         if (lpt()->is_loop_exit(iff)) {
 158           ignored_loop_nodes[i] = n->_idx;
 159           continue;
 160         }
 161       }
 162     }
 163 
 164     if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
 165       Node* n_tail = n->in(LoopNode::LoopBackControl);
 166       if (n_tail != n->in(LoopNode::EntryControl)) {
 167         if (!n_tail->is_Mem()) {
 168           not_slp = true;
 169           break;
 170         }
 171       }
 172     }
 173 
 174     // This must happen after check of phi/if
 175     if (n->is_Phi() || n->is_If()) {
 176       ignored_loop_nodes[i] = n->_idx;
 177       continue;
 178     }
 179 
 180     if (n->is_LoadStore() || n->is_MergeMem() ||
 181       (n->is_Proj() && !n->as_Proj()->is_CFG())) {
 182       not_slp = true;
 183       break;
 184     }
 185 
 186     if (n->is_Mem()) {
 187       Node* adr = n->in(MemNode::Address);
 188       Node* n_ctrl = _phase->get_ctrl(adr);
 189 
 190       // save a queue of post process nodes
 191       if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
 192         MemNode* current = n->as_Mem();
 193         BasicType bt = current->memory_type();
 194         if (is_java_primitive(bt) == false) {
 195           ignored_loop_nodes[i] = n->_idx;
 196           continue;
 197         }
 198 
 199         // Process the memory expression
 200         int stack_idx = 0;
 201         bool have_side_effects = true;
 202         if (adr->is_AddP() == false) {
 203           nstack.push(adr, stack_idx++);
 204         }
 205         else {
 206           // Mark the components of the memory operation in nstack
 207           SWPointer p1(current, this, &nstack, true);
 208           have_side_effects = p1.node_stack()->is_nonempty();
 209         }
 210 
 211         // Process the pointer stack
 212         while (have_side_effects) {
 213           Node* pointer_node = nstack.node();
 214           for (uint j = 0; j < lpt()->_body.size(); j++) {
 215             Node* cur_node = lpt()->_body.at(j);
 216             if (cur_node == pointer_node) {
 217               ignored_loop_nodes[j] = cur_node->_idx;
 218               break;
 219             }
 220           }
 221           nstack.pop();
 222           have_side_effects = nstack.is_nonempty();
 223         }
 224       }
 225     }
 226   }
 227 
 228   if (not_slp == false) {
 229     // Now we try to find the maximum supported consistent vector which the machine
 230     // description can use
 231     for (uint i = 0; i < lpt()->_body.size(); i++) {
 232       if (ignored_loop_nodes[i] != -1) continue;
 233 
 234       BasicType bt;
 235       Node* n = lpt()->_body.at(i);
 236       if (n->is_Store()) {
 237         bt = n->as_Mem()->memory_type();
 238       }
 239       else {
 240         bt = n->bottom_type()->basic_type();
 241       }
 242 
 243       int cur_max_vector = Matcher::max_vector_size(bt);
 244 
 245       // If a max vector exists which is not larger than _local_loop_unroll_factor
 246       // stop looking, we already have the max vector to map to.
 247       if (cur_max_vector <= local_loop_unroll_factor) {
 248         not_slp = true;
 249 #ifndef PRODUCT
 250         if (TraceSuperWordLoopUnrollAnalysis) {
 251           tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
 252         }
 253 #endif
 254         break;
 255       }
 256 
 257       // Map the maximal common vector
 258       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 259         if (cur_max_vector < max_vector) {
 260           max_vector = cur_max_vector;
 261         }
 262       }
 263     }
 264     if (not_slp == false) {
 265       local_loop_unroll_factor = max_vector;
 266     }
 267     cl->mark_passed_slp();
 268     cl->set_slp_max_unroll(local_loop_unroll_factor);
 269   }
 270 }
 271 
 272 //------------------------------SLP_extract---------------------------
 273 // Extract the superword level parallelism
 274 //
 275 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 276 //    this list from first to last, all definitions are visited before their uses.
 277 //
 278 // 2) A point-to-point dependence graph is constructed between memory references.
 279 //    This simplies the upcoming "independence" checker.
 280 //
 281 // 3) The maximum depth in the node graph from the beginning of the block
 282 //    to each node is computed.  This is used to prune the graph search
 283 //    in the independence checker.
 284 //
 285 // 4) For integer types, the necessary bit width is propagated backwards
 286 //    from stores to allow packed operations on byte, char, and short
 287 //    integers.  This reverses the promotion to type "int" that javac
 288 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 289 //
 290 // 5) One of the memory references is picked to be an aligned vector reference.
 291 //    The pre-loop trip count is adjusted to align this reference in the
 292 //    unrolled body.
 293 //
 294 // 6) The initial set of pack pairs is seeded with memory references.
 295 //
 296 // 7) The set of pack pairs is extended by following use->def and def->use links.
 297 //
 298 // 8) The pairs are combined into vector sized packs.
 299 //
 300 // 9) Reorder the memory slices to co-locate members of the memory packs.
 301 //
 302 // 10) Generate ideal vector nodes for the final set of packs and where necessary,
 303 //    inserting scalar promotion, vector creation from multiple scalars, and
 304 //    extraction of scalar values from vectors.
 305 //
 306 void SuperWord::SLP_extract() {
 307 
 308 #ifndef PRODUCT
 309   if (_do_vector_loop && TraceSuperWord) {
 310     tty->print("SuperWord::SLP_extract\n");
 311     tty->print("input loop\n");
 312     _lpt->dump_head();
 313     _lpt->dump();
 314     for (uint i = 0; i < _lpt->_body.size(); i++) {
 315       _lpt->_body.at(i)->dump();
 316     }
 317   }
 318 #endif
 319   // Ready the block
 320   if (!construct_bb()) {
 321     return; // Exit if no interesting nodes or complex graph.
 322   }
 323   // build    _dg, _disjoint_ptrs
 324   dependence_graph();
 325 
 326   // compute function depth(Node*)
 327   compute_max_depth();
 328 
 329   if (_do_vector_loop) {
 330     if (mark_generations() != -1) {
 331       hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
 332 
 333       if (!construct_bb()) {
 334         return; // Exit if no interesting nodes or complex graph.
 335       }
 336       dependence_graph();
 337       compute_max_depth();
 338     }
 339 
 340 #ifndef PRODUCT
 341     if (TraceSuperWord) {
 342       tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
 343       _lpt->dump_head();
 344       for (int j = 0; j < _block.length(); j++) {
 345         Node* n = _block.at(j);
 346         int d = depth(n);
 347         for (int i = 0;  i < d; i++) tty->print("%s", "  ");
 348         tty->print("%d :", d);
 349         n->dump();
 350       }
 351     }
 352 #endif
 353   }
 354 
 355   compute_vector_element_type();
 356 
 357   // Attempt vectorization
 358 
 359   find_adjacent_refs();
 360 
 361   extend_packlist();
 362 
 363   if (_do_vector_loop) {
 364     if (_packset.length() == 0) {
 365 #ifndef PRODUCT
 366       if (TraceSuperWord) {
 367         tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
 368       }
 369 #endif
 370       pack_parallel();
 371     }
 372   }
 373 
 374   combine_packs();
 375 
 376   construct_my_pack_map();
 377 
 378   filter_packs();
 379 
 380   schedule();
 381 
 382   output();
 383 }
 384 
 385 //------------------------------find_adjacent_refs---------------------------
 386 // Find the adjacent memory references and create pack pairs for them.
 387 // This is the initial set of packs that will then be extended by
 388 // following use->def and def->use links.  The align positions are
 389 // assigned relative to the reference "align_to_ref"
 390 void SuperWord::find_adjacent_refs() {
 391   // Get list of memory operations
 392   Node_List memops;
 393   for (int i = 0; i < _block.length(); i++) {
 394     Node* n = _block.at(i);
 395     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 396         is_java_primitive(n->as_Mem()->memory_type())) {
 397       int align = memory_alignment(n->as_Mem(), 0);
 398       if (align != bottom_align) {
 399         memops.push(n);
 400       }
 401     }
 402   }
 403 
 404   Node_List align_to_refs;
 405   int best_iv_adjustment = 0;
 406   MemNode* best_align_to_mem_ref = NULL;
 407 
 408   while (memops.size() != 0) {
 409     // Find a memory reference to align to.
 410     MemNode* mem_ref = find_align_to_ref(memops);
 411     if (mem_ref == NULL) break;
 412     align_to_refs.push(mem_ref);
 413     int iv_adjustment = get_iv_adjustment(mem_ref);
 414 
 415     if (best_align_to_mem_ref == NULL) {
 416       // Set memory reference which is the best from all memory operations
 417       // to be used for alignment. The pre-loop trip count is modified to align
 418       // this reference to a vector-aligned address.
 419       best_align_to_mem_ref = mem_ref;
 420       best_iv_adjustment = iv_adjustment;
 421     }
 422 
 423     SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 424     // Set alignment relative to "align_to_ref" for all related memory operations.
 425     for (int i = memops.size() - 1; i >= 0; i--) {
 426       MemNode* s = memops.at(i)->as_Mem();
 427       if (isomorphic(s, mem_ref)) {
 428         SWPointer p2(s, this, NULL, false);
 429         if (p2.comparable(align_to_ref_p)) {
 430           int align = memory_alignment(s, iv_adjustment);
 431           set_alignment(s, align);
 432         }
 433       }
 434     }
 435 
 436     // Create initial pack pairs of memory operations for which
 437     // alignment is set and vectors will be aligned.
 438     bool create_pack = true;
 439     if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
 440       if (!Matcher::misaligned_vectors_ok()) {
 441         int vw = vector_width(mem_ref);
 442         int vw_best = vector_width(best_align_to_mem_ref);
 443         if (vw > vw_best) {
 444           // Do not vectorize a memory access with more elements per vector
 445           // if unaligned memory access is not allowed because number of
 446           // iterations in pre-loop will be not enough to align it.
 447           create_pack = false;
 448         } else {
 449           SWPointer p2(best_align_to_mem_ref, this, NULL, false);
 450           if (align_to_ref_p.invar() != p2.invar()) {
 451             // Do not vectorize memory accesses with different invariants
 452             // if unaligned memory accesses are not allowed.
 453             create_pack = false;
 454           }
 455         }
 456       }
 457     } else {
 458       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 459         // Can't allow vectorization of unaligned memory accesses with the
 460         // same type since it could be overlapped accesses to the same array.
 461         create_pack = false;
 462       } else {
 463         // Allow independent (different type) unaligned memory operations
 464         // if HW supports them.
 465         if (!Matcher::misaligned_vectors_ok()) {
 466           create_pack = false;
 467         } else {
 468           // Check if packs of the same memory type but
 469           // with a different alignment were created before.
 470           for (uint i = 0; i < align_to_refs.size(); i++) {
 471             MemNode* mr = align_to_refs.at(i)->as_Mem();
 472             if (same_velt_type(mr, mem_ref) &&
 473                 memory_alignment(mr, iv_adjustment) != 0)
 474               create_pack = false;
 475           }
 476         }
 477       }
 478     }
 479     if (create_pack) {
 480       for (uint i = 0; i < memops.size(); i++) {
 481         Node* s1 = memops.at(i);
 482         int align = alignment(s1);
 483         if (align == top_align) continue;
 484         for (uint j = 0; j < memops.size(); j++) {
 485           Node* s2 = memops.at(j);
 486           if (alignment(s2) == top_align) continue;
 487           if (s1 != s2 && are_adjacent_refs(s1, s2)) {
 488             if (stmts_can_pack(s1, s2, align)) {
 489               Node_List* pair = new Node_List();
 490               pair->push(s1);
 491               pair->push(s2);
 492               if (!_do_vector_loop || _clone_map.idx(s1->_idx) == _clone_map.idx(s2->_idx)) {
 493                 _packset.append(pair);
 494               }
 495             }
 496           }
 497         }
 498       }
 499     } else { // Don't create unaligned pack
 500       // First, remove remaining memory ops of the same type from the list.
 501       for (int i = memops.size() - 1; i >= 0; i--) {
 502         MemNode* s = memops.at(i)->as_Mem();
 503         if (same_velt_type(s, mem_ref)) {
 504           memops.remove(i);
 505         }
 506       }
 507 
 508       // Second, remove already constructed packs of the same type.
 509       for (int i = _packset.length() - 1; i >= 0; i--) {
 510         Node_List* p = _packset.at(i);
 511         MemNode* s = p->at(0)->as_Mem();
 512         if (same_velt_type(s, mem_ref)) {
 513           remove_pack_at(i);
 514         }
 515       }
 516 
 517       // If needed find the best memory reference for loop alignment again.
 518       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 519         // Put memory ops from remaining packs back on memops list for
 520         // the best alignment search.
 521         uint orig_msize = memops.size();
 522         for (int i = 0; i < _packset.length(); i++) {
 523           Node_List* p = _packset.at(i);
 524           MemNode* s = p->at(0)->as_Mem();
 525           assert(!same_velt_type(s, mem_ref), "sanity");
 526           memops.push(s);
 527         }
 528         MemNode* best_align_to_mem_ref = find_align_to_ref(memops);
 529         if (best_align_to_mem_ref == NULL) break;
 530         best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
 531         // Restore list.
 532         while (memops.size() > orig_msize)
 533           (void)memops.pop();
 534       }
 535     } // unaligned memory accesses
 536 
 537     // Remove used mem nodes.
 538     for (int i = memops.size() - 1; i >= 0; i--) {
 539       MemNode* m = memops.at(i)->as_Mem();
 540       if (alignment(m) != top_align) {
 541         memops.remove(i);
 542       }
 543     }
 544 
 545   } // while (memops.size() != 0
 546   set_align_to_ref(best_align_to_mem_ref);
 547 
 548 #ifndef PRODUCT
 549   if (TraceSuperWord) {
 550     tty->print_cr("\nAfter find_adjacent_refs");
 551     print_packset();
 552   }
 553 #endif
 554 }
 555 
 556 //------------------------------find_align_to_ref---------------------------
 557 // Find a memory reference to align the loop induction variable to.
 558 // Looks first at stores then at loads, looking for a memory reference
 559 // with the largest number of references similar to it.
 560 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 561   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 562 
 563   // Count number of comparable memory ops
 564   for (uint i = 0; i < memops.size(); i++) {
 565     MemNode* s1 = memops.at(i)->as_Mem();
 566     SWPointer p1(s1, this, NULL, false);
 567     // Discard if pre loop can't align this reference
 568     if (!ref_is_alignable(p1)) {
 569       *cmp_ct.adr_at(i) = 0;
 570       continue;
 571     }
 572     for (uint j = i+1; j < memops.size(); j++) {
 573       MemNode* s2 = memops.at(j)->as_Mem();
 574       if (isomorphic(s1, s2)) {
 575         SWPointer p2(s2, this, NULL, false);
 576         if (p1.comparable(p2)) {
 577           (*cmp_ct.adr_at(i))++;
 578           (*cmp_ct.adr_at(j))++;
 579         }
 580       }
 581     }
 582   }
 583 
 584   // Find Store (or Load) with the greatest number of "comparable" references,
 585   // biggest vector size, smallest data size and smallest iv offset.
 586   int max_ct        = 0;
 587   int max_vw        = 0;
 588   int max_idx       = -1;
 589   int min_size      = max_jint;
 590   int min_iv_offset = max_jint;
 591   for (uint j = 0; j < memops.size(); j++) {
 592     MemNode* s = memops.at(j)->as_Mem();
 593     if (s->is_Store()) {
 594       int vw = vector_width_in_bytes(s);
 595       assert(vw > 1, "sanity");
 596       SWPointer p(s, this, NULL, false);
 597       if (cmp_ct.at(j) >  max_ct ||
 598           cmp_ct.at(j) == max_ct &&
 599             (vw >  max_vw ||
 600              vw == max_vw &&
 601               (data_size(s) <  min_size ||
 602                data_size(s) == min_size &&
 603                  (p.offset_in_bytes() < min_iv_offset)))) {
 604         max_ct = cmp_ct.at(j);
 605         max_vw = vw;
 606         max_idx = j;
 607         min_size = data_size(s);
 608         min_iv_offset = p.offset_in_bytes();
 609       }
 610     }
 611   }
 612   // If no stores, look at loads
 613   if (max_ct == 0) {
 614     for (uint j = 0; j < memops.size(); j++) {
 615       MemNode* s = memops.at(j)->as_Mem();
 616       if (s->is_Load()) {
 617         int vw = vector_width_in_bytes(s);
 618         assert(vw > 1, "sanity");
 619         SWPointer p(s, this, NULL, false);
 620         if (cmp_ct.at(j) >  max_ct ||
 621             cmp_ct.at(j) == max_ct &&
 622               (vw >  max_vw ||
 623                vw == max_vw &&
 624                 (data_size(s) <  min_size ||
 625                  data_size(s) == min_size &&
 626                    (p.offset_in_bytes() < min_iv_offset)))) {
 627           max_ct = cmp_ct.at(j);
 628           max_vw = vw;
 629           max_idx = j;
 630           min_size = data_size(s);
 631           min_iv_offset = p.offset_in_bytes();
 632         }
 633       }
 634     }
 635   }
 636 
 637 #ifdef ASSERT
 638   if (TraceSuperWord && Verbose) {
 639     tty->print_cr("\nVector memops after find_align_to_ref");
 640     for (uint i = 0; i < memops.size(); i++) {
 641       MemNode* s = memops.at(i)->as_Mem();
 642       s->dump();
 643     }
 644   }
 645 #endif
 646 
 647   if (max_ct > 0) {
 648 #ifdef ASSERT
 649     if (TraceSuperWord) {
 650       tty->print("\nVector align to node: ");
 651       memops.at(max_idx)->as_Mem()->dump();
 652     }
 653 #endif
 654     return memops.at(max_idx)->as_Mem();
 655   }
 656   return NULL;
 657 }
 658 
 659 //------------------------------ref_is_alignable---------------------------
 660 // Can the preloop align the reference to position zero in the vector?
 661 bool SuperWord::ref_is_alignable(SWPointer& p) {
 662   if (!p.has_iv()) {
 663     return true;   // no induction variable
 664   }
 665   CountedLoopEndNode* pre_end = get_pre_loop_end(lp()->as_CountedLoop());
 666   assert(pre_end != NULL, "we must have a correct pre-loop");
 667   assert(pre_end->stride_is_con(), "pre loop stride is constant");
 668   int preloop_stride = pre_end->stride_con();
 669 
 670   int span = preloop_stride * p.scale_in_bytes();
 671   int mem_size = p.memory_size();
 672   int offset   = p.offset_in_bytes();
 673   // Stride one accesses are alignable if offset is aligned to memory operation size.
 674   // Offset can be unaligned when UseUnalignedAccesses is used.
 675   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
 676     return true;
 677   }
 678   // If the initial offset from start of the object is computable,
 679   // check if the pre-loop can align the final offset accordingly.
 680   //
 681   // In other words: Can we find an i such that the offset
 682   // after i pre-loop iterations is aligned to vw?
 683   //   (init_offset + pre_loop) % vw == 0              (1)
 684   // where
 685   //   pre_loop = i * span
 686   // is the number of bytes added to the offset by i pre-loop iterations.
 687   //
 688   // For this to hold we need pre_loop to increase init_offset by
 689   //   pre_loop = vw - (init_offset % vw)
 690   //
 691   // This is only possible if pre_loop is divisible by span because each
 692   // pre-loop iteration increases the initial offset by 'span' bytes:
 693   //   (vw - (init_offset % vw)) % span == 0
 694   //
 695   int vw = vector_width_in_bytes(p.mem());
 696   assert(vw > 1, "sanity");
 697   Node* init_nd = pre_end->init_trip();
 698   if (init_nd->is_Con() && p.invar() == NULL) {
 699     int init = init_nd->bottom_type()->is_int()->get_con();
 700     int init_offset = init * p.scale_in_bytes() + offset;
 701     assert(init_offset >= 0, "positive offset from object start");
 702     if (vw % span == 0) {
 703       // If vm is a multiple of span, we use formula (1).
 704       if (span > 0) {
 705         return (vw - (init_offset % vw)) % span == 0;
 706       } else {
 707         assert(span < 0, "nonzero stride * scale");
 708         return (init_offset % vw) % -span == 0;
 709       }
 710     } else if (span % vw == 0) {
 711       // If span is a multiple of vw, we can simplify formula (1) to:
 712       //   (init_offset + i * span) % vw == 0
 713       //     =>
 714       //   (init_offset % vw) + ((i * span) % vw) == 0
 715       //     =>
 716       //   init_offset % vw == 0
 717       //
 718       // Because we add a multiple of vw to the initial offset, the final
 719       // offset is a multiple of vw if and only if init_offset is a multiple.
 720       //
 721       return (init_offset % vw) == 0;
 722     }
 723   }
 724   return false;
 725 }
 726 
 727 //---------------------------get_iv_adjustment---------------------------
 728 // Calculate loop's iv adjustment for this memory ops.
 729 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
 730   SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 731   int offset = align_to_ref_p.offset_in_bytes();
 732   int scale  = align_to_ref_p.scale_in_bytes();
 733   int elt_size = align_to_ref_p.memory_size();
 734   int vw       = vector_width_in_bytes(mem_ref);
 735   assert(vw > 1, "sanity");
 736   int iv_adjustment;
 737   if (scale != 0) {
 738     int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
 739     // At least one iteration is executed in pre-loop by default. As result
 740     // several iterations are needed to align memory operations in main-loop even
 741     // if offset is 0.
 742     int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
 743     assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
 744            err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
 745     iv_adjustment = iv_adjustment_in_bytes/elt_size;
 746   } else {
 747     // This memory op is not dependent on iv (scale == 0)
 748     iv_adjustment = 0;
 749   }
 750 
 751 #ifndef PRODUCT
 752   if (TraceSuperWord)
 753     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
 754                   offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 755 #endif
 756   return iv_adjustment;
 757 }
 758 
 759 //---------------------------dependence_graph---------------------------
 760 // Construct dependency graph.
 761 // Add dependence edges to load/store nodes for memory dependence
 762 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 763 void SuperWord::dependence_graph() {
 764   // First, assign a dependence node to each memory node
 765   for (int i = 0; i < _block.length(); i++ ) {
 766     Node *n = _block.at(i);
 767     if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
 768       _dg.make_node(n);
 769     }
 770   }
 771 
 772   // For each memory slice, create the dependences
 773   for (int i = 0; i < _mem_slice_head.length(); i++) {
 774     Node* n      = _mem_slice_head.at(i);
 775     Node* n_tail = _mem_slice_tail.at(i);
 776 
 777     // Get slice in predecessor order (last is first)
 778     mem_slice_preds(n_tail, n, _nlist);
 779 
 780 #ifndef PRODUCT
 781     if(TraceSuperWord && Verbose) {
 782       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
 783       for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 784         _nlist.at(j)->dump();
 785       }
 786     }
 787 #endif
 788     // Make the slice dependent on the root
 789     DepMem* slice = _dg.dep(n);
 790     _dg.make_edge(_dg.root(), slice);
 791 
 792     // Create a sink for the slice
 793     DepMem* slice_sink = _dg.make_node(NULL);
 794     _dg.make_edge(slice_sink, _dg.tail());
 795 
 796     // Now visit each pair of memory ops, creating the edges
 797     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 798       Node* s1 = _nlist.at(j);
 799 
 800       // If no dependency yet, use slice
 801       if (_dg.dep(s1)->in_cnt() == 0) {
 802         _dg.make_edge(slice, s1);
 803       }
 804       SWPointer p1(s1->as_Mem(), this, NULL, false);
 805       bool sink_dependent = true;
 806       for (int k = j - 1; k >= 0; k--) {
 807         Node* s2 = _nlist.at(k);
 808         if (s1->is_Load() && s2->is_Load())
 809           continue;
 810         SWPointer p2(s2->as_Mem(), this, NULL, false);
 811 
 812         int cmp = p1.cmp(p2);
 813         if (SuperWordRTDepCheck &&
 814             p1.base() != p2.base() && p1.valid() && p2.valid()) {
 815           // Create a runtime check to disambiguate
 816           OrderedPair pp(p1.base(), p2.base());
 817           _disjoint_ptrs.append_if_missing(pp);
 818         } else if (!SWPointer::not_equal(cmp)) {
 819           // Possibly same address
 820           _dg.make_edge(s1, s2);
 821           sink_dependent = false;
 822         }
 823       }
 824       if (sink_dependent) {
 825         _dg.make_edge(s1, slice_sink);
 826       }
 827     }
 828 #ifndef PRODUCT
 829     if (TraceSuperWord) {
 830       tty->print_cr("\nDependence graph for slice: %d", n->_idx);
 831       for (int q = 0; q < _nlist.length(); q++) {
 832         _dg.print(_nlist.at(q));
 833       }
 834       tty->cr();
 835     }
 836 #endif
 837     _nlist.clear();
 838   }
 839 
 840 #ifndef PRODUCT
 841   if (TraceSuperWord) {
 842     tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
 843     for (int r = 0; r < _disjoint_ptrs.length(); r++) {
 844       _disjoint_ptrs.at(r).print();
 845       tty->cr();
 846     }
 847     tty->cr();
 848   }
 849 #endif
 850 }
 851 
 852 //---------------------------mem_slice_preds---------------------------
 853 // Return a memory slice (node list) in predecessor order starting at "start"
 854 void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
 855   assert(preds.length() == 0, "start empty");
 856   Node* n = start;
 857   Node* prev = NULL;
 858   while (true) {
 859     assert(in_bb(n), "must be in block");
 860     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
 861       Node* out = n->fast_out(i);
 862       if (out->is_Load()) {
 863         if (in_bb(out)) {
 864           preds.push(out);
 865         }
 866       } else {
 867         // FIXME
 868         if (out->is_MergeMem() && !in_bb(out)) {
 869           // Either unrolling is causing a memory edge not to disappear,
 870           // or need to run igvn.optimize() again before SLP
 871         } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
 872           // Ditto.  Not sure what else to check further.
 873         } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
 874           // StoreCM has an input edge used as a precedence edge.
 875           // Maybe an issue when oop stores are vectorized.
 876         } else {
 877           assert(out == prev || prev == NULL, "no branches off of store slice");
 878         }
 879       }
 880     }
 881     if (n == stop) break;
 882     preds.push(n);
 883     prev = n;
 884     assert(n->is_Mem(), err_msg_res("unexpected node %s", n->Name()));
 885     n = n->in(MemNode::Memory);
 886   }
 887 }
 888 
 889 //------------------------------stmts_can_pack---------------------------
 890 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
 891 // s1 aligned at "align"
 892 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
 893 
 894   // Do not use superword for non-primitives
 895   BasicType bt1 = velt_basic_type(s1);
 896   BasicType bt2 = velt_basic_type(s2);
 897   if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
 898     return false;
 899   if (Matcher::max_vector_size(bt1) < 2) {
 900     return false; // No vectors for this type
 901   }
 902 
 903   if (isomorphic(s1, s2)) {
 904     if (independent(s1, s2) || reduction(s1, s2)) {
 905       if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
 906         if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
 907           int s1_align = alignment(s1);
 908           int s2_align = alignment(s2);
 909           if (s1_align == top_align || s1_align == align) {
 910             if (s2_align == top_align || s2_align == align + data_size(s1)) {
 911               return true;
 912             }
 913           }
 914         }
 915       }
 916     }
 917   }
 918   return false;
 919 }
 920 
 921 //------------------------------exists_at---------------------------
 922 // Does s exist in a pack at position pos?
 923 bool SuperWord::exists_at(Node* s, uint pos) {
 924   for (int i = 0; i < _packset.length(); i++) {
 925     Node_List* p = _packset.at(i);
 926     if (p->at(pos) == s) {
 927       return true;
 928     }
 929   }
 930   return false;
 931 }
 932 
 933 //------------------------------are_adjacent_refs---------------------------
 934 // Is s1 immediately before s2 in memory?
 935 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
 936   if (!s1->is_Mem() || !s2->is_Mem()) return false;
 937   if (!in_bb(s1)    || !in_bb(s2))    return false;
 938 
 939   // Do not use superword for non-primitives
 940   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
 941       !is_java_primitive(s2->as_Mem()->memory_type())) {
 942     return false;
 943   }
 944 
 945   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
 946   // only pack memops that are in the same alias set until that's fixed.
 947   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
 948       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
 949     return false;
 950   SWPointer p1(s1->as_Mem(), this, NULL, false);
 951   SWPointer p2(s2->as_Mem(), this, NULL, false);
 952   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
 953   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
 954   return diff == data_size(s1);
 955 }
 956 
 957 //------------------------------isomorphic---------------------------
 958 // Are s1 and s2 similar?
 959 bool SuperWord::isomorphic(Node* s1, Node* s2) {
 960   if (s1->Opcode() != s2->Opcode()) return false;
 961   if (s1->req() != s2->req()) return false;
 962   if (s1->in(0) != s2->in(0)) return false;
 963   if (!same_velt_type(s1, s2)) return false;
 964   return true;
 965 }
 966 
 967 //------------------------------independent---------------------------
 968 // Is there no data path from s1 to s2 or s2 to s1?
 969 bool SuperWord::independent(Node* s1, Node* s2) {
 970   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
 971   int d1 = depth(s1);
 972   int d2 = depth(s2);
 973   if (d1 == d2) return s1 != s2;
 974   Node* deep    = d1 > d2 ? s1 : s2;
 975   Node* shallow = d1 > d2 ? s2 : s1;
 976 
 977   visited_clear();
 978 
 979   return independent_path(shallow, deep);
 980 }
 981 
 982 //------------------------------reduction---------------------------
 983 // Is there a data path between s1 and s2 and the nodes reductions?
 984 bool SuperWord::reduction(Node* s1, Node* s2) {
 985   bool retValue = false;
 986   int d1 = depth(s1);
 987   int d2 = depth(s2);
 988   if (d1 + 1 == d2) {
 989     if (s1->is_reduction() && s2->is_reduction()) {
 990       // This is an ordered set, so s1 should define s2
 991       for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
 992         Node* t1 = s1->fast_out(i);
 993         if (t1 == s2) {
 994           // both nodes are reductions and connected
 995           retValue = true;
 996         }
 997       }
 998     }
 999   }
1000 
1001   return retValue;
1002 }
1003 
1004 //------------------------------independent_path------------------------------
1005 // Helper for independent
1006 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1007   if (dp >= 1000) return false; // stop deep recursion
1008   visited_set(deep);
1009   int shal_depth = depth(shallow);
1010   assert(shal_depth <= depth(deep), "must be");
1011   for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1012     Node* pred = preds.current();
1013     if (in_bb(pred) && !visited_test(pred)) {
1014       if (shallow == pred) {
1015         return false;
1016       }
1017       if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
1018         return false;
1019       }
1020     }
1021   }
1022   return true;
1023 }
1024 
1025 //------------------------------set_alignment---------------------------
1026 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1027   set_alignment(s1, align);
1028   if (align == top_align || align == bottom_align) {
1029     set_alignment(s2, align);
1030   } else {
1031     set_alignment(s2, align + data_size(s1));
1032   }
1033 }
1034 
1035 //------------------------------data_size---------------------------
1036 int SuperWord::data_size(Node* s) {
1037   int bsize = type2aelembytes(velt_basic_type(s));
1038   assert(bsize != 0, "valid size");
1039   return bsize;
1040 }
1041 
1042 //------------------------------extend_packlist---------------------------
1043 // Extend packset by following use->def and def->use links from pack members.
1044 void SuperWord::extend_packlist() {
1045   bool changed;
1046   do {
1047     packset_sort(_packset.length());
1048     changed = false;
1049     for (int i = 0; i < _packset.length(); i++) {
1050       Node_List* p = _packset.at(i);
1051       changed |= follow_use_defs(p);
1052       changed |= follow_def_uses(p);
1053     }
1054   } while (changed);
1055 
1056   if (_race_possible) {
1057     for (int i = 0; i < _packset.length(); i++) {
1058       Node_List* p = _packset.at(i);
1059       order_def_uses(p);
1060     }
1061   }
1062 
1063 #ifndef PRODUCT
1064   if (TraceSuperWord) {
1065     tty->print_cr("\nAfter extend_packlist");
1066     print_packset();
1067   }
1068 #endif
1069 }
1070 
1071 //------------------------------follow_use_defs---------------------------
1072 // Extend the packset by visiting operand definitions of nodes in pack p
1073 bool SuperWord::follow_use_defs(Node_List* p) {
1074   assert(p->size() == 2, "just checking");
1075   Node* s1 = p->at(0);
1076   Node* s2 = p->at(1);
1077   assert(s1->req() == s2->req(), "just checking");
1078   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1079 
1080   if (s1->is_Load()) return false;
1081 
1082   int align = alignment(s1);
1083   bool changed = false;
1084   int start = s1->is_Store() ? MemNode::ValueIn   : 1;
1085   int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
1086   for (int j = start; j < end; j++) {
1087     Node* t1 = s1->in(j);
1088     Node* t2 = s2->in(j);
1089     if (!in_bb(t1) || !in_bb(t2))
1090       continue;
1091     if (stmts_can_pack(t1, t2, align)) {
1092       if (est_savings(t1, t2) >= 0) {
1093         Node_List* pair = new Node_List();
1094         pair->push(t1);
1095         pair->push(t2);
1096         _packset.append(pair);
1097         set_alignment(t1, t2, align);
1098         changed = true;
1099       }
1100     }
1101   }
1102   return changed;
1103 }
1104 
1105 //------------------------------follow_def_uses---------------------------
1106 // Extend the packset by visiting uses of nodes in pack p
1107 bool SuperWord::follow_def_uses(Node_List* p) {
1108   bool changed = false;
1109   Node* s1 = p->at(0);
1110   Node* s2 = p->at(1);
1111   assert(p->size() == 2, "just checking");
1112   assert(s1->req() == s2->req(), "just checking");
1113   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1114 
1115   if (s1->is_Store()) return false;
1116 
1117   int align = alignment(s1);
1118   int savings = -1;
1119   int num_s1_uses = 0;
1120   Node* u1 = NULL;
1121   Node* u2 = NULL;
1122   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1123     Node* t1 = s1->fast_out(i);
1124     num_s1_uses++;
1125     if (!in_bb(t1)) continue;
1126     for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
1127       Node* t2 = s2->fast_out(j);
1128       if (!in_bb(t2)) continue;
1129       if (!opnd_positions_match(s1, t1, s2, t2))
1130         continue;
1131       if (stmts_can_pack(t1, t2, align)) {
1132         int my_savings = est_savings(t1, t2);
1133         if (my_savings > savings) {
1134           savings = my_savings;
1135           u1 = t1;
1136           u2 = t2;
1137         }
1138       }
1139     }
1140   }
1141   if (num_s1_uses > 1) {
1142     _race_possible = true;
1143   }
1144   if (savings >= 0) {
1145     Node_List* pair = new Node_List();
1146     pair->push(u1);
1147     pair->push(u2);
1148     _packset.append(pair);
1149     set_alignment(u1, u2, align);
1150     changed = true;
1151   }
1152   return changed;
1153 }
1154 
1155 //------------------------------order_def_uses---------------------------
1156 // For extended packsets, ordinally arrange uses packset by major component
1157 void SuperWord::order_def_uses(Node_List* p) {
1158   Node* s1 = p->at(0);
1159 
1160   if (s1->is_Store()) return;
1161 
1162   // reductions are always managed beforehand
1163   if (s1->is_reduction()) return;
1164 
1165   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1166     Node* t1 = s1->fast_out(i);
1167 
1168     // Only allow operand swap on commuting operations
1169     if (!t1->is_Add() && !t1->is_Mul()) {
1170       break;
1171     }
1172 
1173     // Now find t1's packset
1174     Node_List* p2 = NULL;
1175     for (int j = 0; j < _packset.length(); j++) {
1176       p2 = _packset.at(j);
1177       Node* first = p2->at(0);
1178       if (t1 == first) {
1179         break;
1180       }
1181       p2 = NULL;
1182     }
1183     // Arrange all sub components by the major component
1184     if (p2 != NULL) {
1185       for (uint j = 1; j < p->size(); j++) {
1186         Node* d1 = p->at(j);
1187         Node* u1 = p2->at(j);
1188         opnd_positions_match(s1, t1, d1, u1);
1189       }
1190     }
1191   }
1192 }
1193 
1194 //---------------------------opnd_positions_match-------------------------
1195 // Is the use of d1 in u1 at the same operand position as d2 in u2?
1196 bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
1197   // check reductions to see if they are marshalled to represent the reduction
1198   // operator in a specified opnd
1199   if (u1->is_reduction() && u2->is_reduction()) {
1200     // ensure reductions have phis and reduction definitions feeding the 1st operand
1201     Node* first = u1->in(2);
1202     if (first->is_Phi() || first->is_reduction()) {
1203       u1->swap_edges(1, 2);
1204     }
1205     // ensure reductions have phis and reduction definitions feeding the 1st operand
1206     first = u2->in(2);
1207     if (first->is_Phi() || first->is_reduction()) {
1208       u2->swap_edges(1, 2);
1209     }
1210     return true;
1211   }
1212 
1213   uint ct = u1->req();
1214   if (ct != u2->req()) return false;
1215   uint i1 = 0;
1216   uint i2 = 0;
1217   do {
1218     for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
1219     for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
1220     if (i1 != i2) {
1221       if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
1222         // Further analysis relies on operands position matching.
1223         u2->swap_edges(i1, i2);
1224       } else {
1225         return false;
1226       }
1227     }
1228   } while (i1 < ct);
1229   return true;
1230 }
1231 
1232 //------------------------------est_savings---------------------------
1233 // Estimate the savings from executing s1 and s2 as a pack
1234 int SuperWord::est_savings(Node* s1, Node* s2) {
1235   int save_in = 2 - 1; // 2 operations per instruction in packed form
1236 
1237   // inputs
1238   for (uint i = 1; i < s1->req(); i++) {
1239     Node* x1 = s1->in(i);
1240     Node* x2 = s2->in(i);
1241     if (x1 != x2) {
1242       if (are_adjacent_refs(x1, x2)) {
1243         save_in += adjacent_profit(x1, x2);
1244       } else if (!in_packset(x1, x2)) {
1245         save_in -= pack_cost(2);
1246       } else {
1247         save_in += unpack_cost(2);
1248       }
1249     }
1250   }
1251 
1252   // uses of result
1253   uint ct = 0;
1254   int save_use = 0;
1255   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1256     Node* s1_use = s1->fast_out(i);
1257     for (int j = 0; j < _packset.length(); j++) {
1258       Node_List* p = _packset.at(j);
1259       if (p->at(0) == s1_use) {
1260         for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
1261           Node* s2_use = s2->fast_out(k);
1262           if (p->at(p->size()-1) == s2_use) {
1263             ct++;
1264             if (are_adjacent_refs(s1_use, s2_use)) {
1265               save_use += adjacent_profit(s1_use, s2_use);
1266             }
1267           }
1268         }
1269       }
1270     }
1271   }
1272 
1273   if (ct < s1->outcnt()) save_use += unpack_cost(1);
1274   if (ct < s2->outcnt()) save_use += unpack_cost(1);
1275 
1276   return MAX2(save_in, save_use);
1277 }
1278 
1279 //------------------------------costs---------------------------
1280 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
1281 int SuperWord::pack_cost(int ct)   { return ct; }
1282 int SuperWord::unpack_cost(int ct) { return ct; }
1283 
1284 //------------------------------combine_packs---------------------------
1285 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
1286 void SuperWord::combine_packs() {
1287   bool changed = true;
1288   // Combine packs regardless max vector size.
1289   while (changed) {
1290     changed = false;
1291     for (int i = 0; i < _packset.length(); i++) {
1292       Node_List* p1 = _packset.at(i);
1293       if (p1 == NULL) continue;
1294       // Because of sorting we can start at i + 1
1295       for (int j = i + 1; j < _packset.length(); j++) {
1296         Node_List* p2 = _packset.at(j);
1297         if (p2 == NULL) continue;
1298         if (i == j) continue;
1299         if (p1->at(p1->size()-1) == p2->at(0)) {
1300           for (uint k = 1; k < p2->size(); k++) {
1301             p1->push(p2->at(k));
1302           }
1303           _packset.at_put(j, NULL);
1304           changed = true;
1305         }
1306       }
1307     }
1308   }
1309 
1310   // Split packs which have size greater then max vector size.
1311   for (int i = 0; i < _packset.length(); i++) {
1312     Node_List* p1 = _packset.at(i);
1313     if (p1 != NULL) {
1314       BasicType bt = velt_basic_type(p1->at(0));
1315       uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1316       assert(is_power_of_2(max_vlen), "sanity");
1317       uint psize = p1->size();
1318       if (!is_power_of_2(psize)) {
1319         // Skip pack which can't be vector.
1320         // case1: for(...) { a[i] = i; }    elements values are different (i+x)
1321         // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
1322         _packset.at_put(i, NULL);
1323         continue;
1324       }
1325       if (psize > max_vlen) {
1326         Node_List* pack = new Node_List();
1327         for (uint j = 0; j < psize; j++) {
1328           pack->push(p1->at(j));
1329           if (pack->size() >= max_vlen) {
1330             assert(is_power_of_2(pack->size()), "sanity");
1331             _packset.append(pack);
1332             pack = new Node_List();
1333           }
1334         }
1335         _packset.at_put(i, NULL);
1336       }
1337     }
1338   }
1339 
1340   // Compress list.
1341   for (int i = _packset.length() - 1; i >= 0; i--) {
1342     Node_List* p1 = _packset.at(i);
1343     if (p1 == NULL) {
1344       _packset.remove_at(i);
1345     }
1346   }
1347 
1348 #ifndef PRODUCT
1349   if (TraceSuperWord) {
1350     tty->print_cr("\nAfter combine_packs");
1351     print_packset();
1352   }
1353 #endif
1354 }
1355 
1356 //-----------------------------construct_my_pack_map--------------------------
1357 // Construct the map from nodes to packs.  Only valid after the
1358 // point where a node is only in one pack (after combine_packs).
1359 void SuperWord::construct_my_pack_map() {
1360   Node_List* rslt = NULL;
1361   for (int i = 0; i < _packset.length(); i++) {
1362     Node_List* p = _packset.at(i);
1363     for (uint j = 0; j < p->size(); j++) {
1364       Node* s = p->at(j);
1365       assert(my_pack(s) == NULL, "only in one pack");
1366       set_my_pack(s, p);
1367     }
1368   }
1369 }
1370 
1371 //------------------------------filter_packs---------------------------
1372 // Remove packs that are not implemented or not profitable.
1373 void SuperWord::filter_packs() {
1374   // Remove packs that are not implemented
1375   for (int i = _packset.length() - 1; i >= 0; i--) {
1376     Node_List* pk = _packset.at(i);
1377     bool impl = implemented(pk);
1378     if (!impl) {
1379 #ifndef PRODUCT
1380       if (TraceSuperWord && Verbose) {
1381         tty->print_cr("Unimplemented");
1382         pk->at(0)->dump();
1383       }
1384 #endif
1385       remove_pack_at(i);
1386     }
1387     Node *n = pk->at(0);
1388     if (n->is_reduction()) {
1389       _num_reductions++;
1390     } else {
1391       _num_work_vecs++;
1392     }
1393   }
1394 
1395   // Remove packs that are not profitable
1396   bool changed;
1397   do {
1398     changed = false;
1399     for (int i = _packset.length() - 1; i >= 0; i--) {
1400       Node_List* pk = _packset.at(i);
1401       bool prof = profitable(pk);
1402       if (!prof) {
1403 #ifndef PRODUCT
1404         if (TraceSuperWord && Verbose) {
1405           tty->print_cr("Unprofitable");
1406           pk->at(0)->dump();
1407         }
1408 #endif
1409         remove_pack_at(i);
1410         changed = true;
1411       }
1412     }
1413   } while (changed);
1414 
1415 #ifndef PRODUCT
1416   if (TraceSuperWord) {
1417     tty->print_cr("\nAfter filter_packs");
1418     print_packset();
1419     tty->cr();
1420   }
1421 #endif
1422 }
1423 
1424 //------------------------------implemented---------------------------
1425 // Can code be generated for pack p?
1426 bool SuperWord::implemented(Node_List* p) {
1427   bool retValue = false;
1428   Node* p0 = p->at(0);
1429   if (p0 != NULL) {
1430     int opc = p0->Opcode();
1431     uint size = p->size();
1432     if (p0->is_reduction()) {
1433       const Type *arith_type = p0->bottom_type();
1434       // Length 2 reductions of INT/LONG do not offer performance benefits
1435       if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
1436         retValue = false;
1437       } else {
1438         retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
1439       }
1440     } else {
1441       retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
1442     }
1443   }
1444   return retValue;
1445 }
1446 
1447 //------------------------------same_inputs--------------------------
1448 // For pack p, are all idx operands the same?
1449 static bool same_inputs(Node_List* p, int idx) {
1450   Node* p0 = p->at(0);
1451   uint vlen = p->size();
1452   Node* p0_def = p0->in(idx);
1453   for (uint i = 1; i < vlen; i++) {
1454     Node* pi = p->at(i);
1455     Node* pi_def = pi->in(idx);
1456     if (p0_def != pi_def)
1457       return false;
1458   }
1459   return true;
1460 }
1461 
1462 //------------------------------profitable---------------------------
1463 // For pack p, are all operands and all uses (with in the block) vector?
1464 bool SuperWord::profitable(Node_List* p) {
1465   Node* p0 = p->at(0);
1466   uint start, end;
1467   VectorNode::vector_operands(p0, &start, &end);
1468 
1469   // Return false if some inputs are not vectors or vectors with different
1470   // size or alignment.
1471   // Also, for now, return false if not scalar promotion case when inputs are
1472   // the same. Later, implement PackNode and allow differing, non-vector inputs
1473   // (maybe just the ones from outside the block.)
1474   for (uint i = start; i < end; i++) {
1475     if (!is_vector_use(p0, i))
1476       return false;
1477   }
1478   // Check if reductions are connected
1479   if (p0->is_reduction()) {
1480     Node* second_in = p0->in(2);
1481     Node_List* second_pk = my_pack(second_in);
1482     if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
1483       // Remove reduction flag if no parent pack or if not enough work
1484       // to cover reduction expansion overhead
1485       p0->remove_flag(Node::Flag_is_reduction);
1486       return false;
1487     } else if (second_pk->size() != p->size()) {
1488       return false;
1489     }
1490   }
1491   if (VectorNode::is_shift(p0)) {
1492     // For now, return false if shift count is vector or not scalar promotion
1493     // case (different shift counts) because it is not supported yet.
1494     Node* cnt = p0->in(2);
1495     Node_List* cnt_pk = my_pack(cnt);
1496     if (cnt_pk != NULL)
1497       return false;
1498     if (!same_inputs(p, 2))
1499       return false;
1500   }
1501   if (!p0->is_Store()) {
1502     // For now, return false if not all uses are vector.
1503     // Later, implement ExtractNode and allow non-vector uses (maybe
1504     // just the ones outside the block.)
1505     for (uint i = 0; i < p->size(); i++) {
1506       Node* def = p->at(i);
1507       for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1508         Node* use = def->fast_out(j);
1509         for (uint k = 0; k < use->req(); k++) {
1510           Node* n = use->in(k);
1511           if (def == n) {
1512             // reductions can be loop carried dependences
1513             if (def->is_reduction() && use->is_Phi())
1514               continue;
1515             if (!is_vector_use(use, k)) {
1516               return false;
1517             }
1518           }
1519         }
1520       }
1521     }
1522   }
1523   return true;
1524 }
1525 
1526 //------------------------------schedule---------------------------
1527 // Adjust the memory graph for the packed operations
1528 void SuperWord::schedule() {
1529 
1530   // Co-locate in the memory graph the members of each memory pack
1531   for (int i = 0; i < _packset.length(); i++) {
1532     co_locate_pack(_packset.at(i));
1533   }
1534 }
1535 
1536 //-------------------------------remove_and_insert-------------------
1537 // Remove "current" from its current position in the memory graph and insert
1538 // it after the appropriate insertion point (lip or uip).
1539 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
1540                                   Node *uip, Unique_Node_List &sched_before) {
1541   Node* my_mem = current->in(MemNode::Memory);
1542   bool sched_up = sched_before.member(current);
1543 
1544   // remove current_store from its current position in the memmory graph
1545   for (DUIterator i = current->outs(); current->has_out(i); i++) {
1546     Node* use = current->out(i);
1547     if (use->is_Mem()) {
1548       assert(use->in(MemNode::Memory) == current, "must be");
1549       if (use == prev) { // connect prev to my_mem
1550           _igvn.replace_input_of(use, MemNode::Memory, my_mem);
1551           --i; //deleted this edge; rescan position
1552       } else if (sched_before.member(use)) {
1553         if (!sched_up) { // Will be moved together with current
1554           _igvn.replace_input_of(use, MemNode::Memory, uip);
1555           --i; //deleted this edge; rescan position
1556         }
1557       } else {
1558         if (sched_up) { // Will be moved together with current
1559           _igvn.replace_input_of(use, MemNode::Memory, lip);
1560           --i; //deleted this edge; rescan position
1561         }
1562       }
1563     }
1564   }
1565 
1566   Node *insert_pt =  sched_up ?  uip : lip;
1567 
1568   // all uses of insert_pt's memory state should use current's instead
1569   for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
1570     Node* use = insert_pt->out(i);
1571     if (use->is_Mem()) {
1572       assert(use->in(MemNode::Memory) == insert_pt, "must be");
1573       _igvn.replace_input_of(use, MemNode::Memory, current);
1574       --i; //deleted this edge; rescan position
1575     } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
1576       uint pos; //lip (lower insert point) must be the last one in the memory slice
1577       for (pos=1; pos < use->req(); pos++) {
1578         if (use->in(pos) == insert_pt) break;
1579       }
1580       _igvn.replace_input_of(use, pos, current);
1581       --i;
1582     }
1583   }
1584 
1585   //connect current to insert_pt
1586   _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
1587 }
1588 
1589 //------------------------------co_locate_pack----------------------------------
1590 // To schedule a store pack, we need to move any sandwiched memory ops either before
1591 // or after the pack, based upon dependence information:
1592 // (1) If any store in the pack depends on the sandwiched memory op, the
1593 //     sandwiched memory op must be scheduled BEFORE the pack;
1594 // (2) If a sandwiched memory op depends on any store in the pack, the
1595 //     sandwiched memory op must be scheduled AFTER the pack;
1596 // (3) If a sandwiched memory op (say, memA) depends on another sandwiched
1597 //     memory op (say memB), memB must be scheduled before memA. So, if memA is
1598 //     scheduled before the pack, memB must also be scheduled before the pack;
1599 // (4) If there is no dependence restriction for a sandwiched memory op, we simply
1600 //     schedule this store AFTER the pack
1601 // (5) We know there is no dependence cycle, so there in no other case;
1602 // (6) Finally, all memory ops in another single pack should be moved in the same direction.
1603 //
1604 // To schedule a load pack, we use the memory state of either the first or the last load in
1605 // the pack, based on the dependence constraint.
1606 void SuperWord::co_locate_pack(Node_List* pk) {
1607   if (pk->at(0)->is_Store()) {
1608     MemNode* first     = executed_first(pk)->as_Mem();
1609     MemNode* last      = executed_last(pk)->as_Mem();
1610     Unique_Node_List schedule_before_pack;
1611     Unique_Node_List memops;
1612 
1613     MemNode* current   = last->in(MemNode::Memory)->as_Mem();
1614     MemNode* previous  = last;
1615     while (true) {
1616       assert(in_bb(current), "stay in block");
1617       memops.push(previous);
1618       for (DUIterator i = current->outs(); current->has_out(i); i++) {
1619         Node* use = current->out(i);
1620         if (use->is_Mem() && use != previous)
1621           memops.push(use);
1622       }
1623       if (current == first) break;
1624       previous = current;
1625       current  = current->in(MemNode::Memory)->as_Mem();
1626     }
1627 
1628     // determine which memory operations should be scheduled before the pack
1629     for (uint i = 1; i < memops.size(); i++) {
1630       Node *s1 = memops.at(i);
1631       if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
1632         for (uint j = 0; j< i; j++) {
1633           Node *s2 = memops.at(j);
1634           if (!independent(s1, s2)) {
1635             if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
1636               schedule_before_pack.push(s1); // s1 must be scheduled before
1637               Node_List* mem_pk = my_pack(s1);
1638               if (mem_pk != NULL) {
1639                 for (uint ii = 0; ii < mem_pk->size(); ii++) {
1640                   Node* s = mem_pk->at(ii);  // follow partner
1641                   if (memops.member(s) && !schedule_before_pack.member(s))
1642                     schedule_before_pack.push(s);
1643                 }
1644               }
1645               break;
1646             }
1647           }
1648         }
1649       }
1650     }
1651 
1652     Node*    upper_insert_pt = first->in(MemNode::Memory);
1653     // Following code moves loads connected to upper_insert_pt below aliased stores.
1654     // Collect such loads here and reconnect them back to upper_insert_pt later.
1655     memops.clear();
1656     for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
1657       Node* use = upper_insert_pt->out(i);
1658       if (use->is_Mem() && !use->is_Store()) {
1659         memops.push(use);
1660       }
1661     }
1662 
1663     MemNode* lower_insert_pt = last;
1664     previous                 = last; //previous store in pk
1665     current                  = last->in(MemNode::Memory)->as_Mem();
1666 
1667     // start scheduling from "last" to "first"
1668     while (true) {
1669       assert(in_bb(current), "stay in block");
1670       assert(in_pack(previous, pk), "previous stays in pack");
1671       Node* my_mem = current->in(MemNode::Memory);
1672 
1673       if (in_pack(current, pk)) {
1674         // Forward users of my memory state (except "previous) to my input memory state
1675         for (DUIterator i = current->outs(); current->has_out(i); i++) {
1676           Node* use = current->out(i);
1677           if (use->is_Mem() && use != previous) {
1678             assert(use->in(MemNode::Memory) == current, "must be");
1679             if (schedule_before_pack.member(use)) {
1680               _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
1681             } else {
1682               _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
1683             }
1684             --i; // deleted this edge; rescan position
1685           }
1686         }
1687         previous = current;
1688       } else { // !in_pack(current, pk) ==> a sandwiched store
1689         remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
1690       }
1691 
1692       if (current == first) break;
1693       current = my_mem->as_Mem();
1694     } // end while
1695 
1696     // Reconnect loads back to upper_insert_pt.
1697     for (uint i = 0; i < memops.size(); i++) {
1698       Node *ld = memops.at(i);
1699       if (ld->in(MemNode::Memory) != upper_insert_pt) {
1700         _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
1701       }
1702     }
1703   } else if (pk->at(0)->is_Load()) { //load
1704     // all loads in the pack should have the same memory state. By default,
1705     // we use the memory state of the last load. However, if any load could
1706     // not be moved down due to the dependence constraint, we use the memory
1707     // state of the first load.
1708     Node* last_mem  = executed_last(pk)->in(MemNode::Memory);
1709     Node* first_mem = executed_first(pk)->in(MemNode::Memory);
1710     bool schedule_last = true;
1711     for (uint i = 0; i < pk->size(); i++) {
1712       Node* ld = pk->at(i);
1713       for (Node* current = last_mem; current != ld->in(MemNode::Memory);
1714            current=current->in(MemNode::Memory)) {
1715         assert(current != first_mem, "corrupted memory graph");
1716         if(current->is_Mem() && !independent(current, ld)){
1717           schedule_last = false; // a later store depends on this load
1718           break;
1719         }
1720       }
1721     }
1722 
1723     Node* mem_input = schedule_last ? last_mem : first_mem;
1724     _igvn.hash_delete(mem_input);
1725     // Give each load the same memory state
1726     for (uint i = 0; i < pk->size(); i++) {
1727       LoadNode* ld = pk->at(i)->as_Load();
1728       _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
1729     }
1730   }
1731 }
1732 
1733 //------------------------------output---------------------------
1734 // Convert packs into vector node operations
1735 void SuperWord::output() {
1736   if (_packset.length() == 0) return;
1737 
1738 #ifndef PRODUCT
1739   if (TraceLoopOpts) {
1740     tty->print("SuperWord    ");
1741     lpt()->dump_head();
1742   }
1743 #endif
1744 
1745   // MUST ENSURE main loop's initial value is properly aligned:
1746   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
1747 
1748   align_initial_loop_index(align_to_ref());
1749 
1750   // Insert extract (unpack) operations for scalar uses
1751   for (int i = 0; i < _packset.length(); i++) {
1752     insert_extracts(_packset.at(i));
1753   }
1754 
1755   Compile* C = _phase->C;
1756   uint max_vlen_in_bytes = 0;
1757   for (int i = 0; i < _block.length(); i++) {
1758     Node* n = _block.at(i);
1759     Node_List* p = my_pack(n);
1760     if (p && n == executed_last(p)) {
1761       uint vlen = p->size();
1762       uint vlen_in_bytes = 0;
1763       Node* vn = NULL;
1764       Node* low_adr = p->at(0);
1765       Node* first   = executed_first(p);
1766       int   opc = n->Opcode();
1767       if (n->is_Load()) {
1768         Node* ctl = n->in(MemNode::Control);
1769         Node* mem = first->in(MemNode::Memory);
1770         SWPointer p1(n->as_Mem(), this, NULL, false);
1771         // Identify the memory dependency for the new loadVector node by
1772         // walking up through memory chain.
1773         // This is done to give flexibility to the new loadVector node so that
1774         // it can move above independent storeVector nodes.
1775         while (mem->is_StoreVector()) {
1776           SWPointer p2(mem->as_Mem(), this, NULL, false);
1777           int cmp = p1.cmp(p2);
1778           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
1779             mem = mem->in(MemNode::Memory);
1780           } else {
1781             break; // dependent memory
1782           }
1783         }
1784         Node* adr = low_adr->in(MemNode::Address);
1785         const TypePtr* atyp = n->adr_type();
1786         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1787         vlen_in_bytes = vn->as_LoadVector()->memory_size();
1788       } else if (n->is_Store()) {
1789         // Promote value to be stored to vector
1790         Node* val = vector_opd(p, MemNode::ValueIn);
1791         Node* ctl = n->in(MemNode::Control);
1792         Node* mem = first->in(MemNode::Memory);
1793         Node* adr = low_adr->in(MemNode::Address);
1794         const TypePtr* atyp = n->adr_type();
1795         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
1796         vlen_in_bytes = vn->as_StoreVector()->memory_size();
1797       } else if (n->req() == 3) {
1798         // Promote operands to vector
1799         Node* in1 = NULL;
1800         bool node_isa_reduction = n->is_reduction();
1801         if (node_isa_reduction) {
1802           // the input to the first reduction operation is retained
1803           in1 = low_adr->in(1);
1804         } else {
1805           in1 = vector_opd(p, 1);
1806         }
1807         Node* in2 = vector_opd(p, 2);
1808         if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
1809           // Move invariant vector input into second position to avoid register spilling.
1810           Node* tmp = in1;
1811           in1 = in2;
1812           in2 = tmp;
1813         }
1814         if (node_isa_reduction) {
1815           const Type *arith_type = n->bottom_type();
1816           vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
1817           if (in2->is_Load()) {
1818             vlen_in_bytes = in2->as_LoadVector()->memory_size();
1819           } else {
1820             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
1821           }
1822         } else {
1823           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
1824           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
1825         }
1826       } else {
1827         ShouldNotReachHere();
1828       }
1829       assert(vn != NULL, "sanity");
1830       _igvn.register_new_node_with_optimizer(vn);
1831       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1832       for (uint j = 0; j < p->size(); j++) {
1833         Node* pm = p->at(j);
1834         _igvn.replace_node(pm, vn);
1835       }
1836       _igvn._worklist.push(vn);
1837 
1838       if (vlen_in_bytes > max_vlen_in_bytes) {
1839         max_vlen_in_bytes = vlen_in_bytes;
1840       }
1841 #ifdef ASSERT
1842       if (TraceNewVectors) {
1843         tty->print("new Vector node: ");
1844         vn->dump();
1845       }
1846 #endif
1847     }
1848   }
1849   C->set_max_vector_size(max_vlen_in_bytes);
1850 }
1851 
1852 //------------------------------vector_opd---------------------------
1853 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
1854 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
1855   Node* p0 = p->at(0);
1856   uint vlen = p->size();
1857   Node* opd = p0->in(opd_idx);
1858 
1859   if (same_inputs(p, opd_idx)) {
1860     if (opd->is_Vector() || opd->is_LoadVector()) {
1861       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
1862       return opd; // input is matching vector
1863     }
1864     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
1865       Compile* C = _phase->C;
1866       Node* cnt = opd;
1867       // Vector instructions do not mask shift count, do it here.
1868       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
1869       const TypeInt* t = opd->find_int_type();
1870       if (t != NULL && t->is_con()) {
1871         juint shift = t->get_con();
1872         if (shift > mask) { // Unsigned cmp
1873           cnt = ConNode::make(TypeInt::make(shift & mask));
1874         }
1875       } else {
1876         if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
1877           cnt = ConNode::make(TypeInt::make(mask));
1878           _igvn.register_new_node_with_optimizer(cnt);
1879           cnt = new AndINode(opd, cnt);
1880           _igvn.register_new_node_with_optimizer(cnt);
1881           _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
1882         }
1883         assert(opd->bottom_type()->isa_int(), "int type only");
1884         // Move non constant shift count into vector register.
1885         cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
1886       }
1887       if (cnt != opd) {
1888         _igvn.register_new_node_with_optimizer(cnt);
1889         _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
1890       }
1891       return cnt;
1892     }
1893     assert(!opd->is_StoreVector(), "such vector is not expected here");
1894     // Convert scalar input to vector with the same number of elements as
1895     // p0's vector. Use p0's type because size of operand's container in
1896     // vector should match p0's size regardless operand's size.
1897     const Type* p0_t = velt_type(p0);
1898     VectorNode* vn = VectorNode::scalar2vector(opd, vlen, p0_t);
1899 
1900     _igvn.register_new_node_with_optimizer(vn);
1901     _phase->set_ctrl(vn, _phase->get_ctrl(opd));
1902 #ifdef ASSERT
1903     if (TraceNewVectors) {
1904       tty->print("new Vector node: ");
1905       vn->dump();
1906     }
1907 #endif
1908     return vn;
1909   }
1910 
1911   // Insert pack operation
1912   BasicType bt = velt_basic_type(p0);
1913   PackNode* pk = PackNode::make(opd, vlen, bt);
1914   DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
1915 
1916   for (uint i = 1; i < vlen; i++) {
1917     Node* pi = p->at(i);
1918     Node* in = pi->in(opd_idx);
1919     assert(my_pack(in) == NULL, "Should already have been unpacked");
1920     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
1921     pk->add_opd(in);
1922   }
1923   _igvn.register_new_node_with_optimizer(pk);
1924   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
1925 #ifdef ASSERT
1926   if (TraceNewVectors) {
1927     tty->print("new Vector node: ");
1928     pk->dump();
1929   }
1930 #endif
1931   return pk;
1932 }
1933 
1934 //------------------------------insert_extracts---------------------------
1935 // If a use of pack p is not a vector use, then replace the
1936 // use with an extract operation.
1937 void SuperWord::insert_extracts(Node_List* p) {
1938   if (p->at(0)->is_Store()) return;
1939   assert(_n_idx_list.is_empty(), "empty (node,index) list");
1940 
1941   // Inspect each use of each pack member.  For each use that is
1942   // not a vector use, replace the use with an extract operation.
1943 
1944   for (uint i = 0; i < p->size(); i++) {
1945     Node* def = p->at(i);
1946     for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1947       Node* use = def->fast_out(j);
1948       for (uint k = 0; k < use->req(); k++) {
1949         Node* n = use->in(k);
1950         if (def == n) {
1951           if (!is_vector_use(use, k)) {
1952             _n_idx_list.push(use, k);
1953           }
1954         }
1955       }
1956     }
1957   }
1958 
1959   while (_n_idx_list.is_nonempty()) {
1960     Node* use = _n_idx_list.node();
1961     int   idx = _n_idx_list.index();
1962     _n_idx_list.pop();
1963     Node* def = use->in(idx);
1964 
1965     if (def->is_reduction()) continue;
1966 
1967     // Insert extract operation
1968     _igvn.hash_delete(def);
1969     int def_pos = alignment(def) / data_size(def);
1970 
1971     Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
1972     _igvn.register_new_node_with_optimizer(ex);
1973     _phase->set_ctrl(ex, _phase->get_ctrl(def));
1974     _igvn.replace_input_of(use, idx, ex);
1975     _igvn._worklist.push(def);
1976 
1977     bb_insert_after(ex, bb_idx(def));
1978     set_velt_type(ex, velt_type(def));
1979   }
1980 }
1981 
1982 //------------------------------is_vector_use---------------------------
1983 // Is use->in(u_idx) a vector use?
1984 bool SuperWord::is_vector_use(Node* use, int u_idx) {
1985   Node_List* u_pk = my_pack(use);
1986   if (u_pk == NULL) return false;
1987   if (use->is_reduction()) return true;
1988   Node* def = use->in(u_idx);
1989   Node_List* d_pk = my_pack(def);
1990   if (d_pk == NULL) {
1991     // check for scalar promotion
1992     Node* n = u_pk->at(0)->in(u_idx);
1993     for (uint i = 1; i < u_pk->size(); i++) {
1994       if (u_pk->at(i)->in(u_idx) != n) return false;
1995     }
1996     return true;
1997   }
1998   if (u_pk->size() != d_pk->size())
1999     return false;
2000   for (uint i = 0; i < u_pk->size(); i++) {
2001     Node* ui = u_pk->at(i);
2002     Node* di = d_pk->at(i);
2003     if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
2004       return false;
2005   }
2006   return true;
2007 }
2008 
2009 //------------------------------construct_bb---------------------------
2010 // Construct reverse postorder list of block members
2011 bool SuperWord::construct_bb() {
2012   Node* entry = bb();
2013 
2014   assert(_stk.length() == 0,            "stk is empty");
2015   assert(_block.length() == 0,          "block is empty");
2016   assert(_data_entry.length() == 0,     "data_entry is empty");
2017   assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
2018   assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
2019 
2020   // Find non-control nodes with no inputs from within block,
2021   // create a temporary map from node _idx to bb_idx for use
2022   // by the visited and post_visited sets,
2023   // and count number of nodes in block.
2024   int bb_ct = 0;
2025   for (uint i = 0; i < lpt()->_body.size(); i++) {
2026     Node *n = lpt()->_body.at(i);
2027     set_bb_idx(n, i); // Create a temporary map
2028     if (in_bb(n)) {
2029       if (n->is_LoadStore() || n->is_MergeMem() ||
2030           (n->is_Proj() && !n->as_Proj()->is_CFG())) {
2031         // Bailout if the loop has LoadStore, MergeMem or data Proj
2032         // nodes. Superword optimization does not work with them.
2033         return false;
2034       }
2035       bb_ct++;
2036       if (!n->is_CFG()) {
2037         bool found = false;
2038         for (uint j = 0; j < n->req(); j++) {
2039           Node* def = n->in(j);
2040           if (def && in_bb(def)) {
2041             found = true;
2042             break;
2043           }
2044         }
2045         if (!found) {
2046           assert(n != entry, "can't be entry");
2047           _data_entry.push(n);
2048         }
2049       }
2050     }
2051   }
2052 
2053   // Find memory slices (head and tail)
2054   for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
2055     Node *n = lp()->fast_out(i);
2056     if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
2057       Node* n_tail  = n->in(LoopNode::LoopBackControl);
2058       if (n_tail != n->in(LoopNode::EntryControl)) {
2059         if (!n_tail->is_Mem()) {
2060           assert(n_tail->is_Mem(), err_msg_res("unexpected node for memory slice: %s", n_tail->Name()));
2061           return false; // Bailout
2062         }
2063         _mem_slice_head.push(n);
2064         _mem_slice_tail.push(n_tail);
2065       }
2066     }
2067   }
2068 
2069   // Create an RPO list of nodes in block
2070 
2071   visited_clear();
2072   post_visited_clear();
2073 
2074   // Push all non-control nodes with no inputs from within block, then control entry
2075   for (int j = 0; j < _data_entry.length(); j++) {
2076     Node* n = _data_entry.at(j);
2077     visited_set(n);
2078     _stk.push(n);
2079   }
2080   visited_set(entry);
2081   _stk.push(entry);
2082 
2083   // Do a depth first walk over out edges
2084   int rpo_idx = bb_ct - 1;
2085   int size;
2086   int reduction_uses = 0;
2087   while ((size = _stk.length()) > 0) {
2088     Node* n = _stk.top(); // Leave node on stack
2089     if (!visited_test_set(n)) {
2090       // forward arc in graph
2091     } else if (!post_visited_test(n)) {
2092       // cross or back arc
2093       for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2094         Node *use = n->fast_out(i);
2095         if (in_bb(use) && !visited_test(use) &&
2096             // Don't go around backedge
2097             (!use->is_Phi() || n == entry)) {
2098           if (use->is_reduction()) {
2099             // First see if we can map the reduction on the given system we are on, then
2100             // make a data entry operation for each reduction we see.
2101             BasicType bt = use->bottom_type()->basic_type();
2102             if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
2103               reduction_uses++;
2104             }
2105           }
2106           _stk.push(use);
2107         }
2108       }
2109       if (_stk.length() == size) {
2110         // There were no additional uses, post visit node now
2111         _stk.pop(); // Remove node from stack
2112         assert(rpo_idx >= 0, "");
2113         _block.at_put_grow(rpo_idx, n);
2114         rpo_idx--;
2115         post_visited_set(n);
2116         assert(rpo_idx >= 0 || _stk.is_empty(), "");
2117       }
2118     } else {
2119       _stk.pop(); // Remove post-visited node from stack
2120     }
2121   }
2122 
2123   // Create real map of block indices for nodes
2124   for (int j = 0; j < _block.length(); j++) {
2125     Node* n = _block.at(j);
2126     set_bb_idx(n, j);
2127   }
2128 
2129   // Ensure extra info is allocated.
2130   initialize_bb();
2131 
2132 #ifndef PRODUCT
2133   if (TraceSuperWord) {
2134     print_bb();
2135     tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
2136     for (int m = 0; m < _data_entry.length(); m++) {
2137       tty->print("%3d ", m);
2138       _data_entry.at(m)->dump();
2139     }
2140     tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
2141     for (int m = 0; m < _mem_slice_head.length(); m++) {
2142       tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
2143       tty->print("    ");    _mem_slice_tail.at(m)->dump();
2144     }
2145   }
2146 #endif
2147   assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
2148   return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
2149 }
2150 
2151 //------------------------------initialize_bb---------------------------
2152 // Initialize per node info
2153 void SuperWord::initialize_bb() {
2154   Node* last = _block.at(_block.length() - 1);
2155   grow_node_info(bb_idx(last));
2156 }
2157 
2158 //------------------------------bb_insert_after---------------------------
2159 // Insert n into block after pos
2160 void SuperWord::bb_insert_after(Node* n, int pos) {
2161   int n_pos = pos + 1;
2162   // Make room
2163   for (int i = _block.length() - 1; i >= n_pos; i--) {
2164     _block.at_put_grow(i+1, _block.at(i));
2165   }
2166   for (int j = _node_info.length() - 1; j >= n_pos; j--) {
2167     _node_info.at_put_grow(j+1, _node_info.at(j));
2168   }
2169   // Set value
2170   _block.at_put_grow(n_pos, n);
2171   _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
2172   // Adjust map from node->_idx to _block index
2173   for (int i = n_pos; i < _block.length(); i++) {
2174     set_bb_idx(_block.at(i), i);
2175   }
2176 }
2177 
2178 //------------------------------compute_max_depth---------------------------
2179 // Compute max depth for expressions from beginning of block
2180 // Use to prune search paths during test for independence.
2181 void SuperWord::compute_max_depth() {
2182   int ct = 0;
2183   bool again;
2184   do {
2185     again = false;
2186     for (int i = 0; i < _block.length(); i++) {
2187       Node* n = _block.at(i);
2188       if (!n->is_Phi()) {
2189         int d_orig = depth(n);
2190         int d_in   = 0;
2191         for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
2192           Node* pred = preds.current();
2193           if (in_bb(pred)) {
2194             d_in = MAX2(d_in, depth(pred));
2195           }
2196         }
2197         if (d_in + 1 != d_orig) {
2198           set_depth(n, d_in + 1);
2199           again = true;
2200         }
2201       }
2202     }
2203     ct++;
2204   } while (again);
2205 #ifndef PRODUCT
2206   if (TraceSuperWord && Verbose)
2207     tty->print_cr("compute_max_depth iterated: %d times", ct);
2208 #endif
2209 }
2210 
2211 //-------------------------compute_vector_element_type-----------------------
2212 // Compute necessary vector element type for expressions
2213 // This propagates backwards a narrower integer type when the
2214 // upper bits of the value are not needed.
2215 // Example:  char a,b,c;  a = b + c;
2216 // Normally the type of the add is integer, but for packed character
2217 // operations the type of the add needs to be char.
2218 void SuperWord::compute_vector_element_type() {
2219 #ifndef PRODUCT
2220   if (TraceSuperWord && Verbose)
2221     tty->print_cr("\ncompute_velt_type:");
2222 #endif
2223 
2224   // Initial type
2225   for (int i = 0; i < _block.length(); i++) {
2226     Node* n = _block.at(i);
2227     set_velt_type(n, container_type(n));
2228   }
2229 
2230   // Propagate integer narrowed type backwards through operations
2231   // that don't depend on higher order bits
2232   for (int i = _block.length() - 1; i >= 0; i--) {
2233     Node* n = _block.at(i);
2234     // Only integer types need be examined
2235     const Type* vtn = velt_type(n);
2236     if (vtn->basic_type() == T_INT) {
2237       uint start, end;
2238       VectorNode::vector_operands(n, &start, &end);
2239 
2240       for (uint j = start; j < end; j++) {
2241         Node* in  = n->in(j);
2242         // Don't propagate through a memory
2243         if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
2244             data_size(n) < data_size(in)) {
2245           bool same_type = true;
2246           for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
2247             Node *use = in->fast_out(k);
2248             if (!in_bb(use) || !same_velt_type(use, n)) {
2249               same_type = false;
2250               break;
2251             }
2252           }
2253           if (same_type) {
2254             // For right shifts of small integer types (bool, byte, char, short)
2255             // we need precise information about sign-ness. Only Load nodes have
2256             // this information because Store nodes are the same for signed and
2257             // unsigned values. And any arithmetic operation after a load may
2258             // expand a value to signed Int so such right shifts can't be used
2259             // because vector elements do not have upper bits of Int.
2260             const Type* vt = vtn;
2261             if (VectorNode::is_shift(in)) {
2262               Node* load = in->in(1);
2263               if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
2264                 vt = velt_type(load);
2265               } else if (in->Opcode() != Op_LShiftI) {
2266                 // Widen type to Int to avoid creation of right shift vector
2267                 // (align + data_size(s1) check in stmts_can_pack() will fail).
2268                 // Note, left shifts work regardless type.
2269                 vt = TypeInt::INT;
2270               }
2271             }
2272             set_velt_type(in, vt);
2273           }
2274         }
2275       }
2276     }
2277   }
2278 #ifndef PRODUCT
2279   if (TraceSuperWord && Verbose) {
2280     for (int i = 0; i < _block.length(); i++) {
2281       Node* n = _block.at(i);
2282       velt_type(n)->dump();
2283       tty->print("\t");
2284       n->dump();
2285     }
2286   }
2287 #endif
2288 }
2289 
2290 //------------------------------memory_alignment---------------------------
2291 // Alignment within a vector memory reference
2292 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2293   SWPointer p(s, this, NULL, false);
2294   if (!p.valid()) {
2295     return bottom_align;
2296   }
2297   int vw = vector_width_in_bytes(s);
2298   if (vw < 2) {
2299     return bottom_align; // No vectors for this type
2300   }
2301   int offset  = p.offset_in_bytes();
2302   offset     += iv_adjust*p.memory_size();
2303   int off_rem = offset % vw;
2304   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2305   return off_mod;
2306 }
2307 
2308 //---------------------------container_type---------------------------
2309 // Smallest type containing range of values
2310 const Type* SuperWord::container_type(Node* n) {
2311   if (n->is_Mem()) {
2312     BasicType bt = n->as_Mem()->memory_type();
2313     if (n->is_Store() && (bt == T_CHAR)) {
2314       // Use T_SHORT type instead of T_CHAR for stored values because any
2315       // preceding arithmetic operation extends values to signed Int.
2316       bt = T_SHORT;
2317     }
2318     if (n->Opcode() == Op_LoadUB) {
2319       // Adjust type for unsigned byte loads, it is important for right shifts.
2320       // T_BOOLEAN is used because there is no basic type representing type
2321       // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
2322       // size (one byte) and sign is important.
2323       bt = T_BOOLEAN;
2324     }
2325     return Type::get_const_basic_type(bt);
2326   }
2327   const Type* t = _igvn.type(n);
2328   if (t->basic_type() == T_INT) {
2329     // A narrow type of arithmetic operations will be determined by
2330     // propagating the type of memory operations.
2331     return TypeInt::INT;
2332   }
2333   return t;
2334 }
2335 
2336 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
2337   const Type* vt1 = velt_type(n1);
2338   const Type* vt2 = velt_type(n2);
2339   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
2340     // Compare vectors element sizes for integer types.
2341     return data_size(n1) == data_size(n2);
2342   }
2343   return vt1 == vt2;
2344 }
2345 
2346 //------------------------------in_packset---------------------------
2347 // Are s1 and s2 in a pack pair and ordered as s1,s2?
2348 bool SuperWord::in_packset(Node* s1, Node* s2) {
2349   for (int i = 0; i < _packset.length(); i++) {
2350     Node_List* p = _packset.at(i);
2351     assert(p->size() == 2, "must be");
2352     if (p->at(0) == s1 && p->at(p->size()-1) == s2) {
2353       return true;
2354     }
2355   }
2356   return false;
2357 }
2358 
2359 //------------------------------in_pack---------------------------
2360 // Is s in pack p?
2361 Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
2362   for (uint i = 0; i < p->size(); i++) {
2363     if (p->at(i) == s) {
2364       return p;
2365     }
2366   }
2367   return NULL;
2368 }
2369 
2370 //------------------------------remove_pack_at---------------------------
2371 // Remove the pack at position pos in the packset
2372 void SuperWord::remove_pack_at(int pos) {
2373   Node_List* p = _packset.at(pos);
2374   for (uint i = 0; i < p->size(); i++) {
2375     Node* s = p->at(i);
2376     set_my_pack(s, NULL);
2377   }
2378   _packset.remove_at(pos);
2379 }
2380 
2381 void SuperWord::packset_sort(int n) {
2382   // simple bubble sort so that we capitalize with O(n) when its already sorted
2383   while (n != 0) {
2384     bool swapped = false;
2385     for (int i = 1; i < n; i++) {
2386       Node_List* q_low = _packset.at(i-1);
2387       Node_List* q_i = _packset.at(i);
2388 
2389       // only swap when we find something to swap
2390       if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
2391         Node_List* t = q_i;
2392         *(_packset.adr_at(i)) = q_low;
2393         *(_packset.adr_at(i-1)) = q_i;
2394         swapped = true;
2395       }
2396     }
2397     if (swapped == false) break;
2398     n--;
2399   }
2400 }
2401 
2402 //------------------------------executed_first---------------------------
2403 // Return the node executed first in pack p.  Uses the RPO block list
2404 // to determine order.
2405 Node* SuperWord::executed_first(Node_List* p) {
2406   Node* n = p->at(0);
2407   int n_rpo = bb_idx(n);
2408   for (uint i = 1; i < p->size(); i++) {
2409     Node* s = p->at(i);
2410     int s_rpo = bb_idx(s);
2411     if (s_rpo < n_rpo) {
2412       n = s;
2413       n_rpo = s_rpo;
2414     }
2415   }
2416   return n;
2417 }
2418 
2419 //------------------------------executed_last---------------------------
2420 // Return the node executed last in pack p.
2421 Node* SuperWord::executed_last(Node_List* p) {
2422   Node* n = p->at(0);
2423   int n_rpo = bb_idx(n);
2424   for (uint i = 1; i < p->size(); i++) {
2425     Node* s = p->at(i);
2426     int s_rpo = bb_idx(s);
2427     if (s_rpo > n_rpo) {
2428       n = s;
2429       n_rpo = s_rpo;
2430     }
2431   }
2432   return n;
2433 }
2434 
2435 //----------------------------align_initial_loop_index---------------------------
2436 // Adjust pre-loop limit so that in main loop, a load/store reference
2437 // to align_to_ref will be a position zero in the vector.
2438 //   (iv + k) mod vector_align == 0
2439 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2440   CountedLoopNode *main_head = lp()->as_CountedLoop();
2441   assert(main_head->is_main_loop(), "");
2442   CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2443   assert(pre_end != NULL, "we must have a correct pre-loop");
2444   Node *pre_opaq1 = pre_end->limit();
2445   assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2446   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2447   Node *lim0 = pre_opaq->in(1);
2448 
2449   // Where we put new limit calculations
2450   Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2451 
2452   // Ensure the original loop limit is available from the
2453   // pre-loop Opaque1 node.
2454   Node *orig_limit = pre_opaq->original_loop_limit();
2455   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2456 
2457   SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
2458   assert(align_to_ref_p.valid(), "sanity");
2459 
2460   // Given:
2461   //     lim0 == original pre loop limit
2462   //     V == v_align (power of 2)
2463   //     invar == extra invariant piece of the address expression
2464   //     e == offset [ +/- invar ]
2465   //
2466   // When reassociating expressions involving '%' the basic rules are:
2467   //     (a - b) % k == 0   =>  a % k == b % k
2468   // and:
2469   //     (a + b) % k == 0   =>  a % k == (k - b) % k
2470   //
2471   // For stride > 0 && scale > 0,
2472   //   Derive the new pre-loop limit "lim" such that the two constraints:
2473   //     (1) lim = lim0 + N           (where N is some positive integer < V)
2474   //     (2) (e + lim) % V == 0
2475   //   are true.
2476   //
2477   //   Substituting (1) into (2),
2478   //     (e + lim0 + N) % V == 0
2479   //   solve for N:
2480   //     N = (V - (e + lim0)) % V
2481   //   substitute back into (1), so that new limit
2482   //     lim = lim0 + (V - (e + lim0)) % V
2483   //
2484   // For stride > 0 && scale < 0
2485   //   Constraints:
2486   //     lim = lim0 + N
2487   //     (e - lim) % V == 0
2488   //   Solving for lim:
2489   //     (e - lim0 - N) % V == 0
2490   //     N = (e - lim0) % V
2491   //     lim = lim0 + (e - lim0) % V
2492   //
2493   // For stride < 0 && scale > 0
2494   //   Constraints:
2495   //     lim = lim0 - N
2496   //     (e + lim) % V == 0
2497   //   Solving for lim:
2498   //     (e + lim0 - N) % V == 0
2499   //     N = (e + lim0) % V
2500   //     lim = lim0 - (e + lim0) % V
2501   //
2502   // For stride < 0 && scale < 0
2503   //   Constraints:
2504   //     lim = lim0 - N
2505   //     (e - lim) % V == 0
2506   //   Solving for lim:
2507   //     (e - lim0 + N) % V == 0
2508   //     N = (V - (e - lim0)) % V
2509   //     lim = lim0 - (V - (e - lim0)) % V
2510 
2511   int vw = vector_width_in_bytes(align_to_ref);
2512   int stride   = iv_stride();
2513   int scale    = align_to_ref_p.scale_in_bytes();
2514   int elt_size = align_to_ref_p.memory_size();
2515   int v_align  = vw / elt_size;
2516   assert(v_align > 1, "sanity");
2517   int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
2518   Node *offsn  = _igvn.intcon(offset);
2519 
2520   Node *e = offsn;
2521   if (align_to_ref_p.invar() != NULL) {
2522     // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
2523     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
2524     Node* aref     = new URShiftINode(align_to_ref_p.invar(), log2_elt);
2525     _igvn.register_new_node_with_optimizer(aref);
2526     _phase->set_ctrl(aref, pre_ctrl);
2527     if (align_to_ref_p.negate_invar()) {
2528       e = new SubINode(e, aref);
2529     } else {
2530       e = new AddINode(e, aref);
2531     }
2532     _igvn.register_new_node_with_optimizer(e);
2533     _phase->set_ctrl(e, pre_ctrl);
2534   }
2535   if (vw > ObjectAlignmentInBytes) {
2536     // incorporate base e +/- base && Mask >>> log2(elt)
2537     Node* xbase = new CastP2XNode(NULL, align_to_ref_p.base());
2538     _igvn.register_new_node_with_optimizer(xbase);
2539 #ifdef _LP64
2540     xbase  = new ConvL2INode(xbase);
2541     _igvn.register_new_node_with_optimizer(xbase);
2542 #endif
2543     Node* mask = _igvn.intcon(vw-1);
2544     Node* masked_xbase  = new AndINode(xbase, mask);
2545     _igvn.register_new_node_with_optimizer(masked_xbase);
2546     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
2547     Node* bref     = new URShiftINode(masked_xbase, log2_elt);
2548     _igvn.register_new_node_with_optimizer(bref);
2549     _phase->set_ctrl(bref, pre_ctrl);
2550     e = new AddINode(e, bref);
2551     _igvn.register_new_node_with_optimizer(e);
2552     _phase->set_ctrl(e, pre_ctrl);
2553   }
2554 
2555   // compute e +/- lim0
2556   if (scale < 0) {
2557     e = new SubINode(e, lim0);
2558   } else {
2559     e = new AddINode(e, lim0);
2560   }
2561   _igvn.register_new_node_with_optimizer(e);
2562   _phase->set_ctrl(e, pre_ctrl);
2563 
2564   if (stride * scale > 0) {
2565     // compute V - (e +/- lim0)
2566     Node* va  = _igvn.intcon(v_align);
2567     e = new SubINode(va, e);
2568     _igvn.register_new_node_with_optimizer(e);
2569     _phase->set_ctrl(e, pre_ctrl);
2570   }
2571   // compute N = (exp) % V
2572   Node* va_msk = _igvn.intcon(v_align - 1);
2573   Node* N = new AndINode(e, va_msk);
2574   _igvn.register_new_node_with_optimizer(N);
2575   _phase->set_ctrl(N, pre_ctrl);
2576 
2577   //   substitute back into (1), so that new limit
2578   //     lim = lim0 + N
2579   Node* lim;
2580   if (stride < 0) {
2581     lim = new SubINode(lim0, N);
2582   } else {
2583     lim = new AddINode(lim0, N);
2584   }
2585   _igvn.register_new_node_with_optimizer(lim);
2586   _phase->set_ctrl(lim, pre_ctrl);
2587   Node* constrained =
2588     (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
2589                  : (Node*) new MaxINode(lim, orig_limit);
2590   _igvn.register_new_node_with_optimizer(constrained);
2591   _phase->set_ctrl(constrained, pre_ctrl);
2592   _igvn.hash_delete(pre_opaq);
2593   pre_opaq->set_req(1, constrained);
2594 }
2595 
2596 //----------------------------get_pre_loop_end---------------------------
2597 // Find pre loop end from main loop.  Returns null if none.
2598 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode *cl) {
2599   Node *ctrl = cl->in(LoopNode::EntryControl);
2600   if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return NULL;
2601   Node *iffm = ctrl->in(0);
2602   if (!iffm->is_If()) return NULL;
2603   Node *p_f = iffm->in(0);
2604   if (!p_f->is_IfFalse()) return NULL;
2605   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
2606   CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
2607   CountedLoopNode* loop_node = pre_end->loopnode();
2608   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
2609   return pre_end;
2610 }
2611 
2612 
2613 //------------------------------init---------------------------
2614 void SuperWord::init() {
2615   _dg.init();
2616   _packset.clear();
2617   _disjoint_ptrs.clear();
2618   _block.clear();
2619   _data_entry.clear();
2620   _mem_slice_head.clear();
2621   _mem_slice_tail.clear();
2622   _iteration_first.clear();
2623   _iteration_last.clear();
2624   _node_info.clear();
2625   _align_to_ref = NULL;
2626   _lpt = NULL;
2627   _lp = NULL;
2628   _bb = NULL;
2629   _iv = NULL;
2630   _race_possible = 0;
2631   _early_return = false;
2632   _num_work_vecs = 0;
2633   _num_reductions = 0;
2634 }
2635 
2636 //------------------------------restart---------------------------
2637 void SuperWord::restart() {
2638   _dg.init();
2639   _packset.clear();
2640   _disjoint_ptrs.clear();
2641   _block.clear();
2642   _data_entry.clear();
2643   _mem_slice_head.clear();
2644   _mem_slice_tail.clear();
2645   _node_info.clear();
2646 }
2647 
2648 //------------------------------print_packset---------------------------
2649 void SuperWord::print_packset() {
2650 #ifndef PRODUCT
2651   tty->print_cr("packset");
2652   for (int i = 0; i < _packset.length(); i++) {
2653     tty->print_cr("Pack: %d", i);
2654     Node_List* p = _packset.at(i);
2655     print_pack(p);
2656   }
2657 #endif
2658 }
2659 
2660 //------------------------------print_pack---------------------------
2661 void SuperWord::print_pack(Node_List* p) {
2662   for (uint i = 0; i < p->size(); i++) {
2663     print_stmt(p->at(i));
2664   }
2665 }
2666 
2667 //------------------------------print_bb---------------------------
2668 void SuperWord::print_bb() {
2669 #ifndef PRODUCT
2670   tty->print_cr("\nBlock");
2671   for (int i = 0; i < _block.length(); i++) {
2672     Node* n = _block.at(i);
2673     tty->print("%d ", i);
2674     if (n) {
2675       n->dump();
2676     }
2677   }
2678 #endif
2679 }
2680 
2681 //------------------------------print_stmt---------------------------
2682 void SuperWord::print_stmt(Node* s) {
2683 #ifndef PRODUCT
2684   tty->print(" align: %d \t", alignment(s));
2685   s->dump();
2686 #endif
2687 }
2688 
2689 //------------------------------blank---------------------------
2690 char* SuperWord::blank(uint depth) {
2691   static char blanks[101];
2692   assert(depth < 101, "too deep");
2693   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
2694   blanks[depth] = '\0';
2695   return blanks;
2696 }
2697 
2698 
2699 //==============================SWPointer===========================
2700 
2701 //----------------------------SWPointer------------------------
2702 SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
2703   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
2704   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
2705   _nstack(nstack), _analyze_only(analyze_only),
2706   _stack_idx(0) {
2707 
2708   Node* adr = mem->in(MemNode::Address);
2709   if (!adr->is_AddP()) {
2710     assert(!valid(), "too complex");
2711     return;
2712   }
2713   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
2714   Node* base = adr->in(AddPNode::Base);
2715   // The base address should be loop invariant
2716   if (!invariant(base)) {
2717     assert(!valid(), "base address is loop variant");
2718     return;
2719   }
2720   //unsafe reference could not be aligned appropriately without runtime checking
2721   if (base == NULL || base->bottom_type() == Type::TOP) {
2722     assert(!valid(), "unsafe access");
2723     return;
2724   }
2725   for (int i = 0; i < 3; i++) {
2726     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
2727       assert(!valid(), "too complex");
2728       return;
2729     }
2730     adr = adr->in(AddPNode::Address);
2731     if (base == adr || !adr->is_AddP()) {
2732       break; // stop looking at addp's
2733     }
2734   }
2735   _base = base;
2736   _adr  = adr;
2737   assert(valid(), "Usable");
2738 }
2739 
2740 // Following is used to create a temporary object during
2741 // the pattern match of an address expression.
2742 SWPointer::SWPointer(SWPointer* p) :
2743   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
2744   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
2745   _nstack(p->_nstack), _analyze_only(p->_analyze_only),
2746   _stack_idx(p->_stack_idx) {}
2747 
2748 //------------------------scaled_iv_plus_offset--------------------
2749 // Match: k*iv + offset
2750 // where: k is a constant that maybe zero, and
2751 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
2752 bool SWPointer::scaled_iv_plus_offset(Node* n) {
2753   if (scaled_iv(n)) {
2754     return true;
2755   }
2756   if (offset_plus_k(n)) {
2757     return true;
2758   }
2759   int opc = n->Opcode();
2760   if (opc == Op_AddI) {
2761     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
2762       return true;
2763     }
2764     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2765       return true;
2766     }
2767   } else if (opc == Op_SubI) {
2768     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2), true)) {
2769       return true;
2770     }
2771     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2772       _scale *= -1;
2773       return true;
2774     }
2775   }
2776   return false;
2777 }
2778 
2779 //----------------------------scaled_iv------------------------
2780 // Match: k*iv where k is a constant that's not zero
2781 bool SWPointer::scaled_iv(Node* n) {
2782   if (_scale != 0) {
2783     return false;  // already found a scale
2784   }
2785   if (n == iv()) {
2786     _scale = 1;
2787     return true;
2788   }
2789   if (_analyze_only && (invariant(n) == false)) {
2790     _nstack->push(n, _stack_idx++);
2791   }
2792   int opc = n->Opcode();
2793   if (opc == Op_MulI) {
2794     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2795       _scale = n->in(2)->get_int();
2796       return true;
2797     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
2798       _scale = n->in(1)->get_int();
2799       return true;
2800     }
2801   } else if (opc == Op_LShiftI) {
2802     if (n->in(1) == iv() && n->in(2)->is_Con()) {
2803       _scale = 1 << n->in(2)->get_int();
2804       return true;
2805     }
2806   } else if (opc == Op_ConvI2L) {
2807     if (scaled_iv_plus_offset(n->in(1))) {
2808       return true;
2809     }
2810   } else if (opc == Op_LShiftL) {
2811     if (!has_iv() && _invar == NULL) {
2812       // Need to preserve the current _offset value, so
2813       // create a temporary object for this expression subtree.
2814       // Hacky, so should re-engineer the address pattern match.
2815       SWPointer tmp(this);
2816       if (tmp.scaled_iv_plus_offset(n->in(1))) {
2817         if (tmp._invar == NULL) {
2818           int mult = 1 << n->in(2)->get_int();
2819           _scale   = tmp._scale  * mult;
2820           _offset += tmp._offset * mult;
2821           return true;
2822         }
2823       }
2824     }
2825   }
2826   return false;
2827 }
2828 
2829 //----------------------------offset_plus_k------------------------
2830 // Match: offset is (k [+/- invariant])
2831 // where k maybe zero and invariant is optional, but not both.
2832 bool SWPointer::offset_plus_k(Node* n, bool negate) {
2833   int opc = n->Opcode();
2834   if (opc == Op_ConI) {
2835     _offset += negate ? -(n->get_int()) : n->get_int();
2836     return true;
2837   } else if (opc == Op_ConL) {
2838     // Okay if value fits into an int
2839     const TypeLong* t = n->find_long_type();
2840     if (t->higher_equal(TypeLong::INT)) {
2841       jlong loff = n->get_long();
2842       jint  off  = (jint)loff;
2843       _offset += negate ? -off : loff;
2844       return true;
2845     }
2846     return false;
2847   }
2848   if (_invar != NULL) return false; // already have an invariant
2849   if (_analyze_only && (invariant(n) == false)) {
2850     _nstack->push(n, _stack_idx++);
2851   }
2852   if (opc == Op_AddI) {
2853     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2854       _negate_invar = negate;
2855       _invar = n->in(1);
2856       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2857       return true;
2858     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2859       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2860       _negate_invar = negate;
2861       _invar = n->in(2);
2862       return true;
2863     }
2864   }
2865   if (opc == Op_SubI) {
2866     if (n->in(2)->is_Con() && invariant(n->in(1))) {
2867       _negate_invar = negate;
2868       _invar = n->in(1);
2869       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2870       return true;
2871     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2872       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2873       _negate_invar = !negate;
2874       _invar = n->in(2);
2875       return true;
2876     }
2877   }
2878   if (invariant(n)) {
2879     _negate_invar = negate;
2880     _invar = n;
2881     return true;
2882   }
2883   return false;
2884 }
2885 
2886 //----------------------------print------------------------
2887 void SWPointer::print() {
2888 #ifndef PRODUCT
2889   tty->print("base: %d  adr: %d  scale: %d  offset: %d  invar: %c%d\n",
2890              _base != NULL ? _base->_idx : 0,
2891              _adr  != NULL ? _adr->_idx  : 0,
2892              _scale, _offset,
2893              _negate_invar?'-':'+',
2894              _invar != NULL ? _invar->_idx : 0);
2895 #endif
2896 }
2897 
2898 // ========================= OrderedPair =====================
2899 
2900 const OrderedPair OrderedPair::initial;
2901 
2902 // ========================= SWNodeInfo =====================
2903 
2904 const SWNodeInfo SWNodeInfo::initial;
2905 
2906 
2907 // ============================ DepGraph ===========================
2908 
2909 //------------------------------make_node---------------------------
2910 // Make a new dependence graph node for an ideal node.
2911 DepMem* DepGraph::make_node(Node* node) {
2912   DepMem* m = new (_arena) DepMem(node);
2913   if (node != NULL) {
2914     assert(_map.at_grow(node->_idx) == NULL, "one init only");
2915     _map.at_put_grow(node->_idx, m);
2916   }
2917   return m;
2918 }
2919 
2920 //------------------------------make_edge---------------------------
2921 // Make a new dependence graph edge from dpred -> dsucc
2922 DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
2923   DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head());
2924   dpred->set_out_head(e);
2925   dsucc->set_in_head(e);
2926   return e;
2927 }
2928 
2929 // ========================== DepMem ========================
2930 
2931 //------------------------------in_cnt---------------------------
2932 int DepMem::in_cnt() {
2933   int ct = 0;
2934   for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
2935   return ct;
2936 }
2937 
2938 //------------------------------out_cnt---------------------------
2939 int DepMem::out_cnt() {
2940   int ct = 0;
2941   for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
2942   return ct;
2943 }
2944 
2945 //------------------------------print-----------------------------
2946 void DepMem::print() {
2947 #ifndef PRODUCT
2948   tty->print("  DepNode %d (", _node->_idx);
2949   for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
2950     Node* pred = p->pred()->node();
2951     tty->print(" %d", pred != NULL ? pred->_idx : 0);
2952   }
2953   tty->print(") [");
2954   for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
2955     Node* succ = s->succ()->node();
2956     tty->print(" %d", succ != NULL ? succ->_idx : 0);
2957   }
2958   tty->print_cr(" ]");
2959 #endif
2960 }
2961 
2962 // =========================== DepEdge =========================
2963 
2964 //------------------------------DepPreds---------------------------
2965 void DepEdge::print() {
2966 #ifndef PRODUCT
2967   tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
2968 #endif
2969 }
2970 
2971 // =========================== DepPreds =========================
2972 // Iterator over predecessor edges in the dependence graph.
2973 
2974 //------------------------------DepPreds---------------------------
2975 DepPreds::DepPreds(Node* n, DepGraph& dg) {
2976   _n = n;
2977   _done = false;
2978   if (_n->is_Store() || _n->is_Load()) {
2979     _next_idx = MemNode::Address;
2980     _end_idx  = n->req();
2981     _dep_next = dg.dep(_n)->in_head();
2982   } else if (_n->is_Mem()) {
2983     _next_idx = 0;
2984     _end_idx  = 0;
2985     _dep_next = dg.dep(_n)->in_head();
2986   } else {
2987     _next_idx = 1;
2988     _end_idx  = _n->req();
2989     _dep_next = NULL;
2990   }
2991   next();
2992 }
2993 
2994 //------------------------------next---------------------------
2995 void DepPreds::next() {
2996   if (_dep_next != NULL) {
2997     _current  = _dep_next->pred()->node();
2998     _dep_next = _dep_next->next_in();
2999   } else if (_next_idx < _end_idx) {
3000     _current  = _n->in(_next_idx++);
3001   } else {
3002     _done = true;
3003   }
3004 }
3005 
3006 // =========================== DepSuccs =========================
3007 // Iterator over successor edges in the dependence graph.
3008 
3009 //------------------------------DepSuccs---------------------------
3010 DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
3011   _n = n;
3012   _done = false;
3013   if (_n->is_Load()) {
3014     _next_idx = 0;
3015     _end_idx  = _n->outcnt();
3016     _dep_next = dg.dep(_n)->out_head();
3017   } else if (_n->is_Mem() || _n->is_Phi() && _n->bottom_type() == Type::MEMORY) {
3018     _next_idx = 0;
3019     _end_idx  = 0;
3020     _dep_next = dg.dep(_n)->out_head();
3021   } else {
3022     _next_idx = 0;
3023     _end_idx  = _n->outcnt();
3024     _dep_next = NULL;
3025   }
3026   next();
3027 }
3028 
3029 //-------------------------------next---------------------------
3030 void DepSuccs::next() {
3031   if (_dep_next != NULL) {
3032     _current  = _dep_next->succ()->node();
3033     _dep_next = _dep_next->next_out();
3034   } else if (_next_idx < _end_idx) {
3035     _current  = _n->raw_out(_next_idx++);
3036   } else {
3037     _done = true;
3038   }
3039 }
3040 
3041 //
3042 // --------------------------------- vectorization/simd -----------------------------------
3043 //
3044 Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
3045   assert(in_bb(ld), "must be in block");
3046   if (_clone_map.gen(ld->_idx) == _ii_first) {
3047 #ifndef PRODUCT
3048     if (_vector_loop_debug) {
3049       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
3050                     _clone_map.gen(ld->_idx));
3051     }
3052 #endif
3053     return NULL; //we think that any ld in the first gen being vectorizable
3054   }
3055 
3056   Node* mem = ld->in(MemNode::Memory);
3057   if (mem->outcnt() <= 1) {
3058     // we don't want to remove the only edge from mem node to load
3059 #ifndef PRODUCT
3060     if (_vector_loop_debug) {
3061       tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
3062                     mem->_idx, ld->_idx);
3063       ld->dump();
3064       mem->dump();
3065     }
3066 #endif
3067     return NULL;
3068   }
3069   if (!in_bb(mem) || _clone_map.gen(mem->_idx) == _clone_map.gen(ld->_idx)) {
3070 #ifndef PRODUCT
3071     if (_vector_loop_debug) {
3072       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
3073                     _clone_map.gen(mem->_idx));
3074     }
3075 #endif
3076     return NULL; // does not depend on loop volatile node or depends on the same generation
3077   }
3078 
3079   //otherwise first node should depend on mem-phi
3080   Node* first = first_node(ld);
3081   assert(first->is_Load(), "must be Load");
3082   Node* phi = first->as_Load()->in(MemNode::Memory);
3083   if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
3084 #ifndef PRODUCT
3085     if (_vector_loop_debug) {
3086       tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
3087       ld->dump();
3088       first->dump();
3089     }
3090 #endif
3091     return NULL;
3092   }
3093 
3094   Node* tail = 0;
3095   for (int m = 0; m < _mem_slice_head.length(); m++) {
3096     if (_mem_slice_head.at(m) == phi) {
3097       tail = _mem_slice_tail.at(m);
3098     }
3099   }
3100   if (tail == 0) { //test that found phi is in the list  _mem_slice_head
3101 #ifndef PRODUCT
3102     if (_vector_loop_debug) {
3103       tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
3104                     ld->_idx, phi->_idx);
3105       ld->dump();
3106       phi->dump();
3107     }
3108 #endif
3109     return NULL;
3110   }
3111 
3112   // now all conditions are met
3113   return phi;
3114 }
3115 
3116 Node* SuperWord::first_node(Node* nd) {
3117   for (int ii = 0; ii < _iteration_first.length(); ii++) {
3118     Node* nnn = _iteration_first.at(ii);
3119     if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
3120 #ifndef PRODUCT
3121       if (_vector_loop_debug) {
3122         tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
3123                       nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
3124       }
3125 #endif
3126       return nnn;
3127     }
3128   }
3129 
3130 #ifndef PRODUCT
3131   if (_vector_loop_debug) {
3132     tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
3133                   nd->_idx, _clone_map.idx(nd->_idx));
3134   }
3135 #endif
3136   return 0;
3137 }
3138 
3139 Node* SuperWord::last_node(Node* nd) {
3140   for (int ii = 0; ii < _iteration_last.length(); ii++) {
3141     Node* nnn = _iteration_last.at(ii);
3142     if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
3143 #ifndef PRODUCT
3144       if (_vector_loop_debug) {
3145         tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
3146                       _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
3147       }
3148 #endif
3149       return nnn;
3150     }
3151   }
3152   return 0;
3153 }
3154 
3155 int SuperWord::mark_generations() {
3156   Node *ii_err = 0, *tail_err;
3157   for (int i = 0; i < _mem_slice_head.length(); i++) {
3158     Node* phi  = _mem_slice_head.at(i);
3159     assert(phi->is_Phi(), "must be phi");
3160 
3161     Node* tail = _mem_slice_tail.at(i);
3162     if (_ii_last == -1) {
3163       tail_err = tail;
3164       _ii_last = _clone_map.gen(tail->_idx);
3165     }
3166     else if (_ii_last != _clone_map.gen(tail->_idx)) {
3167 #ifndef PRODUCT
3168       if (TraceSuperWord && Verbose) {
3169         tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
3170         tail->dump();
3171         tail_err->dump();
3172       }
3173 #endif
3174       return -1;
3175     }
3176 
3177     // find first iteration in the loop
3178     for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
3179       Node* ii = phi->fast_out(i);
3180       if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
3181         if (_ii_first == -1) {
3182           ii_err = ii;
3183           _ii_first = _clone_map.gen(ii->_idx);
3184         } else if (_ii_first != _clone_map.gen(ii->_idx)) {
3185 #ifndef PRODUCT
3186           if (TraceSuperWord && Verbose) {
3187             tty->print_cr("SuperWord::mark_generations _ii_first error - found different generations in two nodes ");
3188             ii->dump();
3189             ii_err->dump();
3190           }
3191 #endif
3192           return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
3193         }
3194       }
3195     }//for (DUIterator_Fast imax,
3196   }//for (int i...
3197 
3198   if (_ii_first == -1 || _ii_last == -1) {
3199 #ifndef PRODUCT
3200     if (TraceSuperWord && Verbose) {
3201       tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
3202     }
3203 #endif
3204     return -1; // something vent wrong
3205   }
3206   // collect nodes in the first and last generations
3207   assert(_iteration_first.length() == 0, "_iteration_first must be empty");
3208   assert(_iteration_last.length() == 0, "_iteration_last must be empty");
3209   for (int j = 0; j < _block.length(); j++) {
3210     Node* n = _block.at(j);
3211     node_idx_t gen = _clone_map.gen(n->_idx);
3212     if ((signed)gen == _ii_first) {
3213       _iteration_first.push(n);
3214     } else if ((signed)gen == _ii_last) {
3215       _iteration_last.push(n);
3216     }
3217   }
3218 
3219   // building order of iterations
3220   assert(_ii_order.length() == 0, "should be empty");
3221   if (ii_err != 0) {
3222     assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
3223     Node* nd = ii_err;
3224     while(_clone_map.gen(nd->_idx) != _ii_last) {
3225       _ii_order.push(_clone_map.gen(nd->_idx));
3226       bool found = false;
3227       for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
3228         Node* use = nd->fast_out(i);
3229         if (_clone_map.idx(use->_idx) == _clone_map.idx(nd->_idx) && use->as_Store()->in(MemNode::Memory) == nd) {
3230           found = true;
3231           nd = use;
3232           break;
3233         }
3234       }//for
3235 
3236       if (found == false) {
3237 #ifndef PRODUCT
3238         if (TraceSuperWord && Verbose) {
3239           tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
3240         }
3241 #endif
3242         _ii_order.clear();
3243         return -1;
3244       }
3245     } //while
3246     _ii_order.push(_clone_map.gen(nd->_idx));
3247   }
3248 
3249 #ifndef PRODUCT
3250   if (_vector_loop_debug) {
3251     tty->print_cr("SuperWord::mark_generations");
3252     tty->print_cr("First generation (%d) nodes:", _ii_first);
3253     for (int ii = 0; ii < _iteration_first.length(); ii++)  _iteration_first.at(ii)->dump();
3254     tty->print_cr("Last generation (%d) nodes:", _ii_last);
3255     for (int ii = 0; ii < _iteration_last.length(); ii++)  _iteration_last.at(ii)->dump();
3256     tty->print_cr(" ");
3257 
3258     tty->print("SuperWord::List of generations: ");
3259     for (int jj = 0; jj < _ii_order.length(); ++jj) {
3260       tty->print("%d:%d ", jj, _ii_order.at(jj));
3261     }
3262     tty->print_cr(" ");
3263   }
3264 #endif
3265 
3266   return _ii_first;
3267 }
3268 
3269 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
3270   assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
3271   assert(_clone_map.idx(gold->_idx) == _clone_map.idx(fix->_idx), "should be clones of the same node");
3272   Node* gin1 = gold->in(1);
3273   Node* gin2 = gold->in(2);
3274   Node* fin1 = fix->in(1);
3275   Node* fin2 = fix->in(2);
3276   bool swapped = false;
3277 
3278   if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
3279     if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin1->_idx) &&
3280         _clone_map.idx(gin2->_idx) == _clone_map.idx(fin2->_idx)) {
3281       return true; // nothing to fix
3282     }
3283     if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin2->_idx) &&
3284         _clone_map.idx(gin2->_idx) == _clone_map.idx(fin1->_idx)) {
3285       fix->swap_edges(1, 2);
3286       swapped = true;
3287     }
3288   }
3289   // at least one input comes from outside of bb
3290   if (gin1->_idx == fin1->_idx)  {
3291     return true; // nothing to fix
3292   }
3293   if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx))  { //swapping is expensive, check condition first
3294     fix->swap_edges(1, 2);
3295     swapped = true;
3296   }
3297 
3298   if (swapped) {
3299 #ifndef PRODUCT
3300     if (_vector_loop_debug) {
3301       tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
3302     }
3303 #endif
3304     return true;
3305   }
3306 
3307 #ifndef PRODUCT
3308   if (TraceSuperWord && Verbose) {
3309     tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
3310   }
3311 #endif
3312   return false;
3313 }
3314 
3315 bool SuperWord::pack_parallel() {
3316 #ifndef PRODUCT
3317   if (_vector_loop_debug) {
3318     tty->print_cr("SuperWord::pack_parallel: START");
3319   }
3320 #endif
3321 
3322   _packset.clear();
3323 
3324   for (int ii = 0; ii < _iteration_first.length(); ii++) {
3325     Node* nd = _iteration_first.at(ii);
3326     if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
3327       Node_List* pk = new Node_List();
3328       pk->push(nd);
3329       for (int gen = 1; gen < _ii_order.length(); ++gen) {
3330         for (int kk = 0; kk < _block.length(); kk++) {
3331           Node* clone = _block.at(kk);
3332           if (_clone_map.idx(clone->_idx) == _clone_map.idx(nd->_idx) &&
3333               _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
3334             if (nd->is_Add() || nd->is_Mul()) {
3335               fix_commutative_inputs(nd, clone);
3336             }
3337             pk->push(clone);
3338             if (pk->size() == 4) {
3339               _packset.append(pk);
3340 #ifndef PRODUCT
3341               if (_vector_loop_debug) {
3342                 tty->print_cr("SuperWord::pack_parallel: added pack ");
3343                 pk->dump();
3344               }
3345 #endif
3346               if (_clone_map.gen(clone->_idx) != _ii_last) {
3347                 pk = new Node_List();
3348               }
3349             }
3350             break;
3351           }
3352         }
3353       }//for
3354     }//if
3355   }//for
3356 
3357 #ifndef PRODUCT
3358   if (_vector_loop_debug) {
3359     tty->print_cr("SuperWord::pack_parallel: END");
3360   }
3361 #endif
3362 
3363   return true;
3364 }
3365 
3366 bool SuperWord::hoist_loads_in_graph() {
3367   GrowableArray<Node*> loads;
3368 
3369 #ifndef PRODUCT
3370   if (_vector_loop_debug) {
3371     tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
3372   }
3373 #endif
3374 
3375   for (int i = 0; i < _mem_slice_head.length(); i++) {
3376     Node* n = _mem_slice_head.at(i);
3377     if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
3378 #ifndef PRODUCT
3379       if (TraceSuperWord && Verbose) {
3380         tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
3381       }
3382 #endif
3383       continue;
3384     }
3385 
3386 #ifndef PRODUCT
3387     if (_vector_loop_debug) {
3388       tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d  = _mem_slice_head.at(%d);", n->_idx, i);
3389     }
3390 #endif
3391 
3392     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
3393       Node* ld = n->fast_out(i);
3394       if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
3395         for (int i = 0; i < _block.length(); i++) {
3396           Node* ld2 = _block.at(i);
3397           if (ld2->is_Load() &&
3398               _clone_map.idx(ld->_idx) == _clone_map.idx(ld2->_idx) &&
3399               _clone_map.gen(ld->_idx) != _clone_map.gen(ld2->_idx)) { // <= do not collect the first generation ld
3400 #ifndef PRODUCT
3401             if (_vector_loop_debug) {
3402               tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
3403                             ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
3404             }
3405 #endif
3406             // could not do on-the-fly, since iterator is immutable
3407             loads.push(ld2);
3408           }
3409         }// for
3410       }//if
3411     }//for (DUIterator_Fast imax,
3412   }//for (int i = 0; i
3413 
3414   for (int i = 0; i < loads.length(); i++) {
3415     LoadNode* ld = loads.at(i)->as_Load();
3416     Node* phi = find_phi_for_mem_dep(ld);
3417     if (phi != NULL) {
3418 #ifndef PRODUCT
3419       if (_vector_loop_debug) {
3420         tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
3421                       MemNode::Memory, ld->_idx, phi->_idx);
3422       }
3423 #endif
3424       _igvn.replace_input_of(ld, MemNode::Memory, phi);
3425     }
3426   }//for
3427 
3428   restart(); // invalidate all basic structures, since we rebuilt the graph
3429 
3430 #ifndef PRODUCT
3431   if (TraceSuperWord && Verbose) {
3432     tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
3433   }
3434 #endif
3435   return true;
3436 }
3437