1 /*
   2  * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "compiler/compileLog.hpp"
  26 #include "libadt/vectset.hpp"
  27 #include "memory/allocation.inline.hpp"
  28 #include "opto/addnode.hpp"
  29 #include "opto/callnode.hpp"
  30 #include "opto/castnode.hpp"
  31 #include "opto/convertnode.hpp"
  32 #include "opto/divnode.hpp"
  33 #include "opto/matcher.hpp"
  34 #include "opto/memnode.hpp"
  35 #include "opto/mulnode.hpp"
  36 #include "opto/opcodes.hpp"
  37 #include "opto/opaquenode.hpp"
  38 #include "opto/superword.hpp"
  39 #include "opto/vectornode.hpp"
  40 #include "opto/movenode.hpp"
  41 
  42 //
  43 //                  S U P E R W O R D   T R A N S F O R M
  44 //=============================================================================
  45 
  46 //------------------------------SuperWord---------------------------
  47 SuperWord::SuperWord(PhaseIdealLoop* phase) :
  48   _phase(phase),
  49   _igvn(phase->_igvn),
  50   _arena(phase->C->comp_arena()),
  51   _packset(arena(), 8,  0, NULL),         // packs for the current block
  52   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  53   _block(arena(), 8,  0, NULL),           // nodes in current block
  54   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  55   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  56   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  57   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  58   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  59   _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
  60   _align_to_ref(NULL),                    // memory reference to align vectors to
  61   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  62   _dg(_arena),                            // dependence graph
  63   _visited(arena()),                      // visited node set
  64   _post_visited(arena()),                 // post visited node set
  65   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  66   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  67   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  68   _lpt(NULL),                             // loop tree node
  69   _lp(NULL),                              // LoopNode
  70   _bb(NULL),                              // basic block
  71   _iv(NULL),                              // induction var
  72   _race_possible(false),                  // cases where SDMU is true
  73   _early_return(true),                    // analysis evaluations routine
  74   _num_work_vecs(0),                      // amount of vector work we have
  75   _num_reductions(0),                     // amount of reduction work we have
  76   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  77   _do_reserve_copy(DoReserveCopyInSuperWord),
  78   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  79   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  80   _ii_order(arena(), 8, 0, 0)
  81 {
  82 #ifndef PRODUCT
  83   _vector_loop_debug = 0;
  84   if (_phase->C->method() != NULL) {
  85     _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  86   }
  87 
  88 #endif
  89 }
  90 
  91 //------------------------------transform_loop---------------------------
  92 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  93   assert(UseSuperWord, "should be");
  94   // Do vectors exist on this architecture?
  95   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  96 
  97   assert(lpt->_head->is_CountedLoop(), "must be");
  98   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
  99 
 100   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 101 
 102   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
 103   // Check for no control flow in body (other than exit)
 104   Node *cl_exit = cl->loopexit();
 105   if (cl_exit->in(0) != lpt->_head) {
 106     #ifndef PRODUCT
 107       if (TraceSuperWord) {
 108         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
 109         tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
 110         tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
 111         tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
 112         lpt->dump_head();
 113       }
 114     #endif
 115     return;
 116   }
 117 
 118   // Make sure there are no extra control users of the loop backedge
 119   if (cl->back_control()->outcnt() != 1) {
 120     return;
 121   }
 122 
 123   // We only re-enter slp when we have vector mapped a queried loop and we want
 124   // to continue unrolling; in this case, slp is not subsequently done.
 125   if (cl->do_unroll_only()) return;
 126 
 127   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 128   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 129   if (pre_end == NULL) return;
 130   Node *pre_opaq1 = pre_end->limit();
 131   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 132 
 133   init(); // initialize data structures
 134 
 135   set_lpt(lpt);
 136   set_lp(cl);
 137 
 138   // For now, define one block which is the entire loop body
 139   set_bb(cl);
 140 
 141   if (do_optimization) {
 142     assert(_packset.length() == 0, "packset must be empty");
 143     SLP_extract();
 144   }
 145 }
 146 
 147 //------------------------------early unrolling analysis------------------------------
 148 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
 149   bool is_slp = true;
 150   ResourceMark rm;
 151   size_t ignored_size = lpt()->_body.size();
 152   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 153   Node_Stack nstack((int)ignored_size);
 154   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 155   Node *cl_exit = cl->loopexit();
 156 
 157   // First clear the entries
 158   for (uint i = 0; i < lpt()->_body.size(); i++) {
 159     ignored_loop_nodes[i] = -1;
 160   }
 161 
 162   int max_vector = Matcher::max_vector_size(T_INT);
 163 
 164   // Process the loop. Some/all of the stack entries will not be in order, so we
 165   // need to preprocess the ignored initial state before we process the loop.
 166   for (uint i = 0; i < lpt()->_body.size(); i++) {
 167     Node* n = lpt()->_body.at(i);
 168     if (n == cl->incr() ||
 169       n->is_reduction() ||
 170       n->is_AddP() ||
 171       n->is_Cmp() ||
 172       n->is_IfTrue() ||
 173       n->is_CountedLoop() ||
 174       (n == cl_exit)) {
 175       ignored_loop_nodes[i] = n->_idx;
 176       continue;
 177     }
 178 
 179     if (n->is_If()) {
 180       IfNode *iff = n->as_If();
 181       if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
 182         if (lpt()->is_loop_exit(iff)) {
 183           ignored_loop_nodes[i] = n->_idx;
 184           continue;
 185         }
 186       }
 187     }
 188 
 189     if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
 190       Node* n_tail = n->in(LoopNode::LoopBackControl);
 191       if (n_tail != n->in(LoopNode::EntryControl)) {
 192         if (!n_tail->is_Mem()) {
 193           is_slp = false;
 194           break;
 195         }
 196       }
 197     }
 198 
 199     // This must happen after check of phi/if
 200     if (n->is_Phi() || n->is_If()) {
 201       ignored_loop_nodes[i] = n->_idx;
 202       continue;
 203     }
 204 
 205     if (n->is_LoadStore() || n->is_MergeMem() ||
 206       (n->is_Proj() && !n->as_Proj()->is_CFG())) {
 207       is_slp = false;
 208       break;
 209     }
 210 
 211     // Ignore nodes with non-primitive type.
 212     BasicType bt;
 213     if (n->is_Mem()) {
 214       bt = n->as_Mem()->memory_type();
 215     } else {
 216       bt = n->bottom_type()->basic_type();
 217     }
 218     if (is_java_primitive(bt) == false) {
 219       ignored_loop_nodes[i] = n->_idx;
 220       continue;
 221     }
 222 
 223     if (n->is_Mem()) {
 224       MemNode* current = n->as_Mem();
 225       Node* adr = n->in(MemNode::Address);
 226       Node* n_ctrl = _phase->get_ctrl(adr);
 227 
 228       // save a queue of post process nodes
 229       if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
 230         // Process the memory expression
 231         int stack_idx = 0;
 232         bool have_side_effects = true;
 233         if (adr->is_AddP() == false) {
 234           nstack.push(adr, stack_idx++);
 235         } else {
 236           // Mark the components of the memory operation in nstack
 237           SWPointer p1(current, this, &nstack, true);
 238           have_side_effects = p1.node_stack()->is_nonempty();
 239         }
 240 
 241         // Process the pointer stack
 242         while (have_side_effects) {
 243           Node* pointer_node = nstack.node();
 244           for (uint j = 0; j < lpt()->_body.size(); j++) {
 245             Node* cur_node = lpt()->_body.at(j);
 246             if (cur_node == pointer_node) {
 247               ignored_loop_nodes[j] = cur_node->_idx;
 248               break;
 249             }
 250           }
 251           nstack.pop();
 252           have_side_effects = nstack.is_nonempty();
 253         }
 254       }
 255     }
 256   }
 257 
 258   if (is_slp) {
 259     // Now we try to find the maximum supported consistent vector which the machine
 260     // description can use
 261     for (uint i = 0; i < lpt()->_body.size(); i++) {
 262       if (ignored_loop_nodes[i] != -1) continue;
 263 
 264       BasicType bt;
 265       Node* n = lpt()->_body.at(i);
 266       if (n->is_Mem()) {
 267         bt = n->as_Mem()->memory_type();
 268       } else {
 269         bt = n->bottom_type()->basic_type();
 270       }
 271       if (is_java_primitive(bt) == false) continue;
 272 
 273       int cur_max_vector = Matcher::max_vector_size(bt);
 274 
 275       // If a max vector exists which is not larger than _local_loop_unroll_factor,
 276       // stop looking; we already have the max vector to map to.
 277       if (cur_max_vector < local_loop_unroll_factor) {
 278         is_slp = false;
 279         NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis fails: unroll limit greater than max vector\n"));
 280         break;
 281       }
 282 
 283       // Map the maximal common vector
 284       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 285         if (cur_max_vector < max_vector) {
 286           max_vector = cur_max_vector;
 287         }
 288       }
 289     }
 290     if (is_slp) {
 291       local_loop_unroll_factor = max_vector;
 292       cl->mark_passed_slp();
 293     }
 294     cl->mark_was_slp();
 295     cl->set_slp_max_unroll(local_loop_unroll_factor);
 296   }
 297 }
 298 
 299 //------------------------------SLP_extract---------------------------
 300 // Extract the superword level parallelism
 301 //
 302 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 303 //    this list from first to last, all definitions are visited before their uses.
 304 //
 305 // 2) A point-to-point dependence graph is constructed between memory references.
 306 //    This simplifies the upcoming "independence" checker.
 307 //
 308 // 3) The maximum depth in the node graph from the beginning of the block
 309 //    to each node is computed.  This is used to prune the graph search
 310 //    in the independence checker.
 311 //
 312 // 4) For integer types, the necessary bit width is propagated backwards
 313 //    from stores to allow packed operations on byte, char, and short
 314 //    integers.  This reverses the promotion to type "int" that javac
 315 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 316 //
 317 // 5) One of the memory references is picked to be an aligned vector reference.
 318 //    The pre-loop trip count is adjusted to align this reference in the
 319 //    unrolled body.
 320 //
 321 // 6) The initial set of pack pairs is seeded with memory references.
 322 //
 323 // 7) The set of pack pairs is extended by following use->def and def->use links.
 324 //
 325 // 8) The pairs are combined into vector sized packs.
 326 //
 327 // 9) Reorder the memory slices to co-locate members of the memory packs.
 328 //
 329 // 10) Generate ideal vector nodes for the final set of packs, inserting scalar
 330 //    promotion, vector creation from multiple scalars, and extraction of scalar
 331 //    values from vectors where necessary.
 332 //
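     // As an illustration (not part of the original comment), a simple loop like
     //
     //   for (int i = 0; i < n; i++) { a[i] = b[i] + c[i]; }
     //
     // yields, after sufficient unrolling, adjacent loads and stores that are
     // paired in step 6, extended through the adds in step 7, combined into
     // vector-sized packs in step 8, and finally emitted as vector ideal nodes
     // (e.g. LoadVector/AddVI/StoreVector) in step 10.
     //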
 333 void SuperWord::SLP_extract() {
 334 
 335 #ifndef PRODUCT
 336   if (_do_vector_loop && TraceSuperWord) {
 337     tty->print("SuperWord::SLP_extract\n");
 338     tty->print("input loop\n");
 339     _lpt->dump_head();
 340     _lpt->dump();
 341     for (uint i = 0; i < _lpt->_body.size(); i++) {
 342       _lpt->_body.at(i)->dump();
 343     }
 344   }
 345 #endif
 346   // Ready the block
 347   if (!construct_bb()) {
 348     return; // Exit if no interesting nodes or complex graph.
 349   }
 350   // build    _dg, _disjoint_ptrs
 351   dependence_graph();
 352 
 353   // compute function depth(Node*)
 354   compute_max_depth();
 355 
 356   if (_do_vector_loop) {
 357     if (mark_generations() != -1) {
 358       hoist_loads_in_graph(); // this only rebuilds the graph; all basic structures need to be rebuilt explicitly
 359 
 360       if (!construct_bb()) {
 361         return; // Exit if no interesting nodes or complex graph.
 362       }
 363       dependence_graph();
 364       compute_max_depth();
 365     }
 366 
 367 #ifndef PRODUCT
 368     if (TraceSuperWord) {
 369       tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
 370       _lpt->dump_head();
 371       for (int j = 0; j < _block.length(); j++) {
 372         Node* n = _block.at(j);
 373         int d = depth(n);
 374         for (int i = 0;  i < d; i++) tty->print("%s", "  ");
 375         tty->print("%d :", d);
 376         n->dump();
 377       }
 378     }
 379 #endif
 380   }
 381 
 382   compute_vector_element_type();
 383 
 384   // Attempt vectorization
 385 
 386   find_adjacent_refs();
 387 
 388   extend_packlist();
 389 
 390   if (_do_vector_loop) {
 391     if (_packset.length() == 0) {
 392 #ifndef PRODUCT
 393       if (TraceSuperWord) {
 394         tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
 395       }
 396 #endif
 397       pack_parallel();
 398     }
 399   }
 400 
 401   combine_packs();
 402 
 403   construct_my_pack_map();
 404 
 405   if (_do_vector_loop) {
 406     merge_packs_to_cmovd();
 407   }
 408 
 409   filter_packs();
 410 
 411   schedule();
 412 
 413   output();
 414 }
 415 
 416 //------------------------------find_adjacent_refs---------------------------
 417 // Find the adjacent memory references and create pack pairs for them.
 418 // This is the initial set of packs that will then be extended by
 419 // following use->def and def->use links.  The align positions are
 420 // assigned relative to the reference "align_to_ref"
 421 void SuperWord::find_adjacent_refs() {
 422   // Get list of memory operations
 423   Node_List memops;
 424   for (int i = 0; i < _block.length(); i++) {
 425     Node* n = _block.at(i);
 426     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 427         is_java_primitive(n->as_Mem()->memory_type())) {
 428       int align = memory_alignment(n->as_Mem(), 0);
 429       if (align != bottom_align) {
 430         memops.push(n);
 431       }
 432     }
 433   }
 434 
 435   Node_List align_to_refs;
 436   int best_iv_adjustment = 0;
 437   MemNode* best_align_to_mem_ref = NULL;
 438 
 439   while (memops.size() != 0) {
 440     // Find a memory reference to align to.
 441     MemNode* mem_ref = find_align_to_ref(memops);
 442     if (mem_ref == NULL) break;
 443     align_to_refs.push(mem_ref);
 444     int iv_adjustment = get_iv_adjustment(mem_ref);
 445 
 446     if (best_align_to_mem_ref == NULL) {
 447       // Set memory reference which is the best from all memory operations
 448       // to be used for alignment. The pre-loop trip count is modified to align
 449       // this reference to a vector-aligned address.
 450       best_align_to_mem_ref = mem_ref;
 451       best_iv_adjustment = iv_adjustment;
 452       NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
 453     }
 454 
 455     SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 456     // Set alignment relative to "align_to_ref" for all related memory operations.
 457     for (int i = memops.size() - 1; i >= 0; i--) {
 458       MemNode* s = memops.at(i)->as_Mem();
 459       if (isomorphic(s, mem_ref) &&
 460            (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
 461         SWPointer p2(s, this, NULL, false);
 462         if (p2.comparable(align_to_ref_p)) {
 463           int align = memory_alignment(s, iv_adjustment);
 464           set_alignment(s, align);
 465         }
 466       }
 467     }
 468 
 469     // Create initial pack pairs of memory operations for which
 470     // alignment is set and vectors will be aligned.
 471     bool create_pack = true;
 472     if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
 473       if (!Matcher::misaligned_vectors_ok()) {
 474         int vw = vector_width(mem_ref);
 475         int vw_best = vector_width(best_align_to_mem_ref);
 476         if (vw > vw_best) {
 477           // Do not vectorize a memory access with more elements per vector
 478           // if unaligned memory accesses are not allowed, because the number of
 479           // iterations in the pre-loop will not be enough to align it.
 480           create_pack = false;
 481         } else {
 482           SWPointer p2(best_align_to_mem_ref, this, NULL, false);
 483           if (align_to_ref_p.invar() != p2.invar()) {
 484             // Do not vectorize memory accesses with different invariants
 485             // if unaligned memory accesses are not allowed.
 486             create_pack = false;
 487           }
 488         }
 489       }
 490     } else {
 491       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 492         // Can't allow vectorization of unaligned memory accesses with the
 493         // same type since it could be overlapped accesses to the same array.
 494         create_pack = false;
 495       } else {
 496         // Allow independent (different type) unaligned memory operations
 497         // if HW supports them.
 498         if (!Matcher::misaligned_vectors_ok()) {
 499           create_pack = false;
 500         } else {
 501           // Check if packs of the same memory type but
 502           // with a different alignment were created before.
 503           for (uint i = 0; i < align_to_refs.size(); i++) {
 504             MemNode* mr = align_to_refs.at(i)->as_Mem();
 505             if (same_velt_type(mr, mem_ref) &&
 506                 memory_alignment(mr, iv_adjustment) != 0)
 507               create_pack = false;
 508           }
 509         }
 510       }
 511     }
 512     if (create_pack) {
 513       for (uint i = 0; i < memops.size(); i++) {
 514         Node* s1 = memops.at(i);
 515         int align = alignment(s1);
 516         if (align == top_align) continue;
 517         for (uint j = 0; j < memops.size(); j++) {
 518           Node* s2 = memops.at(j);
 519           if (alignment(s2) == top_align) continue;
 520           if (s1 != s2 && are_adjacent_refs(s1, s2)) {
 521             if (stmts_can_pack(s1, s2, align)) {
 522               Node_List* pair = new Node_List();
 523               pair->push(s1);
 524               pair->push(s2);
 525               if (!_do_vector_loop || same_origin_idx(s1, s2)) {
 526                 _packset.append(pair);
 527               }
 528             }
 529           }
 530         }
 531       }
 532     } else { // Don't create unaligned pack
 533       // First, remove remaining memory ops of the same type from the list.
 534       for (int i = memops.size() - 1; i >= 0; i--) {
 535         MemNode* s = memops.at(i)->as_Mem();
 536         if (same_velt_type(s, mem_ref)) {
 537           memops.remove(i);
 538         }
 539       }
 540 
 541       // Second, remove already constructed packs of the same type.
 542       for (int i = _packset.length() - 1; i >= 0; i--) {
 543         Node_List* p = _packset.at(i);
 544         MemNode* s = p->at(0)->as_Mem();
 545         if (same_velt_type(s, mem_ref)) {
 546           remove_pack_at(i);
 547         }
 548       }
 549 
 550       // If needed find the best memory reference for loop alignment again.
 551       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 552         // Put memory ops from remaining packs back on memops list for
 553         // the best alignment search.
 554         uint orig_msize = memops.size();
 555         for (int i = 0; i < _packset.length(); i++) {
 556           Node_List* p = _packset.at(i);
 557           MemNode* s = p->at(0)->as_Mem();
 558           assert(!same_velt_type(s, mem_ref), "sanity");
 559           memops.push(s);
 560         }
 561         best_align_to_mem_ref = find_align_to_ref(memops);
 562         if (best_align_to_mem_ref == NULL) {
 563           NOT_PRODUCT(if (TraceSuperWord) tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");)
 564           break;
 565         }
 566         best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
 567         NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
 568         // Restore list.
 569         while (memops.size() > orig_msize)
 570           (void)memops.pop();
 571       }
 572     } // unaligned memory accesses
 573 
 574     // Remove used mem nodes.
 575     for (int i = memops.size() - 1; i >= 0; i--) {
 576       MemNode* m = memops.at(i)->as_Mem();
 577       if (alignment(m) != top_align) {
 578         memops.remove(i);
 579       }
 580     }
 581 
 582   } // while (memops.size() != 0
 583   set_align_to_ref(best_align_to_mem_ref);
 584 
 585 #ifndef PRODUCT
 586   if (TraceSuperWord) {
 587     tty->print_cr("\nAfter find_adjacent_refs");
 588     print_packset();
 589   }
 590 #endif
 591 }
 592 
 593 #ifndef PRODUCT
 594 void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
 595   if (is_trace_adjacent()) {
 596     tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
 597        best_align_to_mem_ref->_idx, best_iv_adjustment);
 598        best_align_to_mem_ref->dump();
 599   }
 600 }
 601 #endif
 602 
 603 //------------------------------find_align_to_ref---------------------------
 604 // Find a memory reference to align the loop induction variable to.
 605 // Looks first at stores, then at loads, choosing the memory reference
 606 // with the largest number of references similar to it.
 607 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 608   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 609 
 610   // Count number of comparable memory ops
 611   for (uint i = 0; i < memops.size(); i++) {
 612     MemNode* s1 = memops.at(i)->as_Mem();
 613     SWPointer p1(s1, this, NULL, false);
 614     // Discard if pre loop can't align this reference
 615     if (!ref_is_alignable(p1)) {
 616       *cmp_ct.adr_at(i) = 0;
 617       continue;
 618     }
 619     for (uint j = i+1; j < memops.size(); j++) {
 620       MemNode* s2 = memops.at(j)->as_Mem();
 621       if (isomorphic(s1, s2)) {
 622         SWPointer p2(s2, this, NULL, false);
 623         if (p1.comparable(p2)) {
 624           (*cmp_ct.adr_at(i))++;
 625           (*cmp_ct.adr_at(j))++;
 626         }
 627       }
 628     }
 629   }
 630 
 631   // Find Store (or Load) with the greatest number of "comparable" references,
 632   // biggest vector size, smallest data size and smallest iv offset.
 633   int max_ct        = 0;
 634   int max_vw        = 0;
 635   int max_idx       = -1;
 636   int min_size      = max_jint;
 637   int min_iv_offset = max_jint;
 638   for (uint j = 0; j < memops.size(); j++) {
 639     MemNode* s = memops.at(j)->as_Mem();
 640     if (s->is_Store()) {
 641       int vw = vector_width_in_bytes(s);
 642       assert(vw > 1, "sanity");
 643       SWPointer p(s, this, NULL, false);
 644       if (cmp_ct.at(j) >  max_ct ||
 645           (cmp_ct.at(j) == max_ct &&
 646             (vw >  max_vw ||
 647              (vw == max_vw &&
 648               (data_size(s) <  min_size ||
 649                (data_size(s) == min_size &&
 650                  (p.offset_in_bytes() < min_iv_offset))))))) {
 651         max_ct = cmp_ct.at(j);
 652         max_vw = vw;
 653         max_idx = j;
 654         min_size = data_size(s);
 655         min_iv_offset = p.offset_in_bytes();
 656       }
 657     }
 658   }
 659   // If no stores, look at loads
 660   if (max_ct == 0) {
 661     for (uint j = 0; j < memops.size(); j++) {
 662       MemNode* s = memops.at(j)->as_Mem();
 663       if (s->is_Load()) {
 664         int vw = vector_width_in_bytes(s);
 665         assert(vw > 1, "sanity");
 666         SWPointer p(s, this, NULL, false);
 667         if (cmp_ct.at(j) >  max_ct ||
 668             (cmp_ct.at(j) == max_ct &&
 669               (vw >  max_vw ||
 670                (vw == max_vw &&
 671                 (data_size(s) <  min_size ||
 672                  (data_size(s) == min_size &&
 673                    (p.offset_in_bytes() < min_iv_offset))))))) {
 674           max_ct = cmp_ct.at(j);
 675           max_vw = vw;
 676           max_idx = j;
 677           min_size = data_size(s);
 678           min_iv_offset = p.offset_in_bytes();
 679         }
 680       }
 681     }
 682   }
 683 
 684 #ifdef ASSERT
 685   if (TraceSuperWord && Verbose) {
 686     tty->print_cr("\nVector memops after find_align_to_ref");
 687     for (uint i = 0; i < memops.size(); i++) {
 688       MemNode* s = memops.at(i)->as_Mem();
 689       s->dump();
 690     }
 691   }
 692 #endif
 693 
 694   if (max_ct > 0) {
 695 #ifdef ASSERT
 696     if (TraceSuperWord) {
 697       tty->print("\nVector align to node: ");
 698       memops.at(max_idx)->as_Mem()->dump();
 699     }
 700 #endif
 701     return memops.at(max_idx)->as_Mem();
 702   }
 703   return NULL;
 704 }
 705 
 706 //------------------------------ref_is_alignable---------------------------
 707 // Can the preloop align the reference to position zero in the vector?
 708 bool SuperWord::ref_is_alignable(SWPointer& p) {
 709   if (!p.has_iv()) {
 710     return true;   // no induction variable
 711   }
 712   CountedLoopEndNode* pre_end = get_pre_loop_end(lp()->as_CountedLoop());
 713   assert(pre_end != NULL, "we must have a correct pre-loop");
 714   assert(pre_end->stride_is_con(), "pre loop stride is constant");
 715   int preloop_stride = pre_end->stride_con();
 716 
 717   int span = preloop_stride * p.scale_in_bytes();
 718   int mem_size = p.memory_size();
 719   int offset   = p.offset_in_bytes();
 720   // Stride one accesses are alignable if offset is aligned to memory operation size.
 721   // Offset can be unaligned when UseUnalignedAccesses is used.
 722   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
 723     return true;
 724   }
 725   // If the initial offset from start of the object is computable,
 726   // check if the pre-loop can align the final offset accordingly.
 727   //
 728   // In other words: Can we find an i such that the offset
 729   // after i pre-loop iterations is aligned to vw?
 730   //   (init_offset + pre_loop) % vw == 0              (1)
 731   // where
 732   //   pre_loop = i * span
 733   // is the number of bytes added to the offset by i pre-loop iterations.
 734   //
 735   // For this to hold we need pre_loop to increase init_offset by
 736   //   pre_loop = vw - (init_offset % vw)
 737   //
 738   // This is only possible if pre_loop is divisible by span because each
 739   // pre-loop iteration increases the initial offset by 'span' bytes:
 740   //   (vw - (init_offset % vw)) % span == 0
 741   //
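       // A small worked example with illustrative numbers (not from the original
       // comment): vw = 16, span = 4 and init_offset = 28 give
       //   (vw - (init_offset % vw)) % span == (16 - 12) % 4 == 0,
       // so a single pre-loop iteration moves the offset to 32, a multiple of vw.
       //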
 742   int vw = vector_width_in_bytes(p.mem());
 743   assert(vw > 1, "sanity");
 744   Node* init_nd = pre_end->init_trip();
 745   if (init_nd->is_Con() && p.invar() == NULL) {
 746     int init = init_nd->bottom_type()->is_int()->get_con();
 747     int init_offset = init * p.scale_in_bytes() + offset;
 748     assert(init_offset >= 0, "positive offset from object start");
 749     if (vw % span == 0) {
 750       // If vw is a multiple of span, we use formula (1).
 751       if (span > 0) {
 752         return (vw - (init_offset % vw)) % span == 0;
 753       } else {
 754         assert(span < 0, "nonzero stride * scale");
 755         return (init_offset % vw) % -span == 0;
 756       }
 757     } else if (span % vw == 0) {
 758       // If span is a multiple of vw, we can simplify formula (1) to:
 759       //   (init_offset + i * span) % vw == 0
 760       //     =>
 761       //   (init_offset % vw) + ((i * span) % vw) == 0
 762       //     =>
 763       //   init_offset % vw == 0
 764       //
 765       // Because we add a multiple of vw to the initial offset, the final
 766       // offset is a multiple of vw if and only if init_offset is a multiple.
 767       //
 768       return (init_offset % vw) == 0;
 769     }
 770   }
 771   return false;
 772 }
 773 
 774 //---------------------------get_iv_adjustment---------------------------
 775 // Calculate the loop's iv adjustment for this memory op.
 776 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
 777   SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 778   int offset = align_to_ref_p.offset_in_bytes();
 779   int scale  = align_to_ref_p.scale_in_bytes();
 780   int elt_size = align_to_ref_p.memory_size();
 781   int vw       = vector_width_in_bytes(mem_ref);
 782   assert(vw > 1, "sanity");
 783   int iv_adjustment;
 784   if (scale != 0) {
 785     int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
 786     // At least one iteration is executed in the pre-loop by default. As a result,
 787     // several iterations are needed to align memory operations in the main-loop
 788     // even if offset is 0.
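         // Illustrative numbers (an assumption, not from the original comment):
         // for an int array with scale = 4, iv_stride() = 1, vw = 16 and offset = 4,
         // iv_adjustment_in_bytes = 1*16 - (4 % 16) = 12, i.e. 3 iterations.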
 789     int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
 790     assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
 791            "(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
 792     iv_adjustment = iv_adjustment_in_bytes/elt_size;
 793   } else {
 794     // This memory op is not dependent on iv (scale == 0)
 795     iv_adjustment = 0;
 796   }
 797 
 798 #ifndef PRODUCT
 799   if (TraceSuperWord) {
 800     tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
 801       mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 802     mem_ref->dump();
 803   }
 804 #endif
 805   return iv_adjustment;
 806 }
 807 
 808 //---------------------------dependence_graph---------------------------
 809 // Construct dependency graph.
 810 // Add dependence edges to load/store nodes for memory dependence
 811 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 812 void SuperWord::dependence_graph() {
 813   // First, assign a dependence node to each memory node
 814   for (int i = 0; i < _block.length(); i++ ) {
 815     Node *n = _block.at(i);
 816     if (n->is_Mem() || (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
 817       _dg.make_node(n);
 818     }
 819   }
 820 
 821   // For each memory slice, create the dependences
 822   for (int i = 0; i < _mem_slice_head.length(); i++) {
 823     Node* n      = _mem_slice_head.at(i);
 824     Node* n_tail = _mem_slice_tail.at(i);
 825 
 826     // Get slice in predecessor order (last is first)
 827     mem_slice_preds(n_tail, n, _nlist);
 828 
 829 #ifndef PRODUCT
 830     if(TraceSuperWord && Verbose) {
 831       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
 832       for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 833         _nlist.at(j)->dump();
 834       }
 835     }
 836 #endif
 837     // Make the slice dependent on the root
 838     DepMem* slice = _dg.dep(n);
 839     _dg.make_edge(_dg.root(), slice);
 840 
 841     // Create a sink for the slice
 842     DepMem* slice_sink = _dg.make_node(NULL);
 843     _dg.make_edge(slice_sink, _dg.tail());
 844 
 845     // Now visit each pair of memory ops, creating the edges
 846     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 847       Node* s1 = _nlist.at(j);
 848 
 849       // If no dependency yet, use slice
 850       if (_dg.dep(s1)->in_cnt() == 0) {
 851         _dg.make_edge(slice, s1);
 852       }
 853       SWPointer p1(s1->as_Mem(), this, NULL, false);
 854       bool sink_dependent = true;
 855       for (int k = j - 1; k >= 0; k--) {
 856         Node* s2 = _nlist.at(k);
 857         if (s1->is_Load() && s2->is_Load())
 858           continue;
 859         SWPointer p2(s2->as_Mem(), this, NULL, false);
 860 
 861         int cmp = p1.cmp(p2);
 862         if (SuperWordRTDepCheck &&
 863             p1.base() != p2.base() && p1.valid() && p2.valid()) {
 864           // Create a runtime check to disambiguate
 865           OrderedPair pp(p1.base(), p2.base());
 866           _disjoint_ptrs.append_if_missing(pp);
 867         } else if (!SWPointer::not_equal(cmp)) {
 868           // Possibly same address
 869           _dg.make_edge(s1, s2);
 870           sink_dependent = false;
 871         }
 872       }
 873       if (sink_dependent) {
 874         _dg.make_edge(s1, slice_sink);
 875       }
 876     }
 877 #ifndef PRODUCT
 878     if (TraceSuperWord) {
 879       tty->print_cr("\nDependence graph for slice: %d", n->_idx);
 880       for (int q = 0; q < _nlist.length(); q++) {
 881         _dg.print(_nlist.at(q));
 882       }
 883       tty->cr();
 884     }
 885 #endif
 886     _nlist.clear();
 887   }
 888 
 889 #ifndef PRODUCT
 890   if (TraceSuperWord) {
 891     tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
 892     for (int r = 0; r < _disjoint_ptrs.length(); r++) {
 893       _disjoint_ptrs.at(r).print();
 894       tty->cr();
 895     }
 896     tty->cr();
 897   }
 898 #endif
 899 }
 900 
 901 //---------------------------mem_slice_preds---------------------------
 902 // Return a memory slice (node list) in predecessor order starting at "start"
 903 void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
 904   assert(preds.length() == 0, "start empty");
 905   Node* n = start;
 906   Node* prev = NULL;
 907   while (true) {
 908     NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
 909     assert(in_bb(n), "must be in block");
 910     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
 911       Node* out = n->fast_out(i);
 912       if (out->is_Load()) {
 913         if (in_bb(out)) {
 914           preds.push(out);
 915           NOT_PRODUCT(if (TraceSuperWord && Verbose) tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);)
 916         }
 917       } else {
 918         // FIXME
 919         if (out->is_MergeMem() && !in_bb(out)) {
 920           // Either unrolling is causing a memory edge not to disappear,
 921           // or need to run igvn.optimize() again before SLP
 922         } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
 923           // Ditto.  Not sure what else to check further.
 924         } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
 925           // StoreCM has an input edge used as a precedence edge.
 926           // Maybe an issue when oop stores are vectorized.
 927         } else {
 928           assert(out == prev || prev == NULL, "no branches off of store slice");
 929         }
 930       }//else
 931     }//for
 932     if (n == stop) break;
 933     preds.push(n);
 934     NOT_PRODUCT(if (TraceSuperWord && Verbose) tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);)
 935     prev = n;
 936     assert(n->is_Mem(), "unexpected node %s", n->Name());
 937     n = n->in(MemNode::Memory);
 938   }
 939 }
 940 
 941 //------------------------------stmts_can_pack---------------------------
 942 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
 943 // s1 aligned at "align"
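     // For example (illustrative): two adjacent 4-byte stores can pack when
     // alignment(s1) == align and alignment(s2) == align + data_size(s1),
     // e.g. alignments 0 and 4.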
 944 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
 945 
 946   // Do not use superword for non-primitives
 947   BasicType bt1 = velt_basic_type(s1);
 948   BasicType bt2 = velt_basic_type(s2);
 949   if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
 950     return false;
 951   if (Matcher::max_vector_size(bt1) < 2) {
 952     return false; // No vectors for this type
 953   }
 954 
 955   if (isomorphic(s1, s2)) {
 956     if (independent(s1, s2) || reduction(s1, s2)) {
 957       if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
 958         if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
 959           int s1_align = alignment(s1);
 960           int s2_align = alignment(s2);
 961           if (s1_align == top_align || s1_align == align) {
 962             if (s2_align == top_align || s2_align == align + data_size(s1)) {
 963               return true;
 964             }
 965           }
 966         }
 967       }
 968     }
 969   }
 970   return false;
 971 }
 972 
 973 //------------------------------exists_at---------------------------
 974 // Does s exist in a pack at position pos?
 975 bool SuperWord::exists_at(Node* s, uint pos) {
 976   for (int i = 0; i < _packset.length(); i++) {
 977     Node_List* p = _packset.at(i);
 978     if (p->at(pos) == s) {
 979       return true;
 980     }
 981   }
 982   return false;
 983 }
 984 
 985 //------------------------------are_adjacent_refs---------------------------
 986 // Is s1 immediately before s2 in memory?
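     // For example (illustrative): two int loads from a[i] and a[i+1] have
     // offsets that differ by exactly data_size(s1) == 4 bytes.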
 987 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
 988   if (!s1->is_Mem() || !s2->is_Mem()) return false;
 989   if (!in_bb(s1)    || !in_bb(s2))    return false;
 990 
 991   // Do not use superword for non-primitives
 992   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
 993       !is_java_primitive(s2->as_Mem()->memory_type())) {
 994     return false;
 995   }
 996 
 997   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
 998   // only pack memops that are in the same alias set until that's fixed.
 999   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
1000       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
1001     return false;
1002   SWPointer p1(s1->as_Mem(), this, NULL, false);
1003   SWPointer p2(s2->as_Mem(), this, NULL, false);
1004   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
1005   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
1006   return diff == data_size(s1);
1007 }
1008 
1009 //------------------------------isomorphic---------------------------
1010 // Are s1 and s2 similar?
1011 bool SuperWord::isomorphic(Node* s1, Node* s2) {
1012   if (s1->Opcode() != s2->Opcode()) return false;
1013   if (s1->req() != s2->req()) return false;
1014   if (s1->in(0) != s2->in(0)) return false;
1015   if (!same_velt_type(s1, s2)) return false;
1016   return true;
1017 }
1018 
1019 //------------------------------independent---------------------------
1020 // Is there no data path from s1 to s2 or s2 to s1?
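     // For example (illustrative): s1 = LoadI and s2 = AddI(s1, x) are not
     // independent because s2 uses s1, while two nodes with no path between
     // them in the dependence graph are independent.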
1021 bool SuperWord::independent(Node* s1, Node* s2) {
1022   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
1023   int d1 = depth(s1);
1024   int d2 = depth(s2);
1025   if (d1 == d2) return s1 != s2;
1026   Node* deep    = d1 > d2 ? s1 : s2;
1027   Node* shallow = d1 > d2 ? s2 : s1;
1028 
1029   visited_clear();
1030 
1031   return independent_path(shallow, deep);
1032 }
1033 
1034 //------------------------------reduction---------------------------
1035 // Is there a data path between s1 and s2, and are both nodes reductions?
1036 bool SuperWord::reduction(Node* s1, Node* s2) {
1037   bool retValue = false;
1038   int d1 = depth(s1);
1039   int d2 = depth(s2);
1040   if (d1 + 1 == d2) {
1041     if (s1->is_reduction() && s2->is_reduction()) {
1042       // This is an ordered set, so s1 should define s2
1043       for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1044         Node* t1 = s1->fast_out(i);
1045         if (t1 == s2) {
1046           // both nodes are reductions and connected
1047           retValue = true;
1048         }
1049       }
1050     }
1051   }
1052 
1053   return retValue;
1054 }
1055 
1056 //------------------------------independent_path------------------------------
1057 // Helper for independent
1058 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1059   if (dp >= 1000) return false; // stop deep recursion
1060   visited_set(deep);
1061   int shal_depth = depth(shallow);
1062   assert(shal_depth <= depth(deep), "must be");
1063   for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1064     Node* pred = preds.current();
1065     if (in_bb(pred) && !visited_test(pred)) {
1066       if (shallow == pred) {
1067         return false;
1068       }
1069       if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
1070         return false;
1071       }
1072     }
1073   }
1074   return true;
1075 }
1076 
1077 //------------------------------set_alignment---------------------------
1078 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1079   set_alignment(s1, align);
1080   if (align == top_align || align == bottom_align) {
1081     set_alignment(s2, align);
1082   } else {
1083     set_alignment(s2, align + data_size(s1));
1084   }
1085 }
1086 
1087 //------------------------------data_size---------------------------
1088 int SuperWord::data_size(Node* s) {
1089   Node* use = NULL; // if the node is a candidate for the CMoveVD optimization, return the size of the CMove use
1090   if (_do_vector_loop) {
1091     use = _cmovev_kit.is_Bool_candidate(s);
1092     if (use != NULL) {
1093       return data_size(use);
1094     }
1095     use = _cmovev_kit.is_CmpD_candidate(s);
1096     if (use != NULL) {
1097       return data_size(use);
1098     }
1099   }
1100   int bsize = type2aelembytes(velt_basic_type(s));
1101   assert(bsize != 0, "valid size");
1102   return bsize;
1103 }
1104 
1105 //------------------------------extend_packlist---------------------------
1106 // Extend packset by following use->def and def->use links from pack members.
1107 void SuperWord::extend_packlist() {
1108   bool changed;
1109   do {
1110     packset_sort(_packset.length());
1111     changed = false;
1112     for (int i = 0; i < _packset.length(); i++) {
1113       Node_List* p = _packset.at(i);
1114       changed |= follow_use_defs(p);
1115       changed |= follow_def_uses(p);
1116     }
1117   } while (changed);
1118 
1119   if (_race_possible) {
1120     for (int i = 0; i < _packset.length(); i++) {
1121       Node_List* p = _packset.at(i);
1122       order_def_uses(p);
1123     }
1124   }
1125 
1126 #ifndef PRODUCT
1127   if (TraceSuperWord) {
1128     tty->print_cr("\nAfter extend_packlist");
1129     print_packset();
1130   }
1131 #endif
1132 }
1133 
1134 //------------------------------follow_use_defs---------------------------
1135 // Extend the packset by visiting operand definitions of nodes in pack p
1136 bool SuperWord::follow_use_defs(Node_List* p) {
1137   assert(p->size() == 2, "just checking");
1138   Node* s1 = p->at(0);
1139   Node* s2 = p->at(1);
1140   assert(s1->req() == s2->req(), "just checking");
1141   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1142 
1143   if (s1->is_Load()) return false;
1144 
1145   int align = alignment(s1);
1146   NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
1147   bool changed = false;
1148   int start = s1->is_Store() ? MemNode::ValueIn   : 1;
1149   int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
1150   for (int j = start; j < end; j++) {
1151     Node* t1 = s1->in(j);
1152     Node* t2 = s2->in(j);
1153     if (!in_bb(t1) || !in_bb(t2))
1154       continue;
1155     if (stmts_can_pack(t1, t2, align)) {
1156       if (est_savings(t1, t2) >= 0) {
1157         Node_List* pair = new Node_List();
1158         pair->push(t1);
1159         pair->push(t2);
1160         _packset.append(pair);
1161         NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
1162         set_alignment(t1, t2, align);
1163         changed = true;
1164       }
1165     }
1166   }
1167   return changed;
1168 }
1169 
1170 //------------------------------follow_def_uses---------------------------
1171 // Extend the packset by visiting uses of nodes in pack p
1172 bool SuperWord::follow_def_uses(Node_List* p) {
1173   bool changed = false;
1174   Node* s1 = p->at(0);
1175   Node* s2 = p->at(1);
1176   assert(p->size() == 2, "just checking");
1177   assert(s1->req() == s2->req(), "just checking");
1178   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1179 
1180   if (s1->is_Store()) return false;
1181 
1182   int align = alignment(s1);
1183   NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
1184   int savings = -1;
1185   int num_s1_uses = 0;
1186   Node* u1 = NULL;
1187   Node* u2 = NULL;
1188   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1189     Node* t1 = s1->fast_out(i);
1190     num_s1_uses++;
1191     if (!in_bb(t1)) continue;
1192     for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
1193       Node* t2 = s2->fast_out(j);
1194       if (!in_bb(t2)) continue;
1195       if (!opnd_positions_match(s1, t1, s2, t2))
1196         continue;
1197       if (stmts_can_pack(t1, t2, align)) {
1198         int my_savings = est_savings(t1, t2);
1199         if (my_savings > savings) {
1200           savings = my_savings;
1201           u1 = t1;
1202           u2 = t2;
1203         }
1204       }
1205     }
1206   }
1207   if (num_s1_uses > 1) {
1208     _race_possible = true;
1209   }
1210   if (savings >= 0) {
1211     Node_List* pair = new Node_List();
1212     pair->push(u1);
1213     pair->push(u2);
1214     _packset.append(pair);
1215     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
1216     set_alignment(u1, u2, align);
1217     changed = true;
1218   }
1219   return changed;
1220 }
1221 
1222 //------------------------------order_def_uses---------------------------
1223 // For extended packsets, arrange the operands of the use packsets to match the major component
1224 void SuperWord::order_def_uses(Node_List* p) {
1225   Node* s1 = p->at(0);
1226 
1227   if (s1->is_Store()) return;
1228 
1229   // reductions are always managed beforehand
1230   if (s1->is_reduction()) return;
1231 
1232   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1233     Node* t1 = s1->fast_out(i);
1234 
1235     // Only allow operand swap on commuting operations
1236     if (!t1->is_Add() && !t1->is_Mul()) {
1237       break;
1238     }
1239 
1240     // Now find t1's packset
1241     Node_List* p2 = NULL;
1242     for (int j = 0; j < _packset.length(); j++) {
1243       p2 = _packset.at(j);
1244       Node* first = p2->at(0);
1245       if (t1 == first) {
1246         break;
1247       }
1248       p2 = NULL;
1249     }
1250     // Arrange all sub components by the major component
1251     if (p2 != NULL) {
1252       for (uint j = 1; j < p->size(); j++) {
1253         Node* d1 = p->at(j);
1254         Node* u1 = p2->at(j);
1255         opnd_positions_match(s1, t1, d1, u1);
1256       }
1257     }
1258   }
1259 }
1260 
1261 //---------------------------opnd_positions_match-------------------------
1262 // Is the use of d1 in u1 at the same operand position as d2 in u2?
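     // For example (illustrative): with u1 = AddI(d1, x) and u2 = AddI(y, d2)
     // the operand positions differ (1 vs 2); since add commutes, u2's edges
     // are swapped so that both defs feed the same position.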
1263 bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
1264   // check reductions to see if they are marshalled to represent the reduction
1265   // operator in a specified opnd
1266   if (u1->is_reduction() && u2->is_reduction()) {
1267     // ensure reductions have phis and reduction definitions feeding the 1st operand
1268     Node* first = u1->in(2);
1269     if (first->is_Phi() || first->is_reduction()) {
1270       u1->swap_edges(1, 2);
1271     }
1272     // ensure reductions have phis and reduction definitions feeding the 1st operand
1273     first = u2->in(2);
1274     if (first->is_Phi() || first->is_reduction()) {
1275       u2->swap_edges(1, 2);
1276     }
1277     return true;
1278   }
1279 
1280   uint ct = u1->req();
1281   if (ct != u2->req()) return false;
1282   uint i1 = 0;
1283   uint i2 = 0;
1284   do {
1285     for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
1286     for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
1287     if (i1 != i2) {
1288       if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
1289         // Further analysis relies on operands position matching.
1290         u2->swap_edges(i1, i2);
1291       } else {
1292         return false;
1293       }
1294     }
1295   } while (i1 < ct);
1296   return true;
1297 }
1298 
1299 //------------------------------est_savings---------------------------
1300 // Estimate the savings from executing s1 and s2 as a pack
1301 int SuperWord::est_savings(Node* s1, Node* s2) {
1302   int save_in = 2 - 1; // 2 operations per instruction in packed form
1303 
1304   // inputs
1305   for (uint i = 1; i < s1->req(); i++) {
1306     Node* x1 = s1->in(i);
1307     Node* x2 = s2->in(i);
1308     if (x1 != x2) {
1309       if (are_adjacent_refs(x1, x2)) {
1310         save_in += adjacent_profit(x1, x2);
1311       } else if (!in_packset(x1, x2)) {
1312         save_in -= pack_cost(2);
1313       } else {
1314         save_in += unpack_cost(2);
1315       }
1316     }
1317   }
1318 
1319   // uses of result
1320   uint ct = 0;
1321   int save_use = 0;
1322   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1323     Node* s1_use = s1->fast_out(i);
1324     for (int j = 0; j < _packset.length(); j++) {
1325       Node_List* p = _packset.at(j);
1326       if (p->at(0) == s1_use) {
1327         for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
1328           Node* s2_use = s2->fast_out(k);
1329           if (p->at(p->size()-1) == s2_use) {
1330             ct++;
1331             if (are_adjacent_refs(s1_use, s2_use)) {
1332               save_use += adjacent_profit(s1_use, s2_use);
1333             }
1334           }
1335         }
1336       }
1337     }
1338   }
1339 
1340   if (ct < s1->outcnt()) save_use += unpack_cost(1);
1341   if (ct < s2->outcnt()) save_use += unpack_cost(1);
1342 
1343   return MAX2(save_in, save_use);
1344 }
1345 
1346 //------------------------------costs---------------------------
1347 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
1348 int SuperWord::pack_cost(int ct)   { return ct; }
1349 int SuperWord::unpack_cost(int ct) { return ct; }
1350 
1351 //------------------------------combine_packs---------------------------
1352 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
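     // For example (illustrative): pairs <a,b>, <b,c>, <c,d> chain into <a,b,c,d>;
     // a pack longer than the max vector length is then split into power-of-two
     // sized pieces, and a pack whose size is not a power of two is discarded.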
1353 void SuperWord::combine_packs() {
1354   bool changed = true;
1355   // Combine packs regardless of the max vector size.
1356   while (changed) {
1357     changed = false;
1358     for (int i = 0; i < _packset.length(); i++) {
1359       Node_List* p1 = _packset.at(i);
1360       if (p1 == NULL) continue;
1361       // Because of sorting we can start at i + 1
1362       for (int j = i + 1; j < _packset.length(); j++) {
1363         Node_List* p2 = _packset.at(j);
1364         if (p2 == NULL) continue;
1365         if (i == j) continue;
1366         if (p1->at(p1->size()-1) == p2->at(0)) {
1367           for (uint k = 1; k < p2->size(); k++) {
1368             p1->push(p2->at(k));
1369           }
1370           _packset.at_put(j, NULL);
1371           changed = true;
1372         }
1373       }
1374     }
1375   }
1376 
1377   // Split packs whose size is greater than the max vector size.
1378   for (int i = 0; i < _packset.length(); i++) {
1379     Node_List* p1 = _packset.at(i);
1380     if (p1 != NULL) {
1381       BasicType bt = velt_basic_type(p1->at(0));
1382       uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1383       assert(is_power_of_2(max_vlen), "sanity");
1384       uint psize = p1->size();
1385       if (!is_power_of_2(psize)) {
1386         // Skip a pack which can't form a vector.
1387         // case1: for(...) { a[i] = i; }    elements values are different (i+x)
1388         // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
1389         _packset.at_put(i, NULL);
1390         continue;
1391       }
1392       if (psize > max_vlen) {
1393         Node_List* pack = new Node_List();
1394         for (uint j = 0; j < psize; j++) {
1395           pack->push(p1->at(j));
1396           if (pack->size() >= max_vlen) {
1397             assert(is_power_of_2(pack->size()), "sanity");
1398             _packset.append(pack);
1399             pack = new Node_List();
1400           }
1401         }
1402         _packset.at_put(i, NULL);
1403       }
1404     }
1405   }
1406 
1407   // Compress list.
1408   for (int i = _packset.length() - 1; i >= 0; i--) {
1409     Node_List* p1 = _packset.at(i);
1410     if (p1 == NULL) {
1411       _packset.remove_at(i);
1412     }
1413   }
1414 
1415 #ifndef PRODUCT
1416   if (TraceSuperWord) {
1417     tty->print_cr("\nAfter combine_packs");
1418     print_packset();
1419   }
1420 #endif
1421 }
1422 
1423 //-----------------------------construct_my_pack_map--------------------------
1424 // Construct the map from nodes to packs.  Only valid after the
1425 // point where a node is only in one pack (after combine_packs).
1426 void SuperWord::construct_my_pack_map() {
1427   Node_List* rslt = NULL;
1428   for (int i = 0; i < _packset.length(); i++) {
1429     Node_List* p = _packset.at(i);
1430     for (uint j = 0; j < p->size(); j++) {
1431       Node* s = p->at(j);
1432       assert(my_pack(s) == NULL, "only in one pack");
1433       set_my_pack(s, p);
1434     }
1435   }
1436 }
1437 
1438 //------------------------------filter_packs---------------------------
1439 // Remove packs that are not implemented or not profitable.
1440 void SuperWord::filter_packs() {
1441   // Remove packs that are not implemented
1442   for (int i = _packset.length() - 1; i >= 0; i--) {
1443     Node_List* pk = _packset.at(i);
1444     bool impl = implemented(pk);
1445     if (!impl) {
1446 #ifndef PRODUCT
1447       if (TraceSuperWord && Verbose) {
1448         tty->print_cr("Unimplemented");
1449         pk->at(0)->dump();
1450       }
1451 #endif
1452       remove_pack_at(i);
1453     }
1454     Node *n = pk->at(0);
1455     if (n->is_reduction()) {
1456       _num_reductions++;
1457     } else {
1458       _num_work_vecs++;
1459     }
1460   }
1461 
1462   // Remove packs that are not profitable
1463   bool changed;
1464   do {
1465     changed = false;
1466     for (int i = _packset.length() - 1; i >= 0; i--) {
1467       Node_List* pk = _packset.at(i);
1468       bool prof = profitable(pk);
1469       if (!prof) {
1470 #ifndef PRODUCT
1471         if (TraceSuperWord && Verbose) {
1472           tty->print_cr("Unprofitable");
1473           pk->at(0)->dump();
1474         }
1475 #endif
1476         remove_pack_at(i);
1477         changed = true;
1478       }
1479     }
1480   } while (changed);
1481 
1482 #ifndef PRODUCT
1483   if (TraceSuperWord) {
1484     tty->print_cr("\nAfter filter_packs");
1485     print_packset();
1486     tty->cr();
1487   }
1488 #endif
1489 }
1490 
1491 //------------------------------merge_packs_to_cmovd---------------------------
1492 // Merge CMoveD packs into new vector nodes.
1493 // We want to catch this pattern and subsume the CmpD and Bool into the CMoveD:
1494 //
1495 //                   SubD             ConD
1496 //                  /  |               /
1497 //                 /   |           /   /
1498 //                /    |       /      /
1499 //               /     |   /         /
1500 //              /      /            /
1501 //             /    /  |           /
1502 //            v /      |          /
1503 //         CmpD        |         /
1504 //          |          |        /
1505 //          v          |       /
1506 //         Bool        |      /
1507 //           \         |     /
1508 //             \       |    /
1509 //               \     |   /
1510 //                 \   |  /
1511 //                   \ v /
1512 //                   CMoveD
1513 //
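     // One Java-level source shape that can produce this pattern (illustrative
     // only; the names below are made up):
     //
     //   double diff = a[i] - b[i];               // SubD
     //   d[i] = (diff < CONST) ? diff : CONST;    // CmpD -> Bool -> CMoveD, ConD
     //
     // Here the SubD and the ConD feed both the CmpD and the CMoveD, as in the
     // diagram above.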
1514 
1515 void SuperWord::merge_packs_to_cmovd() {
1516   for (int i = _packset.length() - 1; i >= 0; i--) {
1517     _cmovev_kit.make_cmovevd_pack(_packset.at(i));
1518   }
1519   #ifndef PRODUCT
1520     if (TraceSuperWord) {
1521       tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
1522       print_packset();
1523       tty->cr();
1524     }
1525   #endif
1526 }
1527 
1528 Node* CMoveKit::is_Bool_candidate(Node* def) const {
1529   Node* use = NULL;
1530   if (!def->is_Bool() || def->in(0) != NULL || def->outcnt() != 1) {
1531     return NULL;
1532   }
1533   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1534     use = def->fast_out(j);
1535     if (!_sw->same_generation(def, use) || !use->is_CMove()) {
1536       return NULL;
1537     }
1538   }
1539   return use;
1540 }
1541 
1542 Node* CMoveKit::is_CmpD_candidate(Node* def) const {
1543   Node* use = NULL;
1544   if (!def->is_Cmp() || def->in(0) != NULL || def->outcnt() != 1) {
1545     return NULL;
1546   }
1547   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1548     use = def->fast_out(j);
1549     if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == NULL || !_sw->same_generation(def, use)) {
1550       return NULL;
1551     }
1552   }
1553   return use;
1554 }
1555 
1556 Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
1557   Node *cmovd = cmovd_pk->at(0);
1558   if (!cmovd->is_CMove()) {
1559     return NULL;
1560   }
1561   if (pack(cmovd) != NULL) { // already in the cmov pack
1562     return NULL;
1563   }
1564   if (cmovd->in(0) != NULL) {
1565     NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
1566     return NULL;
1567   }
1568 
1569   Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
1570   if (!bol->is_Bool()
1571       || bol->outcnt() != 1
1572       || !_sw->same_generation(bol, cmovd)
1573       || bol->in(0) != NULL  // BoolNode has control flow!!
1574       || _sw->my_pack(bol) == NULL) {
1575       NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
1576       return NULL;
1577   }
1578   Node_List* bool_pk = _sw->my_pack(bol);
1579   if (bool_pk->size() != cmovd_pk->size() ) {
1580     return NULL;
1581   }
1582 
1583   Node* cmpd = bol->in(1);
1584   if (!cmpd->is_Cmp()
1585       || cmpd->outcnt() != 1
1586       || !_sw->same_generation(cmpd, cmovd)
1587       || cmpd->in(0) != NULL  // CmpDNode has control flow!!
1588       || _sw->my_pack(cmpd) == NULL) {
1589       NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
1590       return NULL;
1591   }
1592   Node_List* cmpd_pk = _sw->my_pack(cmpd);
1593   if (cmpd_pk->size() != cmovd_pk->size() ) {
1594     return NULL;
1595   }
1596 
1597   if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
1598     NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
1599     return NULL;
1600   }
1601 
1602   Node_List* new_cmpd_pk = new Node_List();
1603   uint sz = cmovd_pk->size() - 1;
1604   for (uint i = 0; i <= sz; ++i) {
1605     Node* cmov = cmovd_pk->at(i);
1606     Node* bol  = bool_pk->at(i);
1607     Node* cmp  = cmpd_pk->at(i);
1608 
1609     new_cmpd_pk->insert(i, cmov);
1610 
1611     map(cmov, new_cmpd_pk);
1612     map(bol, new_cmpd_pk);
1613     map(cmp, new_cmpd_pk);
1614 
1615     _sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
1616   }
1617   _sw->_packset.remove(cmovd_pk);
1618   _sw->_packset.remove(bool_pk);
1619   _sw->_packset.remove(cmpd_pk);
1620   _sw->_packset.append(new_cmpd_pk);
1621   NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
1622   return new_cmpd_pk;
1623 }
1624 
1625 bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
1626   Node* cmpd0 = cmpd_pk->at(0);
1627   assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
1628   assert(cmovd_pk->at(0)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
1629   assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
1630   Node* in1 = cmpd0->in(1);
1631   Node* in2 = cmpd0->in(2);
1632   Node_List* in1_pk = _sw->my_pack(in1);
1633   Node_List* in2_pk = _sw->my_pack(in2);
1634 
1635   if ((in1_pk != NULL && in1_pk->size() != cmpd_pk->size()) ||
1636       (in2_pk != NULL && in2_pk->size() != cmpd_pk->size())) {
1637     return false;
1638   }
1639 
1640   // Test whether all the in(1) inputs are in one pack or are all the same node
1641   if (in1_pk == NULL) {
1642     for (uint j = 1; j < cmpd_pk->size(); j++) {
1643       if (cmpd_pk->at(j)->in(1) != in1) {
1644         return false;
1645       }
1646     }//for: in1_pk is not a pack, but all CmpD nodes in the pack have the same in(1)
1647   }
1648   // Test whether all the in(2) inputs are in one pack or are all the same node
1649   if (in2_pk == NULL) {
1650     for (uint j = 1; j < cmpd_pk->size(); j++) {
1651       if (cmpd_pk->at(j)->in(2) != in2) {
1652         return false;
1653       }
1654     }//for: in2_pk is not a pack, but all CmpD nodes in the pack have the same in(2)
1655   }
1656   // Now check whether cmpd_pk may be subsumed into the vector built for cmovd_pk
1657   int cmovd_ind1, cmovd_ind2;
1658   if (cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
1659    && cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
1660       cmovd_ind1 = CMoveNode::IfFalse;
1661       cmovd_ind2 = CMoveNode::IfTrue;
1662   } else if (cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
1663           && cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
1664       cmovd_ind2 = CMoveNode::IfFalse;
1665       cmovd_ind1 = CMoveNode::IfTrue;
1666   } else {
1668     return false;
1669   }
1670 
1671   for (uint j = 1; j < cmpd_pk->size(); j++) {
1672     if (cmpd_pk->at(j)->in(1) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
1673         || cmpd_pk->at(j)->in(2) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
1674         return false;
1675     }//if
1676   }
1677   NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
1678   return true;
1679 }
1680 
1681 //------------------------------implemented---------------------------
1682 // Can code be generated for pack p?
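     // Illustrative example (vector opcode name assumed): a pack of four AddI
     // nodes is implemented only if the Matcher supports the corresponding
     // 4-element vector add (AddVI); reduction packs additionally consult
     // ReductionNode::implemented().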
1683 bool SuperWord::implemented(Node_List* p) {
1684   bool retValue = false;
1685   Node* p0 = p->at(0);
1686   if (p0 != NULL) {
1687     int opc = p0->Opcode();
1688     uint size = p->size();
1689     if (p0->is_reduction()) {
1690       const Type *arith_type = p0->bottom_type();
1691       // Length 2 reductions of INT/LONG do not offer performance benefits
1692       if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
1693         retValue = false;
1694       } else {
1695         retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
1696       }
1697     } else {
1698       retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
1699     }
1700     if (!retValue) {
1701       if (is_cmov_pack(p)) {
1702         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SuperWord::implemented: found cmov pack"); print_pack(p);})
1703         return true;
1704       }
1705     }
1706   }
1707   return retValue;
1708 }
1709 
1710 bool SuperWord::is_cmov_pack(Node_List* p) {
1711   return _cmovev_kit.pack(p->at(0)) != NULL;
1712 }
1713 //------------------------------same_inputs--------------------------
1714 // For pack p, is the in(idx) operand the same node for every member?
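     // Illustrative example: for a pack of left shifts, same_inputs(p, 2) is
     // true when every member shifts by the same scalar count node.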
1715 bool SuperWord::same_inputs(Node_List* p, int idx) {
1716   Node* p0 = p->at(0);
1717   uint vlen = p->size();
1718   Node* p0_def = p0->in(idx);
1719   for (uint i = 1; i < vlen; i++) {
1720     Node* pi = p->at(i);
1721     Node* pi_def = pi->in(idx);
1722     if (p0_def != pi_def) {
1723       return false;
1724     }
1725   }
1726   return true;
1727 }
1728 
1729 //------------------------------profitable---------------------------
1730 // For pack p, are all operands and all uses (within the block) vector?
1731 bool SuperWord::profitable(Node_List* p) {
1732   Node* p0 = p->at(0);
1733   uint start, end;
1734   VectorNode::vector_operands(p0, &start, &end);
1735 
1736   // Return false if some inputs are not vectors or vectors with different
1737   // size or alignment.
1738   // Also, for now, return false unless the non-vector inputs are the same in
1739   // every member (the scalar promotion case). Later, implement PackNode and
1740   // allow differing, non-vector inputs (maybe just the ones from outside the block).
1741   for (uint i = start; i < end; i++) {
1742     if (!is_vector_use(p0, i)) {
1743       return false;
1744     }
1745   }
1746   // Check if reductions are connected
1747   if (p0->is_reduction()) {
1748     Node* second_in = p0->in(2);
1749     Node_List* second_pk = my_pack(second_in);
1750     if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
1751       // Remove reduction flag if no parent pack or if not enough work
1752       // to cover reduction expansion overhead
1753       p0->remove_flag(Node::Flag_is_reduction);
1754       return false;
1755     } else if (second_pk->size() != p->size()) {
1756       return false;
1757     }
1758   }
1759   if (VectorNode::is_shift(p0)) {
1760     // For now, return false if shift count is vector or not scalar promotion
1761     // case (different shift counts) because it is not supported yet.
1762     Node* cnt = p0->in(2);
1763     Node_List* cnt_pk = my_pack(cnt);
1764     if (cnt_pk != NULL)
1765       return false;
1766     if (!same_inputs(p, 2))
1767       return false;
1768   }
1769   if (!p0->is_Store()) {
1770     // For now, return false if not all uses are vector.
1771     // Later, implement ExtractNode and allow non-vector uses (maybe
1772     // just the ones outside the block.)
1773     for (uint i = 0; i < p->size(); i++) {
1774       Node* def = p->at(i);
1775       if (is_cmov_pack_internal_node(p, def)) {
1776         continue;
1777       }
1778       for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1779         Node* use = def->fast_out(j);
1780         for (uint k = 0; k < use->req(); k++) {
1781           Node* n = use->in(k);
1782           if (def == n) {
1783             // reductions can be loop carried dependences
1784             if (def->is_reduction() && use->is_Phi())
1785               continue;
1786             if (!is_vector_use(use, k)) {
1787               return false;
1788             }
1789           }
1790         }
1791       }
1792     }
1793   }
1794   return true;
1795 }
1796 
1797 //------------------------------schedule---------------------------
1798 // Adjust the memory graph for the packed operations
1799 void SuperWord::schedule() {
1800 
1801   // Co-locate in the memory graph the members of each memory pack
1802   for (int i = 0; i < _packset.length(); i++) {
1803     co_locate_pack(_packset.at(i));
1804   }
1805 }
1806 
1807 //-------------------------------remove_and_insert-------------------
1808 // Remove "current" from its current position in the memory graph and insert
1809 // it after the appropriate insertion point (lip or uip).
1810 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
1811                                   Node *uip, Unique_Node_List &sched_before) {
1812   Node* my_mem = current->in(MemNode::Memory);
1813   bool sched_up = sched_before.member(current);
1814 
1815   // Remove current from its current position in the memory graph
1816   for (DUIterator i = current->outs(); current->has_out(i); i++) {
1817     Node* use = current->out(i);
1818     if (use->is_Mem()) {
1819       assert(use->in(MemNode::Memory) == current, "must be");
1820       if (use == prev) { // connect prev to my_mem
1821           _igvn.replace_input_of(use, MemNode::Memory, my_mem);
1822           --i; //deleted this edge; rescan position
1823       } else if (sched_before.member(use)) {
1824         if (!sched_up) { // Will be moved together with current
1825           _igvn.replace_input_of(use, MemNode::Memory, uip);
1826           --i; //deleted this edge; rescan position
1827         }
1828       } else {
1829         if (sched_up) { // Will be moved together with current
1830           _igvn.replace_input_of(use, MemNode::Memory, lip);
1831           --i; //deleted this edge; rescan position
1832         }
1833       }
1834     }
1835   }
1836 
1837   Node *insert_pt =  sched_up ?  uip : lip;
1838 
1839   // all uses of insert_pt's memory state should use current's instead
1840   for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
1841     Node* use = insert_pt->out(i);
1842     if (use->is_Mem()) {
1843       assert(use->in(MemNode::Memory) == insert_pt, "must be");
1844       _igvn.replace_input_of(use, MemNode::Memory, current);
1845       --i; //deleted this edge; rescan position
1846     } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
1847       uint pos; //lip (lower insert point) must be the last one in the memory slice
1848       for (pos=1; pos < use->req(); pos++) {
1849         if (use->in(pos) == insert_pt) break;
1850       }
1851       _igvn.replace_input_of(use, pos, current);
1852       --i;
1853     }
1854   }
1855 
1856   //connect current to insert_pt
1857   _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
1858 }
1859 
1860 //------------------------------co_locate_pack----------------------------------
1861 // To schedule a store pack, we need to move any sandwiched memory ops either before
1862 // or after the pack, based upon dependence information:
1863 // (1) If any store in the pack depends on the sandwiched memory op, the
1864 //     sandwiched memory op must be scheduled BEFORE the pack;
1865 // (2) If a sandwiched memory op depends on any store in the pack, the
1866 //     sandwiched memory op must be scheduled AFTER the pack;
1867 // (3) If a sandwiched memory op (say, memA) depends on another sandwiched
1868 //     memory op (say memB), memB must be scheduled before memA. So, if memA is
1869 //     scheduled before the pack, memB must also be scheduled before the pack;
1870 // (4) If there is no dependence restriction for a sandwiched memory op, we simply
1871 //     schedule this memory op AFTER the pack;
1872 // (5) We know there is no dependence cycle, so there is no other case;
1873 // (6) Finally, all memory ops in another single pack should be moved in the same direction.
1874 //
1875 // To schedule a load pack, we use the memory state of either the first or the last load in
1876 // the pack, based on the dependence constraint.
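     // Illustrative example (array names assumed): for a store pack over
     // a[i..i+3], a sandwiched load whose value feeds one of those stores falls
     // under rule (1) and is moved BEFORE the pack, while a sandwiched load of
     // an unrelated array b[] with no dependence falls under rule (4) and is
     // moved AFTER the pack.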
1877 void SuperWord::co_locate_pack(Node_List* pk) {
1878   if (pk->at(0)->is_Store()) {
1879     MemNode* first     = executed_first(pk)->as_Mem();
1880     MemNode* last      = executed_last(pk)->as_Mem();
1881     Unique_Node_List schedule_before_pack;
1882     Unique_Node_List memops;
1883 
1884     MemNode* current   = last->in(MemNode::Memory)->as_Mem();
1885     MemNode* previous  = last;
1886     while (true) {
1887       assert(in_bb(current), "stay in block");
1888       memops.push(previous);
1889       for (DUIterator i = current->outs(); current->has_out(i); i++) {
1890         Node* use = current->out(i);
1891         if (use->is_Mem() && use != previous)
1892           memops.push(use);
1893       }
1894       if (current == first) break;
1895       previous = current;
1896       current  = current->in(MemNode::Memory)->as_Mem();
1897     }
1898 
1899     // determine which memory operations should be scheduled before the pack
1900     for (uint i = 1; i < memops.size(); i++) {
1901       Node *s1 = memops.at(i);
1902       if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
1903         for (uint j = 0; j< i; j++) {
1904           Node *s2 = memops.at(j);
1905           if (!independent(s1, s2)) {
1906             if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
1907               schedule_before_pack.push(s1); // s1 must be scheduled before
1908               Node_List* mem_pk = my_pack(s1);
1909               if (mem_pk != NULL) {
1910                 for (uint ii = 0; ii < mem_pk->size(); ii++) {
1911                   Node* s = mem_pk->at(ii);  // follow partner
1912                   if (memops.member(s) && !schedule_before_pack.member(s))
1913                     schedule_before_pack.push(s);
1914                 }
1915               }
1916               break;
1917             }
1918           }
1919         }
1920       }
1921     }
1922 
1923     Node*    upper_insert_pt = first->in(MemNode::Memory);
1924     // Following code moves loads connected to upper_insert_pt below aliased stores.
1925     // Collect such loads here and reconnect them back to upper_insert_pt later.
1926     memops.clear();
1927     for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
1928       Node* use = upper_insert_pt->out(i);
1929       if (use->is_Mem() && !use->is_Store()) {
1930         memops.push(use);
1931       }
1932     }
1933 
1934     MemNode* lower_insert_pt = last;
1935     previous                 = last; //previous store in pk
1936     current                  = last->in(MemNode::Memory)->as_Mem();
1937 
1938     // start scheduling from "last" to "first"
1939     while (true) {
1940       assert(in_bb(current), "stay in block");
1941       assert(in_pack(previous, pk), "previous stays in pack");
1942       Node* my_mem = current->in(MemNode::Memory);
1943 
1944       if (in_pack(current, pk)) {
1945         // Forward users of my memory state (except "previous") to my input memory state
1946         for (DUIterator i = current->outs(); current->has_out(i); i++) {
1947           Node* use = current->out(i);
1948           if (use->is_Mem() && use != previous) {
1949             assert(use->in(MemNode::Memory) == current, "must be");
1950             if (schedule_before_pack.member(use)) {
1951               _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
1952             } else {
1953               _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
1954             }
1955             --i; // deleted this edge; rescan position
1956           }
1957         }
1958         previous = current;
1959       } else { // !in_pack(current, pk) ==> a sandwiched store
1960         remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
1961       }
1962 
1963       if (current == first) break;
1964       current = my_mem->as_Mem();
1965     } // end while
1966 
1967     // Reconnect loads back to upper_insert_pt.
1968     for (uint i = 0; i < memops.size(); i++) {
1969       Node *ld = memops.at(i);
1970       if (ld->in(MemNode::Memory) != upper_insert_pt) {
1971         _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
1972       }
1973     }
1974   } else if (pk->at(0)->is_Load()) { //load
1975     // all loads in the pack should have the same memory state. By default,
1976     // we use the memory state of the last load. However, if any load could
1977     // not be moved down due to the dependence constraint, we use the memory
1978     // state of the first load.
1979     Node* last_mem  = executed_last(pk)->in(MemNode::Memory);
1980     Node* first_mem = executed_first(pk)->in(MemNode::Memory);
1981     bool schedule_last = true;
1982     for (uint i = 0; i < pk->size(); i++) {
1983       Node* ld = pk->at(i);
1984       for (Node* current = last_mem; current != ld->in(MemNode::Memory);
1985            current=current->in(MemNode::Memory)) {
1986         assert(current != first_mem, "corrupted memory graph");
1987         if (current->is_Mem() && !independent(current, ld)) {
1988           schedule_last = false; // a later store depends on this load
1989           break;
1990         }
1991       }
1992     }
1993 
1994     Node* mem_input = schedule_last ? last_mem : first_mem;
1995     _igvn.hash_delete(mem_input);
1996     // Give each load the same memory state
1997     for (uint i = 0; i < pk->size(); i++) {
1998       LoadNode* ld = pk->at(i)->as_Load();
1999       _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
2000     }
2001   }
2002 }
2003 
2004 #ifndef PRODUCT
2005 void SuperWord::print_loop(bool whole) {
2006   Node_Stack stack(_arena, _phase->C->unique() >> 2);
2007   Node_List rpo_list;
2008   VectorSet visited(_arena);
2009   visited.set(lpt()->_head->_idx);
2010   _phase->rpo(lpt()->_head, stack, visited, rpo_list);
2011   _phase->dump(lpt(), rpo_list.size(), rpo_list );
2012   if(whole) {
2013     tty->print_cr("\n Whole loop tree");
2014     _phase->dump();
2015     tty->print_cr(" End of whole loop tree\n");
2016   }
2017 }
2018 #endif
2019 
2020 //------------------------------output---------------------------
2021 // Convert packs into vector node operations
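     // Illustrative example (node names assumed): a 4-wide pack of LoadI nodes
     // becomes a single LoadVector, a pack of AddI nodes becomes the
     // corresponding vector add (AddVI), and a pack of StoreI nodes becomes a
     // StoreVector, all constructed below.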
2022 void SuperWord::output() {
2023   if (_packset.length() == 0) return;
2024 
2025 #ifndef PRODUCT
2026   if (TraceLoopOpts) {
2027     tty->print("SuperWord::output    ");
2028     lpt()->dump_head();
2029   }
2030 #endif
2031 
2032   // MUST ENSURE main loop's initial value is properly aligned:
2033   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2034 
2035   align_initial_loop_index(align_to_ref());
2036 
2037   // Insert extract (unpack) operations for scalar uses
2038   for (int i = 0; i < _packset.length(); i++) {
2039     insert_extracts(_packset.at(i));
2040   }
2041 
2042   Compile* C = _phase->C;
2043   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2044   uint max_vlen_in_bytes = 0;
2045   uint max_vlen = 0;
2046 
2047   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SuperWord::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2048 
2049   CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2050 
2051   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SuperWord::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2052 
2053   if (do_reserve_copy() && !make_reversable.has_reserved()) {
2054     NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: loop was not reserved correctly, exiting SuperWord");})
2055     return;
2056   }
2057 
2058   for (int i = 0; i < _block.length(); i++) {
2059     Node* n = _block.at(i);
2060     Node_List* p = my_pack(n);
2061     if (p && n == executed_last(p)) {
2062       uint vlen = p->size();
2063       uint vlen_in_bytes = 0;
2064       Node* vn = NULL;
2065       Node* low_adr = p->at(0);
2066       Node* first   = executed_first(p);
2067       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SuperWord::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2068       int   opc = n->Opcode();
2069       if (n->is_Load()) {
2070         Node* ctl = n->in(MemNode::Control);
2071         Node* mem = first->in(MemNode::Memory);
2072         SWPointer p1(n->as_Mem(), this, NULL, false);
2073         // Identify the memory dependency for the new loadVector node by
2074         // walking up through memory chain.
2075         // This is done to give flexibility to the new loadVector node so that
2076         // it can move above independent storeVector nodes.
2077         while (mem->is_StoreVector()) {
2078           SWPointer p2(mem->as_Mem(), this, NULL, false);
2079           int cmp = p1.cmp(p2);
2080           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
2081             mem = mem->in(MemNode::Memory);
2082           } else {
2083             break; // dependent memory
2084           }
2085         }
2086         Node* adr = low_adr->in(MemNode::Address);
2087         const TypePtr* atyp = n->adr_type();
2088         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
2089         vlen_in_bytes = vn->as_LoadVector()->memory_size();
2090       } else if (n->is_Store()) {
2091         // Promote value to be stored to vector
2092         Node* val = vector_opd(p, MemNode::ValueIn);
2093         if (val == NULL) {
2094           if (do_reserve_copy()) {
2095             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: val should not be NULL, exiting SuperWord");})
2096             return; //and reverse to backup IG
2097           }
2098           ShouldNotReachHere();
2099         }
2100 
2101         Node* ctl = n->in(MemNode::Control);
2102         Node* mem = first->in(MemNode::Memory);
2103         Node* adr = low_adr->in(MemNode::Address);
2104         const TypePtr* atyp = n->adr_type();
2105         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
2106         vlen_in_bytes = vn->as_StoreVector()->memory_size();
2107       } else if (n->req() == 3 && !is_cmov_pack(p)) {
2108         // Promote operands to vector
2109         Node* in1 = NULL;
2110         bool node_isa_reduction = n->is_reduction();
2111         if (node_isa_reduction) {
2112           // the input to the first reduction operation is retained
2113           in1 = low_adr->in(1);
2114         } else {
2115           in1 = vector_opd(p, 1);
2116           if (in1 == NULL) {
2117             if (do_reserve_copy()) {
2118               NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: in1 should not be NULL, exiting SuperWord");})
2119               return; //and reverse to backup IG
2120             }
2121             ShouldNotReachHere();
2122           }
2123         }
2124         Node* in2 = vector_opd(p, 2);
2125         if (in2 == NULL) {
2126           if (do_reserve_copy()) {
2127             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: in2 should not be NULL, exiting SuperWord");})
2128             return; //and reverse to backup IG
2129           }
2130           ShouldNotReachHere();
2131         }
2132         if (VectorNode::is_invariant_vector(in1) && !node_isa_reduction && (n->is_Add() || n->is_Mul())) {
2133           // Move invariant vector input into second position to avoid register spilling.
2134           Node* tmp = in1;
2135           in1 = in2;
2136           in2 = tmp;
2137         }
2138         if (node_isa_reduction) {
2139           const Type *arith_type = n->bottom_type();
2140           vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2141           if (in2->is_Load()) {
2142             vlen_in_bytes = in2->as_LoadVector()->memory_size();
2143           } else {
2144             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2145           }
2146         } else {
2147           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2148           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2149         }
2150       } else if (opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
2151         // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
2152         Node* in = vector_opd(p, 1);
2153         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2154         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2155       } else if (is_cmov_pack(p)) {
2156         if (!n->is_CMove()) {
2157           continue;
2158         }
2159         // place here CMoveVDNode
2160         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SuperWord::output: print before CMove vectorization"); print_loop(false);})
2161         Node* bol = n->in(CMoveNode::Condition);
2162         if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1) {
2163           NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SuperWord::output: %d is not a Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
2164           bol = bol->in(1); //may be ExtractNode
2165         }
2166 
2167         assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2168         if (!bol->is_Bool()) {
2169           if (do_reserve_copy()) {
2170             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: expected Bool node %d, exiting SuperWord", bol->_idx); bol->dump();})
2171             return; //and reverse to backup IG
2172           }
2173           ShouldNotReachHere();
2174         }
2175 
2176         int cond = (int)bol->as_Bool()->_test._test;
2177         Node* in_cc  = _igvn.intcon(cond);
2178         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SuperWord::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
2179         Node* cc = bol->clone();
2180         cc->set_req(1, in_cc);
2181         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SuperWord::output: created bool cc node %d", cc->_idx); cc->dump();})
2182 
2183         Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
2184         if (src1 == NULL) {
2185           if (do_reserve_copy()) {
2186             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: src1 should not be NULL, exiting SuperWord");})
2187             return; //and reverse to backup IG
2188           }
2189           ShouldNotReachHere();
2190         }
2191         Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
2192         if (src2 == NULL) {
2193           if (do_reserve_copy()) {
2194             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: src2 should not be NULL, exiting SuperWord");})
2195             return; //and reverse to backup IG
2196           }
2197           ShouldNotReachHere();
2198         }
2199         BasicType bt = velt_basic_type(n);
2200         const TypeVect* vt = TypeVect::make(bt, vlen);
2201         vn = new CMoveVDNode(cc, src1, src2, vt);
2202         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SuperWord::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2203       } else {
2204         if (do_reserve_copy()) {
2205           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: ShouldNotReachHere, exiting SuperWord");})
2206           return; //and reverse to backup IG
2207         }
2208         ShouldNotReachHere();
2209       }
2210 
2211       assert(vn != NULL, "sanity");
2212       if (vn == NULL) {
2213         if (do_reserve_copy()) {
2214           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SuperWord::output: got NULL node, cannot proceed, exiting SuperWord");})
2215           return; //and reverse to backup IG
2216         }
2217         ShouldNotReachHere();
2218       }
2219 
2220       _igvn.register_new_node_with_optimizer(vn);
2221       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
2222       for (uint j = 0; j < p->size(); j++) {
2223         Node* pm = p->at(j);
2224         _igvn.replace_node(pm, vn);
2225       }
2226       _igvn._worklist.push(vn);
2227 
2228       if (vlen_in_bytes > max_vlen_in_bytes) {
2229         max_vlen = vlen;
2230         max_vlen_in_bytes = vlen_in_bytes;
2231       }
2232 #ifdef ASSERT
2233       if (TraceNewVectors) {
2234         tty->print("new Vector node: ");
2235         vn->dump();
2236       }
2237 #endif
2238     }
2239   }//for (int i = 0; i < _block.length(); i++)
2240 
2241   C->set_max_vector_size(max_vlen_in_bytes);
2242 
2243   if (SuperWordLoopUnrollAnalysis) {
2244     if (cl->has_passed_slp()) {
2245       uint slp_max_unroll_factor = cl->slp_max_unroll();
2246       if (slp_max_unroll_factor == max_vlen) {
2247         NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
2248         // For atomic unrolled loops which are vector mapped, instigate more unrolling.
2249         cl->set_notpassed_slp();
2250         // if vector resources are limited, do not allow additional unrolling
2251         if (FLOATPRESSURE > 8) {
2252           C->set_major_progress();
2253         }
2254         cl->mark_do_unroll_only();
2255       }
2256     }
2257   }
2258 
2259   if (do_reserve_copy()) {
2260     make_reversable.use_new();
2261   }
2262   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2263   return;
2264 }
2265 
2266 //------------------------------vector_opd---------------------------
2267 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
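     // Illustrative example: if every member of p has the same scalar node in
     // in(opd_idx), that scalar is broadcast with scalar2vector(); if the
     // members have distinct scalar inputs, a PackNode gathers them into one
     // vector operand.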
2268 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
2269   Node* p0 = p->at(0);
2270   uint vlen = p->size();
2271   Node* opd = p0->in(opd_idx);
2272 
2273   if (same_inputs(p, opd_idx)) {
2274     if (opd->is_Vector() || opd->is_LoadVector()) {
2275       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
2276       if (opd_idx == 2 && VectorNode::is_shift(p0)) {
2277         NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
2278         return NULL;
2279       }
2280       return opd; // input is matching vector
2281     }
2282     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
2283       Compile* C = _phase->C;
2284       Node* cnt = opd;
2285       // Vector instructions do not mask shift count, do it here.
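           // Illustrative example (int case): the scalar semantics mask the
           // count with 31, so a constant count of 33 becomes 33 & 31 == 1,
           // and a non-constant count whose type may exceed the mask gets an
           // explicit AndI with 31 before being broadcast as the vector count.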
2286       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
2287       const TypeInt* t = opd->find_int_type();
2288       if (t != NULL && t->is_con()) {
2289         juint shift = t->get_con();
2290         if (shift > mask) { // Unsigned cmp
2291           cnt = ConNode::make(TypeInt::make(shift & mask));
2292         }
2293       } else {
2294         if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
2295           cnt = ConNode::make(TypeInt::make(mask));
2296           _igvn.register_new_node_with_optimizer(cnt);
2297           cnt = new AndINode(opd, cnt);
2298           _igvn.register_new_node_with_optimizer(cnt);
2299           _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2300         }
2301         assert(opd->bottom_type()->isa_int(), "int type only");
2302         if (!opd->bottom_type()->isa_int()) {
2303           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should be int type only");})
2304           return NULL;
2305         }
2306         // Move non constant shift count into vector register.
2307         cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
2308       }
2309       if (cnt != opd) {
2310         _igvn.register_new_node_with_optimizer(cnt);
2311         _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2312       }
2313       return cnt;
2314     }
2315     assert(!opd->is_StoreVector(), "such vector is not expected here");
2316     if (opd->is_StoreVector()) {
2317       NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
2318       return NULL;
2319     }
2320     // Convert scalar input to vector with the same number of elements as
2321     // p0's vector. Use p0's type because the size of the operand's container in
2322     // the vector should match p0's size regardless of the operand's size.
2323     const Type* p0_t = velt_type(p0);
2324     VectorNode* vn = VectorNode::scalar2vector(opd, vlen, p0_t);
2325 
2326     _igvn.register_new_node_with_optimizer(vn);
2327     _phase->set_ctrl(vn, _phase->get_ctrl(opd));
2328 #ifdef ASSERT
2329     if (TraceNewVectors) {
2330       tty->print("new Vector node: ");
2331       vn->dump();
2332     }
2333 #endif
2334     return vn;
2335   }
2336 
2337   // Insert pack operation
2338   BasicType bt = velt_basic_type(p0);
2339   PackNode* pk = PackNode::make(opd, vlen, bt);
2340   DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
2341 
2342   for (uint i = 1; i < vlen; i++) {
2343     Node* pi = p->at(i);
2344     Node* in = pi->in(opd_idx);
2345     assert(my_pack(in) == NULL, "Should already have been unpacked");
2346     if (my_pack(in) != NULL) {
2347       NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
2348       return NULL;
2349     }
2350     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
2351     pk->add_opd(in);
2352   }
2353   _igvn.register_new_node_with_optimizer(pk);
2354   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
2355 #ifdef ASSERT
2356   if (TraceNewVectors) {
2357     tty->print("new Vector node: ");
2358     pk->dump();
2359   }
2360 #endif
2361   return pk;
2362 }
2363 
2364 //------------------------------insert_extracts---------------------------
2365 // If a use of pack p is not a vector use, then replace the
2366 // use with an extract operation.
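     // Illustrative example: if one lane of a packed load is also consumed by
     // a node that was not packed, an ExtractNode is inserted to materialize
     // that lane as a scalar for the non-vector use.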
2367 void SuperWord::insert_extracts(Node_List* p) {
2368   if (p->at(0)->is_Store()) return;
2369   assert(_n_idx_list.is_empty(), "empty (node,index) list");
2370 
2371   // Inspect each use of each pack member.  For each use that is
2372   // not a vector use, replace the use with an extract operation.
2373 
2374   for (uint i = 0; i < p->size(); i++) {
2375     Node* def = p->at(i);
2376     for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
2377       Node* use = def->fast_out(j);
2378       for (uint k = 0; k < use->req(); k++) {
2379         Node* n = use->in(k);
2380         if (def == n) {
2381           Node_List* u_pk = my_pack(use);
2382           if ((u_pk == NULL || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
2383               _n_idx_list.push(use, k);
2384           }
2385         }
2386       }
2387     }
2388   }
2389 
2390   while (_n_idx_list.is_nonempty()) {
2391     Node* use = _n_idx_list.node();
2392     int   idx = _n_idx_list.index();
2393     _n_idx_list.pop();
2394     Node* def = use->in(idx);
2395 
2396     if (def->is_reduction()) continue;
2397 
2398     // Insert extract operation
2399     _igvn.hash_delete(def);
2400     int def_pos = alignment(def) / data_size(def);
2401 
2402     Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
2403     _igvn.register_new_node_with_optimizer(ex);
2404     _phase->set_ctrl(ex, _phase->get_ctrl(def));
2405     _igvn.replace_input_of(use, idx, ex);
2406     _igvn._worklist.push(def);
2407 
2408     bb_insert_after(ex, bb_idx(def));
2409     set_velt_type(ex, velt_type(def));
2410   }
2411 }
2412 
2413 //------------------------------is_vector_use---------------------------
2414 // Is use->in(u_idx) a vector use?
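     // Illustrative example: with u_pk == {AddI_0..AddI_3} and d_pk ==
     // {LoadI_0..LoadI_3}, in(u_idx) is a vector use only if AddI_k consumes
     // LoadI_k for every k and the per-lane alignments match; if the defs are
     // not packed at all, the input must be the same node in every lane
     // (scalar promotion).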
2415 bool SuperWord::is_vector_use(Node* use, int u_idx) {
2416   Node_List* u_pk = my_pack(use);
2417   if (u_pk == NULL) return false;
2418   if (use->is_reduction()) return true;
2419   Node* def = use->in(u_idx);
2420   Node_List* d_pk = my_pack(def);
2421   if (d_pk == NULL) {
2422     // check for scalar promotion
2423     Node* n = u_pk->at(0)->in(u_idx);
2424     for (uint i = 1; i < u_pk->size(); i++) {
2425       if (u_pk->at(i)->in(u_idx) != n) return false;
2426     }
2427     return true;
2428   }
2429   if (u_pk->size() != d_pk->size())
2430     return false;
2431   for (uint i = 0; i < u_pk->size(); i++) {
2432     Node* ui = u_pk->at(i);
2433     Node* di = d_pk->at(i);
2434     if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
2435       return false;
2436   }
2437   return true;
2438 }
2439 
2440 //------------------------------construct_bb---------------------------
2441 // Construct reverse postorder list of block members
2442 bool SuperWord::construct_bb() {
2443   Node* entry = bb();
2444 
2445   assert(_stk.length() == 0,            "stk is empty");
2446   assert(_block.length() == 0,          "block is empty");
2447   assert(_data_entry.length() == 0,     "data_entry is empty");
2448   assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
2449   assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
2450 
2451   // Find non-control nodes with no inputs from within block,
2452   // create a temporary map from node _idx to bb_idx for use
2453   // by the visited and post_visited sets,
2454   // and count number of nodes in block.
2455   int bb_ct = 0;
2456   for (uint i = 0; i < lpt()->_body.size(); i++) {
2457     Node *n = lpt()->_body.at(i);
2458     set_bb_idx(n, i); // Create a temporary map
2459     if (in_bb(n)) {
2460       if (n->is_LoadStore() || n->is_MergeMem() ||
2461           (n->is_Proj() && !n->as_Proj()->is_CFG())) {
2462         // Bailout if the loop has LoadStore, MergeMem or data Proj
2463         // nodes. Superword optimization does not work with them.
2464         return false;
2465       }
2466       bb_ct++;
2467       if (!n->is_CFG()) {
2468         bool found = false;
2469         for (uint j = 0; j < n->req(); j++) {
2470           Node* def = n->in(j);
2471           if (def && in_bb(def)) {
2472             found = true;
2473             break;
2474           }
2475         }
2476         if (!found) {
2477           assert(n != entry, "can't be entry");
2478           _data_entry.push(n);
2479         }
2480       }
2481     }
2482   }
2483 
2484   // Find memory slices (head and tail)
2485   for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
2486     Node *n = lp()->fast_out(i);
2487     if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
2488       Node* n_tail  = n->in(LoopNode::LoopBackControl);
2489       if (n_tail != n->in(LoopNode::EntryControl)) {
2490         if (!n_tail->is_Mem()) {
2491           assert(n_tail->is_Mem(), "unexpected node for memory slice: %s", n_tail->Name());
2492           return false; // Bailout
2493         }
2494         _mem_slice_head.push(n);
2495         _mem_slice_tail.push(n_tail);
2496       }
2497     }
2498   }
2499 
2500   // Create an RPO list of nodes in block
2501 
2502   visited_clear();
2503   post_visited_clear();
2504 
2505   // Push all non-control nodes with no inputs from within block, then control entry
2506   for (int j = 0; j < _data_entry.length(); j++) {
2507     Node* n = _data_entry.at(j);
2508     visited_set(n);
2509     _stk.push(n);
2510   }
2511   visited_set(entry);
2512   _stk.push(entry);
2513 
2514   // Do a depth first walk over out edges
2515   int rpo_idx = bb_ct - 1;
2516   int size;
2517   int reduction_uses = 0;
2518   while ((size = _stk.length()) > 0) {
2519     Node* n = _stk.top(); // Leave node on stack
2520     if (!visited_test_set(n)) {
2521       // forward arc in graph
2522     } else if (!post_visited_test(n)) {
2523       // cross or back arc
2524       for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2525         Node *use = n->fast_out(i);
2526         if (in_bb(use) && !visited_test(use) &&
2527             // Don't go around backedge
2528             (!use->is_Phi() || n == entry)) {
2529           if (use->is_reduction()) {
2530             // First see if the reduction can be mapped on the current platform,
2531             // then count a reduction use for each one we see.
2532             BasicType bt = use->bottom_type()->basic_type();
2533             if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
2534               reduction_uses++;
2535             }
2536           }
2537           _stk.push(use);
2538         }
2539       }
2540       if (_stk.length() == size) {
2541         // There were no additional uses, post visit node now
2542         _stk.pop(); // Remove node from stack
2543         assert(rpo_idx >= 0, "");
2544         _block.at_put_grow(rpo_idx, n);
2545         rpo_idx--;
2546         post_visited_set(n);
2547         assert(rpo_idx >= 0 || _stk.is_empty(), "");
2548       }
2549     } else {
2550       _stk.pop(); // Remove post-visited node from stack
2551     }
2552   }//while
2553 
2554   int ii_current = -1;
2555   unsigned int load_idx = (unsigned int)-1;
2556   _ii_order.clear();
2557   // Create real map of block indices for nodes
2558   for (int j = 0; j < _block.length(); j++) {
2559     Node* n = _block.at(j);
2560     set_bb_idx(n, j);
2561     if (_do_vector_loop && n->is_Load()) {
2562       if (ii_current == -1) {
2563         ii_current = _clone_map.gen(n->_idx);
2564         _ii_order.push(ii_current);
2565         load_idx = _clone_map.idx(n->_idx);
2566       } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
2567         ii_current = _clone_map.gen(n->_idx);
2568         _ii_order.push(ii_current);
2569       }
2570     }
2571   }//for
2572 
2573   // Ensure extra info is allocated.
2574   initialize_bb();
2575 
2576 #ifndef PRODUCT
2577   if (_vector_loop_debug && _ii_order.length() > 0) {
2578     tty->print("SuperWord::construct_bb: List of generations: ");
2579     for (int jj = 0; jj < _ii_order.length(); ++jj) {
2580       tty->print("  %d:%d", jj, _ii_order.at(jj));
2581     }
2582     tty->print_cr(" ");
2583   }
2584   if (TraceSuperWord) {
2585     print_bb();
2586     tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
2587     for (int m = 0; m < _data_entry.length(); m++) {
2588       tty->print("%3d ", m);
2589       _data_entry.at(m)->dump();
2590     }
2591     tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
2592     for (int m = 0; m < _mem_slice_head.length(); m++) {
2593       tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
2594       tty->print("    ");    _mem_slice_tail.at(m)->dump();
2595     }
2596   }
2597 #endif
2598   assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
2599   return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
2600 }
2601 
2602 //------------------------------initialize_bb---------------------------
2603 // Initialize per node info
2604 void SuperWord::initialize_bb() {
2605   Node* last = _block.at(_block.length() - 1);
2606   grow_node_info(bb_idx(last));
2607 }
2608 
2609 //------------------------------bb_insert_after---------------------------
2610 // Insert n into block after pos
2611 void SuperWord::bb_insert_after(Node* n, int pos) {
2612   int n_pos = pos + 1;
2613   // Make room
2614   for (int i = _block.length() - 1; i >= n_pos; i--) {
2615     _block.at_put_grow(i+1, _block.at(i));
2616   }
2617   for (int j = _node_info.length() - 1; j >= n_pos; j--) {
2618     _node_info.at_put_grow(j+1, _node_info.at(j));
2619   }
2620   // Set value
2621   _block.at_put_grow(n_pos, n);
2622   _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
2623   // Adjust map from node->_idx to _block index
2624   for (int i = n_pos; i < _block.length(); i++) {
2625     set_bb_idx(_block.at(i), i);
2626   }
2627 }
2628 
2629 //------------------------------compute_max_depth---------------------------
2630 // Compute max depth for expressions from beginning of block
2631 // Used to prune search paths during the test for independence.
2632 void SuperWord::compute_max_depth() {
2633   int ct = 0;
2634   bool again;
2635   do {
2636     again = false;
2637     for (int i = 0; i < _block.length(); i++) {
2638       Node* n = _block.at(i);
2639       if (!n->is_Phi()) {
2640         int d_orig = depth(n);
2641         int d_in   = 0;
2642         for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
2643           Node* pred = preds.current();
2644           if (in_bb(pred)) {
2645             d_in = MAX2(d_in, depth(pred));
2646           }
2647         }
2648         if (d_in + 1 != d_orig) {
2649           set_depth(n, d_in + 1);
2650           again = true;
2651         }
2652       }
2653     }
2654     ct++;
2655   } while (again);
2656 #ifndef PRODUCT
2657   if (TraceSuperWord && Verbose)
2658     tty->print_cr("compute_max_depth iterated: %d times", ct);
2659 #endif
2660 }
2661 
2662 //-------------------------compute_vector_element_type-----------------------
2663 // Compute necessary vector element type for expressions
2664 // This propagates backwards a narrower integer type when the
2665 // upper bits of the value are not needed.
2666 // Example:  char a,b,c;  a = b + c;
2667 // Normally the type of the add is integer, but for packed character
2668 // operations the type of the add needs to be char.
2669 void SuperWord::compute_vector_element_type() {
2670 #ifndef PRODUCT
2671   if (TraceSuperWord && Verbose)
2672     tty->print_cr("\ncompute_velt_type:");
2673 #endif
2674 
2675   // Initial type
2676   for (int i = 0; i < _block.length(); i++) {
2677     Node* n = _block.at(i);
2678     set_velt_type(n, container_type(n));
2679   }
2680 
2681   // Propagate integer narrowed type backwards through operations
2682   // that don't depend on higher order bits
2683   for (int i = _block.length() - 1; i >= 0; i--) {
2684     Node* n = _block.at(i);
2685     // Only integer types need be examined
2686     const Type* vtn = velt_type(n);
2687     if (vtn->basic_type() == T_INT) {
2688       uint start, end;
2689       VectorNode::vector_operands(n, &start, &end);
2690 
2691       for (uint j = start; j < end; j++) {
2692         Node* in  = n->in(j);
2693         // Don't propagate through a memory
2694         if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
2695             data_size(n) < data_size(in)) {
2696           bool same_type = true;
2697           for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
2698             Node *use = in->fast_out(k);
2699             if (!in_bb(use) || !same_velt_type(use, n)) {
2700               same_type = false;
2701               break;
2702             }
2703           }
2704           if (same_type) {
2705             // For right shifts of small integer types (bool, byte, char, short)
2706             // we need precise information about signedness. Only Load nodes have
2707             // this information because Store nodes are the same for signed and
2708             // unsigned values. And any arithmetic operation after a load may
2709             // expand a value to signed Int so such right shifts can't be used
2710             // because vector elements do not have upper bits of Int.
2711             const Type* vt = vtn;
2712             if (VectorNode::is_shift(in)) {
2713               Node* load = in->in(1);
2714               if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
2715                 vt = velt_type(load);
2716               } else if (in->Opcode() != Op_LShiftI) {
2717                 // Widen type to Int to avoid creation of right shift vector
2718                 // (align + data_size(s1) check in stmts_can_pack() will fail).
2719                 // Note, left shifts work regardless of the type.
2720                 vt = TypeInt::INT;
2721               }
2722             }
2723             set_velt_type(in, vt);
2724           }
2725         }
2726       }
2727     }
2728   }
2729 #ifndef PRODUCT
2730   if (TraceSuperWord && Verbose) {
2731     for (int i = 0; i < _block.length(); i++) {
2732       Node* n = _block.at(i);
2733       velt_type(n)->dump();
2734       tty->print("\t");
2735       n->dump();
2736     }
2737   }
2738 #endif
2739 }
2740 
2741 //------------------------------memory_alignment---------------------------
2742 // Alignment within a vector memory reference
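     // Illustrative example (sizes assumed): for a 4-byte int access with
     // offset_in_bytes() == 20, iv_adjust == 0 and a 16-byte vector width,
     // off_rem == 4, so the returned alignment is 4.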
2743 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2744   #ifndef PRODUCT
2745     if(TraceSuperWord && Verbose) {
2746       tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
2747     }
2748   #endif
2749   NOT_PRODUCT(SWPointer::Tracer::Depth ddd(0);)
2750   SWPointer p(s, this, NULL, false);
2751   if (!p.valid()) {
2752     NOT_PRODUCT(if(is_trace_alignment()) tty->print("SuperWord::memory_alignment: SWPointer p invalid, return bottom_align");)
2753     return bottom_align;
2754   }
2755   int vw = vector_width_in_bytes(s);
2756   if (vw < 2) {
2757     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
2758     return bottom_align; // No vectors for this type
2759   }
2760   int offset  = p.offset_in_bytes();
2761   offset     += iv_adjust*p.memory_size();
2762   int off_rem = offset % vw;
2763   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2764   NOT_PRODUCT(if(TraceSuperWord && Verbose) tty->print_cr("SuperWord::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);)
2765   return off_mod;
2766 }
2767 
2768 //---------------------------container_type---------------------------
2769 // Smallest type containing range of values
2770 const Type* SuperWord::container_type(Node* n) {
2771   if (n->is_Mem()) {
2772     BasicType bt = n->as_Mem()->memory_type();
2773     if (n->is_Store() && (bt == T_CHAR)) {
2774       // Use T_SHORT type instead of T_CHAR for stored values because any
2775       // preceding arithmetic operation extends values to signed Int.
2776       bt = T_SHORT;
2777     }
2778     if (n->Opcode() == Op_LoadUB) {
2779       // Adjust type for unsigned byte loads, it is important for right shifts.
2780       // T_BOOLEAN is used because there is no basic type representing type
2781       // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
2782       // size (one byte) and sign is important.
2783       bt = T_BOOLEAN;
2784     }
2785     return Type::get_const_basic_type(bt);
2786   }
2787   const Type* t = _igvn.type(n);
2788   if (t->basic_type() == T_INT) {
2789     // A narrow type of arithmetic operations will be determined by
2790     // propagating the type of memory operations.
2791     return TypeInt::INT;
2792   }
2793   return t;
2794 }
2795 
2796 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
2797   const Type* vt1 = velt_type(n1);
2798   const Type* vt2 = velt_type(n2);
2799   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
2800     // Compare vectors element sizes for integer types.
2801     return data_size(n1) == data_size(n2);
2802   }
2803   return vt1 == vt2;
2804 }
2805 
2806 //------------------------------in_packset---------------------------
2807 // Are s1 and s2 in a pack pair and ordered as s1,s2?
2808 bool SuperWord::in_packset(Node* s1, Node* s2) {
2809   for (int i = 0; i < _packset.length(); i++) {
2810     Node_List* p = _packset.at(i);
2811     assert(p->size() == 2, "must be");
2812     if (p->at(0) == s1 && p->at(p->size()-1) == s2) {
2813       return true;
2814     }
2815   }
2816   return false;
2817 }
2818 
2819 //------------------------------in_pack---------------------------
2820 // Is s in pack p?
2821 Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
2822   for (uint i = 0; i < p->size(); i++) {
2823     if (p->at(i) == s) {
2824       return p;
2825     }
2826   }
2827   return NULL;
2828 }
2829 
2830 //------------------------------remove_pack_at---------------------------
2831 // Remove the pack at position pos in the packset
2832 void SuperWord::remove_pack_at(int pos) {
2833   Node_List* p = _packset.at(pos);
2834   for (uint i = 0; i < p->size(); i++) {
2835     Node* s = p->at(i);
2836     set_my_pack(s, NULL);
2837   }
2838   _packset.remove_at(pos);
2839 }
2840 
2841 void SuperWord::packset_sort(int n) {
2842   // Simple bubble sort so that we finish in O(n) when the packset is already sorted
2843   while (n != 0) {
2844     bool swapped = false;
2845     for (int i = 1; i < n; i++) {
2846       Node_List* q_low = _packset.at(i-1);
2847       Node_List* q_i = _packset.at(i);
2848 
2849       // only swap when we find something to swap
2850       if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
2852         *(_packset.adr_at(i)) = q_low;
2853         *(_packset.adr_at(i-1)) = q_i;
2854         swapped = true;
2855       }
2856     }
2857     if (!swapped) break;
2858     n--;
2859   }
2860 }
2861 
2862 //------------------------------executed_first---------------------------
2863 // Return the node executed first in pack p.  Uses the RPO block list
2864 // to determine order.
2865 Node* SuperWord::executed_first(Node_List* p) {
2866   Node* n = p->at(0);
2867   int n_rpo = bb_idx(n);
2868   for (uint i = 1; i < p->size(); i++) {
2869     Node* s = p->at(i);
2870     int s_rpo = bb_idx(s);
2871     if (s_rpo < n_rpo) {
2872       n = s;
2873       n_rpo = s_rpo;
2874     }
2875   }
2876   return n;
2877 }
2878 
2879 //------------------------------executed_last---------------------------
2880 // Return the node executed last in pack p.
2881 Node* SuperWord::executed_last(Node_List* p) {
2882   Node* n = p->at(0);
2883   int n_rpo = bb_idx(n);
2884   for (uint i = 1; i < p->size(); i++) {
2885     Node* s = p->at(i);
2886     int s_rpo = bb_idx(s);
2887     if (s_rpo > n_rpo) {
2888       n = s;
2889       n_rpo = s_rpo;
2890     }
2891   }
2892   return n;
2893 }
2894 
2895 LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
2896   LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
2897   for (uint i = 0; i < p->size(); i++) {
2898     Node* n = p->at(i);
2899     assert(n->is_Load(), "only meaningful for loads");
2900     if (!n->depends_only_on_test()) {
2901       dep = LoadNode::Pinned;
2902     }
2903   }
2904   return dep;
2905 }
2906 
2907 
2908 //----------------------------align_initial_loop_index---------------------------
2909 // Adjust pre-loop limit so that in main loop, a load/store reference
2910 // to align_to_ref will be at position zero in the vector.
2911 //   (iv + k) mod vector_align == 0
2912 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2913   CountedLoopNode *main_head = lp()->as_CountedLoop();
2914   assert(main_head->is_main_loop(), "");
2915   CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2916   assert(pre_end != NULL, "we must have a correct pre-loop");
2917   Node *pre_opaq1 = pre_end->limit();
2918   assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2919   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2920   Node *lim0 = pre_opaq->in(1);
2921 
2922   // Where we put new limit calculations
2923   Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2924 
2925   // Ensure the original loop limit is available from the
2926   // pre-loop Opaque1 node.
2927   Node *orig_limit = pre_opaq->original_loop_limit();
2928   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2929 
2930   SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
2931   assert(align_to_ref_p.valid(), "sanity");
2932 
2933   // Given:
2934   //     lim0 == original pre loop limit
2935   //     V == v_align (power of 2)
2936   //     invar == extra invariant piece of the address expression
2937   //     e == offset [ +/- invar ]
2938   //
2939   // When reassociating expressions involving '%' the basic rules are:
2940   //     (a - b) % k == 0   =>  a % k == b % k
2941   // and:
2942   //     (a + b) % k == 0   =>  a % k == (k - b) % k
2943   //
2944   // For stride > 0 && scale > 0,
2945   //   Derive the new pre-loop limit "lim" such that the two constraints:
2946   //     (1) lim = lim0 + N           (where N is some positive integer < V)
2947   //     (2) (e + lim) % V == 0
2948   //   are true.
2949   //
2950   //   Substituting (1) into (2),
2951   //     (e + lim0 + N) % V == 0
2952   //   solve for N:
2953   //     N = (V - (e + lim0)) % V
2954   //   substitute back into (1), so that new limit
2955   //     lim = lim0 + (V - (e + lim0)) % V
2956   //
2957   // For stride > 0 && scale < 0
2958   //   Constraints:
2959   //     lim = lim0 + N
2960   //     (e - lim) % V == 0
2961   //   Solving for lim:
2962   //     (e - lim0 - N) % V == 0
2963   //     N = (e - lim0) % V
2964   //     lim = lim0 + (e - lim0) % V
2965   //
2966   // For stride < 0 && scale > 0
2967   //   Constraints:
2968   //     lim = lim0 - N
2969   //     (e + lim) % V == 0
2970   //   Solving for lim:
2971   //     (e + lim0 - N) % V == 0
2972   //     N = (e + lim0) % V
2973   //     lim = lim0 - (e + lim0) % V
2974   //
2975   // For stride < 0 && scale < 0
2976   //   Constraints:
2977   //     lim = lim0 - N
2978   //     (e - lim) % V == 0
2979   //   Solving for lim:
2980   //     (e - lim0 + N) % V == 0
2981   //     N = (V - (e - lim0)) % V
2982   //     lim = lim0 - (V - (e - lim0)) % V
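       //
       // Illustrative example (hypothetical numbers, stride > 0 && scale > 0):
       //   Let V == 4, e == 3, lim0 == 10.  Then
       //     N   = (V - (e + lim0)) % V = (4 - 13) % 4, computed below as (-9) & (V-1) = 3
       //     lim = lim0 + N = 13
       //   and indeed (e + lim) % V == 16 % 4 == 0, so the first main-loop
       //   reference to align_to_ref lands at a vector-aligned position.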
2983 
2984   int vw = vector_width_in_bytes(align_to_ref);
2985   int stride   = iv_stride();
2986   int scale    = align_to_ref_p.scale_in_bytes();
2987   int elt_size = align_to_ref_p.memory_size();
2988   int v_align  = vw / elt_size;
2989   assert(v_align > 1, "sanity");
2990   int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
2991   Node *offsn  = _igvn.intcon(offset);
2992 
2993   Node *e = offsn;
2994   if (align_to_ref_p.invar() != NULL) {
2995     // incorporate any extra invariant piece: e = offset +/- (invar >>> log2(elt))
2996     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
2997     Node* aref     = new URShiftINode(align_to_ref_p.invar(), log2_elt);
2998     _igvn.register_new_node_with_optimizer(aref);
2999     _phase->set_ctrl(aref, pre_ctrl);
3000     if (align_to_ref_p.negate_invar()) {
3001       e = new SubINode(e, aref);
3002     } else {
3003       e = new AddINode(e, aref);
3004     }
3005     _igvn.register_new_node_with_optimizer(e);
3006     _phase->set_ctrl(e, pre_ctrl);
3007   }
3008   if (vw > ObjectAlignmentInBytes) {
3009     // incorporate the object base address: e += (base & (vw-1)) >>> log2(elt)
3010     Node* xbase = new CastP2XNode(NULL, align_to_ref_p.base());
3011     _igvn.register_new_node_with_optimizer(xbase);
3012 #ifdef _LP64
3013     xbase  = new ConvL2INode(xbase);
3014     _igvn.register_new_node_with_optimizer(xbase);
3015 #endif
3016     Node* mask = _igvn.intcon(vw-1);
3017     Node* masked_xbase  = new AndINode(xbase, mask);
3018     _igvn.register_new_node_with_optimizer(masked_xbase);
3019     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3020     Node* bref     = new URShiftINode(masked_xbase, log2_elt);
3021     _igvn.register_new_node_with_optimizer(bref);
3022     _phase->set_ctrl(bref, pre_ctrl);
3023     e = new AddINode(e, bref);
3024     _igvn.register_new_node_with_optimizer(e);
3025     _phase->set_ctrl(e, pre_ctrl);
3026   }
3027 
3028   // compute e +/- lim0
3029   if (scale < 0) {
3030     e = new SubINode(e, lim0);
3031   } else {
3032     e = new AddINode(e, lim0);
3033   }
3034   _igvn.register_new_node_with_optimizer(e);
3035   _phase->set_ctrl(e, pre_ctrl);
3036 
3037   if (stride * scale > 0) {
3038     // compute V - (e +/- lim0)
3039     Node* va  = _igvn.intcon(v_align);
3040     e = new SubINode(va, e);
3041     _igvn.register_new_node_with_optimizer(e);
3042     _phase->set_ctrl(e, pre_ctrl);
3043   }
3044   // compute N = (exp) % V
3045   Node* va_msk = _igvn.intcon(v_align - 1);
3046   Node* N = new AndINode(e, va_msk);
3047   _igvn.register_new_node_with_optimizer(N);
3048   _phase->set_ctrl(N, pre_ctrl);
3049 
3050   //   substitute back into (1), so that the new limit is
3051   //     lim = lim0 +/- N  (+ for stride > 0, - for stride < 0)
3052   Node* lim;
3053   if (stride < 0) {
3054     lim = new SubINode(lim0, N);
3055   } else {
3056     lim = new AddINode(lim0, N);
3057   }
3058   _igvn.register_new_node_with_optimizer(lim);
3059   _phase->set_ctrl(lim, pre_ctrl);
3060   Node* constrained =
3061     (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
3062                  : (Node*) new MaxINode(lim, orig_limit);
3063   _igvn.register_new_node_with_optimizer(constrained);
3064   _phase->set_ctrl(constrained, pre_ctrl);
3065   _igvn.hash_delete(pre_opaq);
3066   pre_opaq->set_req(1, constrained);
3067 }
3068 
3069 //----------------------------get_pre_loop_end---------------------------
3070 // Find pre loop end from main loop.  Returns null if none.
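     // The shape being matched, walking up from the main loop's entry control, is:
     //   main CountedLoop entry <- IfTrue/IfFalse <- If(Bool(Cmp(_, Opaque1)))  (zero-trip guard)
     //                                                <- IfFalse <- CountedLoopEnd of the pre-loop
     // Any deviation from this shape makes the lookup return NULL.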
3071 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
3072   Node* ctrl = cl->in(LoopNode::EntryControl);
3073   if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return NULL;
3074   Node* iffm = ctrl->in(0);
3075   if (!iffm->is_If()) return NULL;
3076   Node* bolzm = iffm->in(1);
3077   if (!bolzm->is_Bool()) return NULL;
3078   Node* cmpzm = bolzm->in(1);
3079   if (!cmpzm->is_Cmp()) return NULL;
3080   Node* opqzm = cmpzm->in(2);
3081   // Cannot optimize the loop if the zero-trip Opaque1 node has been optimized
3082   // away and another round of loop opts is then attempted.
3083   if (opqzm->Opcode() != Op_Opaque1) {
3084     return NULL;
3085   }
3086   Node* p_f = iffm->in(0);
3087   if (!p_f->is_IfFalse()) return NULL;
3088   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3089   CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3090   CountedLoopNode* loop_node = pre_end->loopnode();
3091   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3092   return pre_end;
3093 }
3094 
3095 
3096 //------------------------------init---------------------------
3097 void SuperWord::init() {
3098   _dg.init();
3099   _packset.clear();
3100   _disjoint_ptrs.clear();
3101   _block.clear();
3102   _data_entry.clear();
3103   _mem_slice_head.clear();
3104   _mem_slice_tail.clear();
3105   _iteration_first.clear();
3106   _iteration_last.clear();
3107   _node_info.clear();
3108   _align_to_ref = NULL;
3109   _lpt = NULL;
3110   _lp = NULL;
3111   _bb = NULL;
3112   _iv = NULL;
3113   _race_possible = 0;
3114   _early_return = false;
3115   _num_work_vecs = 0;
3116   _num_reductions = 0;
3117 }
3118 
3119 //------------------------------restart---------------------------
3120 void SuperWord::restart() {
3121   _dg.init();
3122   _packset.clear();
3123   _disjoint_ptrs.clear();
3124   _block.clear();
3125   _data_entry.clear();
3126   _mem_slice_head.clear();
3127   _mem_slice_tail.clear();
3128   _node_info.clear();
3129 }
3130 
3131 //------------------------------print_packset---------------------------
3132 void SuperWord::print_packset() {
3133 #ifndef PRODUCT
3134   tty->print_cr("packset");
3135   for (int i = 0; i < _packset.length(); i++) {
3136     tty->print_cr("Pack: %d", i);
3137     Node_List* p = _packset.at(i);
3138     print_pack(p);
3139   }
3140 #endif
3141 }
3142 
3143 //------------------------------print_pack---------------------------
3144 void SuperWord::print_pack(Node_List* p) {
3145   for (uint i = 0; i < p->size(); i++) {
3146     print_stmt(p->at(i));
3147   }
3148 }
3149 
3150 //------------------------------print_bb---------------------------
3151 void SuperWord::print_bb() {
3152 #ifndef PRODUCT
3153   tty->print_cr("\nBlock");
3154   for (int i = 0; i < _block.length(); i++) {
3155     Node* n = _block.at(i);
3156     tty->print("%d ", i);
3157     if (n) {
3158       n->dump();
3159     }
3160   }
3161 #endif
3162 }
3163 
3164 //------------------------------print_stmt---------------------------
3165 void SuperWord::print_stmt(Node* s) {
3166 #ifndef PRODUCT
3167   tty->print(" align: %d \t", alignment(s));
3168   s->dump();
3169 #endif
3170 }
3171 
3172 //------------------------------blank---------------------------
3173 char* SuperWord::blank(uint depth) {
3174   static char blanks[101];
3175   assert(depth < 101, "too deep");
3176   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
3177   blanks[depth] = '\0';
3178   return blanks;
3179 }
3180 
3181 
3182 //==============================SWPointer===========================
3183 #ifndef PRODUCT
3184 int SWPointer::Tracer::_depth = 0;
3185 #endif
3186 //----------------------------SWPointer------------------------
3187 SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
3188   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
3189   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
3190   _nstack(nstack), _analyze_only(analyze_only),
3191   _stack_idx(0)
3192 #ifndef PRODUCT
3193   , _tracer(slp)
3194 #endif
3195 {
3196   NOT_PRODUCT(_tracer.ctor_1(mem);)
3197 
3198   Node* adr = mem->in(MemNode::Address);
3199   if (!adr->is_AddP()) {
3200     assert(!valid(), "too complex");
3201     return;
3202   }
3203   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
3204   Node* base = adr->in(AddPNode::Base);
3205   // An unsafe reference cannot be aligned appropriately without runtime checking
3206   if (base == NULL || base->bottom_type() == Type::TOP) {
3207     assert(!valid(), "unsafe access");
3208     return;
3209   }
3210   // The base address should be loop invariant
3211   if (!invariant(base)) {
3212     assert(!valid(), "base address is loop variant");
3213     return;
3214   }
3215 
3216   NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
3217   NOT_PRODUCT(_tracer.ctor_2(adr);)
3218 
3219   int i;
3220   for (i = 0; i < 3; i++) {
3221     NOT_PRODUCT(_tracer.ctor_3(adr, i);)
3222 
3223     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
3224       assert(!valid(), "too complex");
3225       return;
3226     }
3227     adr = adr->in(AddPNode::Address);
3228     NOT_PRODUCT(_tracer.ctor_4(adr, i);)
3229 
3230     if (base == adr || !adr->is_AddP()) {
3231       NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
3232       break; // stop looking at addp's
3233     }
3234   }
3235   NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
3236   NOT_PRODUCT(_tracer.ctor_6(mem);)
3237 
3238   _base = base;
3239   _adr  = adr;
3240   assert(valid(), "Usable");
3241 }
3242 
3243 // Following is used to create a temporary object during
3244 // the pattern match of an address expression.
3245 SWPointer::SWPointer(SWPointer* p) :
3246   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
3247   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
3248   _nstack(p->_nstack), _analyze_only(p->_analyze_only),
3249   _stack_idx(p->_stack_idx)
3250   #ifndef PRODUCT
3251   , _tracer(p->_slp)
3252   #endif
3253 {}
3254 
3255 
3256 bool SWPointer::invariant(Node* n) {
3257   NOT_PRODUCT(Tracer::Depth dd;)
3258   Node *n_c = phase()->get_ctrl(n);
3259   NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
3260   return !lpt()->is_member(phase()->get_loop(n_c));
3261 }
3262 //------------------------scaled_iv_plus_offset--------------------
3263 // Match: k*iv + offset
3264 // where: k is a constant that may be zero, and
3265 //        offset is (k2 [+/- invariant]) where k2 may be zero and the invariant is optional
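     // For example (illustrative shapes only): 4*iv + 16, (iv << 2) - invar, and
     // iv + (invar + 8) all match, with the two components recognized by
     // scaled_iv() and offset_plus_k() below.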
3266 bool SWPointer::scaled_iv_plus_offset(Node* n) {
3267   NOT_PRODUCT(Tracer::Depth ddd;)
3268   NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)
3269 
3270   if (scaled_iv(n)) {
3271     NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
3272     return true;
3273   }
3274 
3275   if (offset_plus_k(n)) {
3276     NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
3277     return true;
3278   }
3279 
3280   int opc = n->Opcode();
3281   if (opc == Op_AddI) {
3282     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
3283       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
3284       return true;
3285     }
3286     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
3287       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
3288       return true;
3289     }
3290   } else if (opc == Op_SubI) {
3291     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2), true)) {
3292       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
3293       return true;
3294     }
3295     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
3296       _scale *= -1;
3297       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
3298       return true;
3299     }
3300   }
3301 
3302   NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
3303   return false;
3304 }
3305 
3306 //----------------------------scaled_iv------------------------
3307 // Match: k*iv where k is a constant that's not zero
3308 bool SWPointer::scaled_iv(Node* n) {
3309   NOT_PRODUCT(Tracer::Depth ddd;)
3310   NOT_PRODUCT(_tracer.scaled_iv_1(n);)
3311 
3312   if (_scale != 0) { // already found a scale
3313     NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
3314     return false;
3315   }
3316 
3317   if (n == iv()) {
3318     _scale = 1;
3319     NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
3320     return true;
3321   }
3322   if (_analyze_only && (invariant(n) == false)) {
3323     _nstack->push(n, _stack_idx++);
3324   }
3325 
3326   int opc = n->Opcode();
3327   if (opc == Op_MulI) {
3328     if (n->in(1) == iv() && n->in(2)->is_Con()) {
3329       _scale = n->in(2)->get_int();
3330       NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
3331       return true;
3332     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
3333       _scale = n->in(1)->get_int();
3334       NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
3335       return true;
3336     }
3337   } else if (opc == Op_LShiftI) {
3338     if (n->in(1) == iv() && n->in(2)->is_Con()) {
3339       _scale = 1 << n->in(2)->get_int();
3340       NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
3341       return true;
3342     }
3343   } else if (opc == Op_ConvI2L) {
3344     if (scaled_iv_plus_offset(n->in(1))) {
3345       NOT_PRODUCT(_tracer.scaled_iv_7(n);)
3346       return true;
3347     }
3348   } else if (opc == Op_LShiftL) {
3349     if (!has_iv() && _invar == NULL) {
3350       // Need to preserve the current _offset value, so
3351       // create a temporary object for this expression subtree.
3352       // Hacky, so should re-engineer the address pattern match.
3353       NOT_PRODUCT(Tracer::Depth dddd;)
3354       SWPointer tmp(this);
3355       NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
3356 
3357       if (tmp.scaled_iv_plus_offset(n->in(1))) {
3358         if (tmp._invar == NULL || _slp->do_vector_loop()) {
3359           int mult = 1 << n->in(2)->get_int();
3360           _scale   = tmp._scale  * mult;
3361           _offset += tmp._offset * mult;
3362           NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, mult);)
3363           return true;
3364         }
3365       }
3366     }
3367   }
3368   NOT_PRODUCT(_tracer.scaled_iv_10(n);)
3369   return false;
3370 }
3371 
3372 //----------------------------offset_plus_k------------------------
3373 // Match: offset is (k [+/- invariant])
3374 // where k may be zero and the invariant is optional, but not both.
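     // For example (illustrative shapes only): 8, invar, invar + 8, invar - 8 and
     // 8 - invar all match; the 'negate' argument flips the signs when the caller
     // is matching the right-hand side of a SubI.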
3375 bool SWPointer::offset_plus_k(Node* n, bool negate) {
3376   NOT_PRODUCT(Tracer::Depth ddd;)
3377   NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
3378 
3379   int opc = n->Opcode();
3380   if (opc == Op_ConI) {
3381     _offset += negate ? -(n->get_int()) : n->get_int();
3382     NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
3383     return true;
3384   } else if (opc == Op_ConL) {
3385     // Okay if value fits into an int
3386     const TypeLong* t = n->find_long_type();
3387     if (t->higher_equal(TypeLong::INT)) {
3388       jlong loff = n->get_long();
3389       jint  off  = (jint)loff;
3390       _offset += negate ? -off : off;
3391       NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
3392       return true;
3393     }
3394     NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
3395     return false;
3396   }
3397   if (_invar != NULL) { // already has an invariant
3398     NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
3399     return false;
3400   }
3401 
3402   if (_analyze_only && (invariant(n) == false)) {
3403     _nstack->push(n, _stack_idx++);
3404   }
3405   if (opc == Op_AddI) {
3406     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3407       _negate_invar = negate;
3408       _invar = n->in(1);
3409       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3410       NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
3411       return true;
3412     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3413       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3414       _negate_invar = negate;
3415       _invar = n->in(2);
3416       NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
3417       return true;
3418     }
3419   }
3420   if (opc == Op_SubI) {
3421     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3422       _negate_invar = negate;
3423       _invar = n->in(1);
3424       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3425       NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
3426       return true;
3427     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3428       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3429       _negate_invar = !negate;
3430       _invar = n->in(2);
3431       NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
3432       return true;
3433     }
3434   }
3435   if (invariant(n)) {
3436     if (opc == Op_ConvI2L) {
3437       n = n->in(1);
3438     }
3439     _negate_invar = negate;
3440     _invar = n;
3441     NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
3442     return true;
3443   }
3444 
3445   NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
3446   return false;
3447 }
3448 
3449 //----------------------------print------------------------
3450 void SWPointer::print() {
3451 #ifndef PRODUCT
3452   tty->print("base: %d  adr: %d  scale: %d  offset: %d  invar: %c%d\n",
3453              _base != NULL ? _base->_idx : 0,
3454              _adr  != NULL ? _adr->_idx  : 0,
3455              _scale, _offset,
3456              _negate_invar?'-':'+',
3457              _invar != NULL ? _invar->_idx : 0);
3458 #endif
3459 }
3460 
3461 //----------------------------tracing------------------------
3462 #ifndef PRODUCT
3463 void SWPointer::Tracer::print_depth() {
3464   for (int ii = 0; ii<_depth; ++ii) tty->print("  ");
3465 }
3466 
3467 void SWPointer::Tracer::ctor_1 (Node* mem) {
3468   if(_slp->is_trace_alignment()) {
3469     print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
3470   }
3471 }
3472 
3473 void SWPointer::Tracer::ctor_2(Node* adr) {
3474   if(_slp->is_trace_alignment()) {
3475     //store_depth();
3476     inc_depth();
3477     print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
3478     inc_depth();
3479     print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
3480   }
3481 }
3482 
3483 void SWPointer::Tracer::ctor_3(Node* adr, int i) {
3484   if(_slp->is_trace_alignment()) {
3485     inc_depth();
3486     Node* offset = adr->in(AddPNode::Offset);
3487     print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
3488   }
3489 }
3490 
3491 void SWPointer::Tracer::ctor_4(Node* adr, int i) {
3492   if(_slp->is_trace_alignment()) {
3493     inc_depth();
3494     print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
3495   }
3496 }
3497 
3498 void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
3499   if(_slp->is_trace_alignment()) {
3500     inc_depth();
3501     if (base == adr) {
3502       print_depth(); tty->print_cr("  \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
3503     } else if (!adr->is_AddP()) {
3504       print_depth(); tty->print_cr("  \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
3505     }
3506   }
3507 }
3508 
3509 void SWPointer::Tracer::ctor_6(Node* mem) {
3510   if(_slp->is_trace_alignment()) {
3511     //restore_depth();
3512     print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
3513   }
3514 }
3515 
3516 void SWPointer::Tracer::invariant_1(Node *n, Node *n_c) {
3517   if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
3518     int is_member =  _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
3519     int in_bb     =  _slp->in_bb(n);
3520     print_depth(); tty->print("  \\ ");  tty->print_cr(" %d SWPointer::invariant  conditions differ: n_c %d", n->_idx, n_c->_idx);
3521     print_depth(); tty->print("  \\ ");  tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
3522     print_depth(); tty->print("  \\ ");  n->dump();
3523     print_depth(); tty->print("  \\ ");  n_c->dump();
3524   }
3525 }
3526 
3527 void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
3528   if(_slp->is_trace_alignment()) {
3529     print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
3530     n->dump();
3531   }
3532 }
3533 
3534 void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
3535   if(_slp->is_trace_alignment()) {
3536     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3537   }
3538 }
3539 
3540 void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
3541   if(_slp->is_trace_alignment()) {
3542     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3543   }
3544 }
3545 
3546 void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
3547   if(_slp->is_trace_alignment()) {
3548     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3549     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
3550     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
3551   }
3552 }
3553 
3554 void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
3555   if(_slp->is_trace_alignment()) {
3556     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3557     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
3558     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
3559   }
3560 }
3561 
3562 void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
3563   if(_slp->is_trace_alignment()) {
3564     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3565     print_depth(); tty->print("  \\  %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
3566     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
3567   }
3568 }
3569 
3570 void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
3571   if(_slp->is_trace_alignment()) {
3572     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3573     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
3574     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
3575   }
3576 }
3577 
3578 void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
3579   if(_slp->is_trace_alignment()) {
3580     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
3581   }
3582 }
3583 
3584 void SWPointer::Tracer::scaled_iv_1(Node* n) {
3585   if(_slp->is_trace_alignment()) {
3586     print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
3587   }
3588 }
3589 
3590 void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
3591   if(_slp->is_trace_alignment()) {
3592     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
3593     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
3594   }
3595 }
3596 
3597 void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
3598   if(_slp->is_trace_alignment()) {
3599     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
3600   }
3601 }
3602 
3603 void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
3604   if(_slp->is_trace_alignment()) {
3605     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
3606     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
3607     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3608   }
3609 }
3610 
3611 void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
3612   if(_slp->is_trace_alignment()) {
3613     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
3614     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
3615     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3616   }
3617 }
3618 
3619 void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
3620   if(_slp->is_trace_alignment()) {
3621     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
3622     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
3623     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3624   }
3625 }
3626 
3627 void SWPointer::Tracer::scaled_iv_7(Node* n) {
3628   if(_slp->is_trace_alignment()) {
3629     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
3630     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
3631     inc_depth(); inc_depth();
3632     print_depth(); n->in(1)->dump();
3633     dec_depth(); dec_depth();
3634   }
3635 }
3636 
3637 void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
3638   if(_slp->is_trace_alignment()) {
3639     print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
3640   }
3641 }
3642 
3643 void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int _offset, int mult) {
3644   if(_slp->is_trace_alignment()) {
3645     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, _offset);
3646     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset, in(2) %d used to get mult = %d: _scale = %d, _offset = %d",
3647     n->in(1)->_idx, n->in(2)->_idx, mult, scale, _offset);
3648     inc_depth(); inc_depth();
3649     print_depth(); n->in(1)->dump();
3650     print_depth(); n->in(2)->dump();
3651     dec_depth(); dec_depth();
3652   }
3653 }
3654 
3655 void SWPointer::Tracer::scaled_iv_10(Node* n) {
3656   if(_slp->is_trace_alignment()) {
3657     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
3658   }
3659 }
3660 
3661 void SWPointer::Tracer::offset_plus_k_1(Node* n) {
3662   if(_slp->is_trace_alignment()) {
3663     print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
3664   }
3665 }
3666 
3667 void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
3668   if(_slp->is_trace_alignment()) {
3669     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
3670   }
3671 }
3672 
3673 void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
3674   if(_slp->is_trace_alignment()) {
3675     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
3676   }
3677 }
3678 
3679 void SWPointer::Tracer::offset_plus_k_4(Node* n) {
3680   if(_slp->is_trace_alignment()) {
3681     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
3682     print_depth(); tty->print_cr("  \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
3683   }
3684 }
3685 
3686 void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
3687   if(_slp->is_trace_alignment()) {
3688     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
3689     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
3690   }
3691 }
3692 
3693 void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3694   if(_slp->is_trace_alignment()) {
3695     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3696     n->_idx, _negate_invar, _invar->_idx, _offset);
3697     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3698     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
3699   }
3700 }
3701 
3702 void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3703   if(_slp->is_trace_alignment()) {
3704     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3705     n->_idx, _negate_invar, _invar->_idx, _offset);
3706     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3707     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
3708   }
3709 }
3710 
3711 void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3712   if(_slp->is_trace_alignment()) {
3713     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3714     n->_idx, _negate_invar, _invar->_idx, _offset);
3715     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3716     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
3717   }
3718 }
3719 
3720 void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3721   if(_slp->is_trace_alignment()) {
3722     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
3723     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3724     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
3725   }
3726 }
3727 
3728 void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3729   if(_slp->is_trace_alignment()) {
3730     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
3731     print_depth(); tty->print_cr("  \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
3732   }
3733 }
3734 
3735 void SWPointer::Tracer::offset_plus_k_11(Node* n) {
3736   if(_slp->is_trace_alignment()) {
3737     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
3738   }
3739 }
3740 
3741 #endif
3742 // ========================= OrderedPair =====================
3743 
3744 const OrderedPair OrderedPair::initial;
3745 
3746 // ========================= SWNodeInfo =====================
3747 
3748 const SWNodeInfo SWNodeInfo::initial;
3749 
3750 
3751 // ============================ DepGraph ===========================
3752 
3753 //------------------------------make_node---------------------------
3754 // Make a new dependence graph node for an ideal node.
3755 DepMem* DepGraph::make_node(Node* node) {
3756   DepMem* m = new (_arena) DepMem(node);
3757   if (node != NULL) {
3758     assert(_map.at_grow(node->_idx) == NULL, "one init only");
3759     _map.at_put_grow(node->_idx, m);
3760   }
3761   return m;
3762 }
3763 
3764 //------------------------------make_edge---------------------------
3765 // Make a new dependence graph edge from dpred -> dsucc
3766 DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
3767   DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head());
3768   dpred->set_out_head(e);
3769   dsucc->set_in_head(e);
3770   return e;
3771 }
3772 
3773 // ========================== DepMem ========================
3774 
3775 //------------------------------in_cnt---------------------------
3776 int DepMem::in_cnt() {
3777   int ct = 0;
3778   for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
3779   return ct;
3780 }
3781 
3782 //------------------------------out_cnt---------------------------
3783 int DepMem::out_cnt() {
3784   int ct = 0;
3785   for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
3786   return ct;
3787 }
3788 
3789 //------------------------------print-----------------------------
3790 void DepMem::print() {
3791 #ifndef PRODUCT
3792   tty->print("  DepNode %d (", _node->_idx);
3793   for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
3794     Node* pred = p->pred()->node();
3795     tty->print(" %d", pred != NULL ? pred->_idx : 0);
3796   }
3797   tty->print(") [");
3798   for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
3799     Node* succ = s->succ()->node();
3800     tty->print(" %d", succ != NULL ? succ->_idx : 0);
3801   }
3802   tty->print_cr(" ]");
3803 #endif
3804 }
3805 
3806 // =========================== DepEdge =========================
3807 
3808 //------------------------------DepPreds---------------------------
3809 void DepEdge::print() {
3810 #ifndef PRODUCT
3811   tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
3812 #endif
3813 }
3814 
3815 // =========================== DepPreds =========================
3816 // Iterator over predecessor edges in the dependence graph.
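     // For loads and stores the memory input is not walked directly: memory
     // predecessors come from the dependence graph edges, while the remaining
     // inputs are iterated starting at MemNode::Address.  Other memory nodes
     // (e.g. memory Phis) use only the dependence graph edges, and non-memory
     // nodes simply iterate their regular inputs.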
3817 
3818 //------------------------------DepPreds---------------------------
3819 DepPreds::DepPreds(Node* n, DepGraph& dg) {
3820   _n = n;
3821   _done = false;
3822   if (_n->is_Store() || _n->is_Load()) {
3823     _next_idx = MemNode::Address;
3824     _end_idx  = n->req();
3825     _dep_next = dg.dep(_n)->in_head();
3826   } else if (_n->is_Mem()) {
3827     _next_idx = 0;
3828     _end_idx  = 0;
3829     _dep_next = dg.dep(_n)->in_head();
3830   } else {
3831     _next_idx = 1;
3832     _end_idx  = _n->req();
3833     _dep_next = NULL;
3834   }
3835   next();
3836 }
3837 
3838 //------------------------------next---------------------------
3839 void DepPreds::next() {
3840   if (_dep_next != NULL) {
3841     _current  = _dep_next->pred()->node();
3842     _dep_next = _dep_next->next_in();
3843   } else if (_next_idx < _end_idx) {
3844     _current  = _n->in(_next_idx++);
3845   } else {
3846     _done = true;
3847   }
3848 }
3849 
3850 // =========================== DepSuccs =========================
3851 // Iterator over successor edges in the dependence graph.
3852 
3853 //------------------------------DepSuccs---------------------------
3854 DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
3855   _n = n;
3856   _done = false;
3857   if (_n->is_Load()) {
3858     _next_idx = 0;
3859     _end_idx  = _n->outcnt();
3860     _dep_next = dg.dep(_n)->out_head();
3861   } else if (_n->is_Mem() || (_n->is_Phi() && _n->bottom_type() == Type::MEMORY)) {
3862     _next_idx = 0;
3863     _end_idx  = 0;
3864     _dep_next = dg.dep(_n)->out_head();
3865   } else {
3866     _next_idx = 0;
3867     _end_idx  = _n->outcnt();
3868     _dep_next = NULL;
3869   }
3870   next();
3871 }
3872 
3873 //-------------------------------next---------------------------
3874 void DepSuccs::next() {
3875   if (_dep_next != NULL) {
3876     _current  = _dep_next->succ()->node();
3877     _dep_next = _dep_next->next_out();
3878   } else if (_next_idx < _end_idx) {
3879     _current  = _n->raw_out(_next_idx++);
3880   } else {
3881     _done = true;
3882   }
3883 }
3884 
3885 //
3886 // --------------------------------- vectorization/simd -----------------------------------
3887 //
3888 bool SuperWord::same_origin_idx(Node* a, Node* b) const {
3889   return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
3890 }
3891 bool SuperWord::same_generation(Node* a, Node* b) const {
3892   return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
3893 }
3894 
3895 Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
3896   assert(in_bb(ld), "must be in block");
3897   if (_clone_map.gen(ld->_idx) == _ii_first) {
3898 #ifndef PRODUCT
3899     if (_vector_loop_debug) {
3900       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
3901         _clone_map.gen(ld->_idx));
3902     }
3903 #endif
3904     return NULL; // we assume that any load in the first generation is vectorizable
3905   }
3906 
3907   Node* mem = ld->in(MemNode::Memory);
3908   if (mem->outcnt() <= 1) {
3909     // we don't want to remove the only edge from mem node to load
3910 #ifndef PRODUCT
3911     if (_vector_loop_debug) {
3912       tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
3913         mem->_idx, ld->_idx);
3914       ld->dump();
3915       mem->dump();
3916     }
3917 #endif
3918     return NULL;
3919   }
3920   if (!in_bb(mem) || same_generation(mem, ld)) {
3921 #ifndef PRODUCT
3922     if (_vector_loop_debug) {
3923       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
3924         _clone_map.gen(mem->_idx));
3925     }
3926 #endif
3927     return NULL; // the load's memory input is not in the block, or is from the same unroll generation
3928   }
3929 
3930   // otherwise the first-generation clone of this load should depend on the mem-phi
3931   Node* first = first_node(ld);
3932   assert(first->is_Load(), "must be Load");
3933   Node* phi = first->as_Load()->in(MemNode::Memory);
3934   if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
3935 #ifndef PRODUCT
3936     if (_vector_loop_debug) {
3937       tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
3938       ld->dump();
3939       first->dump();
3940     }
3941 #endif
3942     return NULL;
3943   }
3944 
3945   Node* tail = NULL;
3946   for (int m = 0; m < _mem_slice_head.length(); m++) {
3947     if (_mem_slice_head.at(m) == phi) {
3948       tail = _mem_slice_tail.at(m);
3949     }
3950   }
3951   if (tail == NULL) { // check that the phi found is in the list _mem_slice_head
3952 #ifndef PRODUCT
3953     if (_vector_loop_debug) {
3954       tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
3955         ld->_idx, phi->_idx);
3956       ld->dump();
3957       phi->dump();
3958     }
3959 #endif
3960     return NULL;
3961   }
3962 
3963   // now all conditions are met
3964   return phi;
3965 }
3966 
3967 Node* SuperWord::first_node(Node* nd) {
3968   for (int ii = 0; ii < _iteration_first.length(); ii++) {
3969     Node* nnn = _iteration_first.at(ii);
3970     if (same_origin_idx(nnn, nd)) {
3971 #ifndef PRODUCT
3972       if (_vector_loop_debug) {
3973         tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
3974           nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
3975       }
3976 #endif
3977       return nnn;
3978     }
3979   }
3980 
3981 #ifndef PRODUCT
3982   if (_vector_loop_debug) {
3983     tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
3984       nd->_idx, _clone_map.idx(nd->_idx));
3985   }
3986 #endif
3987   return NULL;
3988 }
3989 
3990 Node* SuperWord::last_node(Node* nd) {
3991   for (int ii = 0; ii < _iteration_last.length(); ii++) {
3992     Node* nnn = _iteration_last.at(ii);
3993     if (same_origin_idx(nnn, nd)) {
3994 #ifndef PRODUCT
3995       if (_vector_loop_debug) {
3996         tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
3997           _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
3998       }
3999 #endif
4000       return nnn;
4001     }
4002   }
4003   return NULL;
4004 }
4005 
4006 int SuperWord::mark_generations() {
4007   Node *ii_err = NULL, *tail_err = NULL;
4008   for (int i = 0; i < _mem_slice_head.length(); i++) {
4009     Node* phi  = _mem_slice_head.at(i);
4010     assert(phi->is_Phi(), "must be phi");
4011 
4012     Node* tail = _mem_slice_tail.at(i);
4013     if (_ii_last == -1) {
4014       tail_err = tail;
4015       _ii_last = _clone_map.gen(tail->_idx);
4016     }
4017     else if (_ii_last != _clone_map.gen(tail->_idx)) {
4018 #ifndef PRODUCT
4019       if (TraceSuperWord && Verbose) {
4020         tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
4021         tail->dump();
4022         tail_err->dump();
4023       }
4024 #endif
4025       return -1;
4026     }
4027 
4028     // find first iteration in the loop
4029     for (DUIterator_Fast jmax, j = phi->fast_outs(jmax); j < jmax; j++) {
4030       Node* ii = phi->fast_out(j);
4031       if (in_bb(ii) && ii->is_Store()) { // we assume that normally only Stores of one generation have a memory dependence on the mem phi
4032         if (_ii_first == -1) {
4033           ii_err = ii;
4034           _ii_first = _clone_map.gen(ii->_idx);
4035         } else if (_ii_first != _clone_map.gen(ii->_idx)) {
4036 #ifndef PRODUCT
4037           if (TraceSuperWord && Verbose) {
4038             tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
4039             ii->dump();
4040             if (ii_err != NULL) {
4041               ii_err->dump();
4042             }
4043           }
4044 #endif
4045           return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
4046         }
4047       }
4048     }//for (DUIterator_Fast jmax,
4049   }//for (int i...
4050 
4051   if (_ii_first == -1 || _ii_last == -1) {
4052 #ifndef PRODUCT
4053     if (TraceSuperWord && Verbose) {
4054       tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
4055     }
4056 #endif
4057     return -1; // something vent wrong
4058   }
4059   // collect nodes in the first and last generations
4060   assert(_iteration_first.length() == 0, "_iteration_first must be empty");
4061   assert(_iteration_last.length() == 0, "_iteration_last must be empty");
4062   for (int j = 0; j < _block.length(); j++) {
4063     Node* n = _block.at(j);
4064     node_idx_t gen = _clone_map.gen(n->_idx);
4065     if ((signed)gen == _ii_first) {
4066       _iteration_first.push(n);
4067     } else if ((signed)gen == _ii_last) {
4068       _iteration_last.push(n);
4069     }
4070   }
4071 
4072   // building order of iterations
4073   if (_ii_order.length() == 0 && ii_err != 0) {
4074     assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
4075     Node* nd = ii_err;
4076     while (_clone_map.gen(nd->_idx) != _ii_last) {
4077       _ii_order.push(_clone_map.gen(nd->_idx));
4078       bool found = false;
4079       for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
4080         Node* use = nd->fast_out(i);
4081         if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
4082           found = true;
4083           nd = use;
4084           break;
4085         }
4086       }//for
4087 
4088       if (!found) {
4089 #ifndef PRODUCT
4090         if (TraceSuperWord && Verbose) {
4091           tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
4092         }
4093 #endif
4094         _ii_order.clear();
4095         return -1;
4096       }
4097     } //while
4098     _ii_order.push(_clone_map.gen(nd->_idx));
4099   }
4100 
4101 #ifndef PRODUCT
4102   if (_vector_loop_debug) {
4103     tty->print_cr("SuperWord::mark_generations");
4104     tty->print_cr("First generation (%d) nodes:", _ii_first);
4105     for (int ii = 0; ii < _iteration_first.length(); ii++)  _iteration_first.at(ii)->dump();
4106     tty->print_cr("Last generation (%d) nodes:", _ii_last);
4107     for (int ii = 0; ii < _iteration_last.length(); ii++)  _iteration_last.at(ii)->dump();
4108     tty->print_cr(" ");
4109 
4110     tty->print("SuperWord::List of generations: ");
4111     for (int jj = 0; jj < _ii_order.length(); ++jj) {
4112       tty->print("%d:%d ", jj, _ii_order.at(jj));
4113     }
4114     tty->print_cr(" ");
4115   }
4116 #endif
4117 
4118   return _ii_first;
4119 }
4120 
4121 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
4122   assert((gold->is_Add() && fix->is_Add()) || (gold->is_Mul() && fix->is_Mul()), "should be only Add or Mul nodes");
4123   assert(same_origin_idx(gold, fix), "should be clones of the same node");
4124   Node* gin1 = gold->in(1);
4125   Node* gin2 = gold->in(2);
4126   Node* fin1 = fix->in(1);
4127   Node* fin2 = fix->in(2);
4128   bool swapped = false;
4129 
4130   if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin2)) {
4131     if (same_origin_idx(gin1, fin1) &&
4132         same_origin_idx(gin2, fin2)) {
4133       return true; // nothing to fix
4134     }
4135     if (same_origin_idx(gin1, fin2) &&
4136         same_origin_idx(gin2, fin1)) {
4137       fix->swap_edges(1, 2);
4138       swapped = true;
4139     }
4140   }
4141   // at least one input comes from outside of bb
4142   if (gin1->_idx == fin1->_idx)  {
4143     return true; // nothing to fix
4144   }
4145   if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx))  { //swapping is expensive, check condition first
4146     fix->swap_edges(1, 2);
4147     swapped = true;
4148   }
4149 
4150   if (swapped) {
4151 #ifndef PRODUCT
4152     if (_vector_loop_debug) {
4153       tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
4154     }
4155 #endif
4156     return true;
4157   }
4158 
4159 #ifndef PRODUCT
4160   if (TraceSuperWord && Verbose) {
4161     tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
4162   }
4163 #endif
4164   return false;
4165 }
4166 
4167 bool SuperWord::pack_parallel() {
4168 #ifndef PRODUCT
4169   if (_vector_loop_debug) {
4170     tty->print_cr("SuperWord::pack_parallel: START");
4171   }
4172 #endif
4173 
4174   _packset.clear();
4175 
4176   for (int ii = 0; ii < _iteration_first.length(); ii++) {
4177     Node* nd = _iteration_first.at(ii);
4178     if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
4179       Node_List* pk = new Node_List();
4180       pk->push(nd);
4181       for (int gen = 1; gen < _ii_order.length(); ++gen) {
4182         for (int kk = 0; kk < _block.length(); kk++) {
4183           Node* clone = _block.at(kk);
4184           if (same_origin_idx(clone, nd) &&
4185               _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
4186             if (nd->is_Add() || nd->is_Mul()) {
4187               fix_commutative_inputs(nd, clone);
4188             }
4189             pk->push(clone);
4190             if (pk->size() == 4) {
4191               _packset.append(pk);
4192 #ifndef PRODUCT
4193               if (_vector_loop_debug) {
4194                 tty->print_cr("SuperWord::pack_parallel: added pack ");
4195                 pk->dump();
4196               }
4197 #endif
4198               if (_clone_map.gen(clone->_idx) != _ii_last) {
4199                 pk = new Node_List();
4200               }
4201             }
4202             break;
4203           }
4204         }
4205       }//for
4206     }//if
4207   }//for
4208 
4209 #ifndef PRODUCT
4210   if (_vector_loop_debug) {
4211     tty->print_cr("SuperWord::pack_parallel: END");
4212   }
4213 #endif
4214 
4215   return true;
4216 }
4217 
4218 bool SuperWord::hoist_loads_in_graph() {
4219   GrowableArray<Node*> loads;
4220 
4221 #ifndef PRODUCT
4222   if (_vector_loop_debug) {
4223     tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
4224   }
4225 #endif
4226 
4227   for (int i = 0; i < _mem_slice_head.length(); i++) {
4228     Node* n = _mem_slice_head.at(i);
4229     if (!in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
4230 #ifndef PRODUCT
4231       if (TraceSuperWord && Verbose) {
4232         tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
4233       }
4234 #endif
4235       continue;
4236     }
4237 
4238 #ifndef PRODUCT
4239     if (_vector_loop_debug) {
4240       tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d  = _mem_slice_head.at(%d);", n->_idx, i);
4241     }
4242 #endif
4243 
4244     for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) {
4245       Node* ld = n->fast_out(j);
4246       if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
4247         for (int k = 0; k < _block.length(); k++) {
4248           Node* ld2 = _block.at(k);
4249           if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
4250               !same_generation(ld, ld2)) { // <= do not collect the first generation ld
4251 #ifndef PRODUCT
4252             if (_vector_loop_debug) {
4253               tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
4254                 ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
4255             }
4256 #endif
4257             // could not do on-the-fly, since iterator is immutable
4258             loads.push(ld2);
4259           }
4260         }// for
4261       }//if
4262     }//for (DUIterator_Fast jmax,
4263   }//for (int i = 0; i
4264 
4265   for (int i = 0; i < loads.length(); i++) {
4266     LoadNode* ld = loads.at(i)->as_Load();
4267     Node* phi = find_phi_for_mem_dep(ld);
4268     if (phi != NULL) {
4269 #ifndef PRODUCT
4270       if (_vector_loop_debug) {
4271         tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
4272           MemNode::Memory, ld->_idx, phi->_idx);
4273       }
4274 #endif
4275       _igvn.replace_input_of(ld, MemNode::Memory, phi);
4276     }
4277   }//for
4278 
4279   restart(); // invalidate all basic structures, since we rebuilt the graph
4280 
4281 #ifndef PRODUCT
4282   if (TraceSuperWord && Verbose) {
4283     tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
4284   }
4285 #endif
4286   return true;
4287 }
4288