1 /*
   2  * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 #include "precompiled.hpp"
  25 #include "compiler/compileLog.hpp"
  26 #include "libadt/vectset.hpp"
  27 #include "memory/allocation.inline.hpp"
  28 #include "opto/addnode.hpp"
  29 #include "opto/callnode.hpp"
  30 #include "opto/castnode.hpp"
  31 #include "opto/convertnode.hpp"
  32 #include "opto/divnode.hpp"
  33 #include "opto/matcher.hpp"
  34 #include "opto/memnode.hpp"
  35 #include "opto/mulnode.hpp"
  36 #include "opto/opcodes.hpp"
  37 #include "opto/opaquenode.hpp"
  38 #include "opto/superword.hpp"
  39 #include "opto/vectornode.hpp"
  40 #include "opto/movenode.hpp"
  41 
  42 //
  43 //                  S U P E R W O R D   T R A N S F O R M
  44 //=============================================================================
  45 
  46 //------------------------------SuperWord---------------------------
  47 SuperWord::SuperWord(PhaseIdealLoop* phase) :
  48   _phase(phase),
  49   _igvn(phase->_igvn),
  50   _arena(phase->C->comp_arena()),
  51   _packset(arena(), 8,  0, NULL),         // packs for the current block
  52   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  53   _block(arena(), 8,  0, NULL),           // nodes in current block
  54   _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
  55   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  56   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  57   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  58   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  59   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  60   _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
  61   _align_to_ref(NULL),                    // memory reference to align vectors to
  62   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  63   _dg(_arena),                            // dependence graph
  64   _visited(arena()),                      // visited node set
  65   _post_visited(arena()),                 // post visited node set
  66   _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  67   _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  68   _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  69   _lpt(NULL),                             // loop tree node
  70   _lp(NULL),                              // LoopNode
  71   _bb(NULL),                              // basic block
  72   _iv(NULL),                              // induction var
  73   _race_possible(false),                  // cases where SDMU is true
  74   _early_return(true),                    // analysis evaluations routine
  75   _num_work_vecs(0),                      // amount of vector work we have
  76   _num_reductions(0),                     // amount of reduction work we have
  77   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  78   _do_reserve_copy(DoReserveCopyInSuperWord),
  79   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  80   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  81   _ii_order(arena(), 8, 0, 0)
  82 {
  83 #ifndef PRODUCT
  84   _vector_loop_debug = 0;
  85   if (_phase->C->method() != NULL) {
  86     _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  87   }
  88 
  89 #endif
  90 }
  91 
  92 //------------------------------transform_loop---------------------------
  93 void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  94   assert(UseSuperWord, "should be");
  95   // Do vectors exist on this architecture?
  96   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  97 
  98   assert(lpt->_head->is_CountedLoop(), "must be");
  99   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 100 
 101   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 102 
 103   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 104   if (post_loop_allowed) {
 105     if (cl->is_reduction_loop()) return; // no predication mapping
 106     Node *limit = cl->limit();
 107     if (limit->is_Con()) return; // non constant limits only
 108     // Now check the limit for expressions we do not handle
 109     if (limit->is_Add()) {
 110       Node *in2 = limit->in(2);
 111       if (in2->is_Con()) {
 112         int val = in2->get_int();
 113         // should not try to program these cases
 114         if (val < 0) return;
 115       }
 116     }
 117   }
 118 
 119   // skip any loop that has not been assigned max unroll by analysis
 120   if (do_optimization) {
 121     if (cl->slp_max_unroll() == 0) return;
 122   }
 123 
 124   // Check for no control flow in body (other than exit)
 125   Node *cl_exit = cl->loopexit();
 126   if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
 127     #ifndef PRODUCT
 128       if (TraceSuperWord) {
 129         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
 130         tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
 131         tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
 132         tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
 133         lpt->dump_head();
 134       }
 135     #endif
 136     return;
 137   }
 138 
 139   // Make sure the are no extra control users of the loop backedge
 140   if (cl->back_control()->outcnt() != 1) {
 141     return;
 142   }
 143 
 144   // Skip any loops already optimized by slp
 145   if (cl->is_vectorized_loop()) return;
 146 
 147   if (cl->is_main_loop()) {
 148     // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 149     CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 150     if (pre_end == NULL) return;
 151     Node *pre_opaq1 = pre_end->limit();
 152     if (pre_opaq1->Opcode() != Op_Opaque1) return;
 153   }
 154 
 155   init(); // initialize data structures
 156 
 157   set_lpt(lpt);
 158   set_lp(cl);
 159 
 160   // For now, define one block which is the entire loop body
 161   set_bb(cl);
 162 
 163   if (do_optimization) {
 164     assert(_packset.length() == 0, "packset must be empty");
 165     SLP_extract();
 166     if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
 167       if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
 168         IdealLoopTree *lpt_next = lpt->_next;
 169         CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
 170         _phase->has_range_checks(lpt_next);
 171         if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
 172           if (!cl_next->is_vectorized_loop()) {
 173             int slp_max_unroll_factor = cl->slp_max_unroll();
 174             cl_next->set_slp_max_unroll(slp_max_unroll_factor);
 175           }
 176         }
 177       }
 178     }
 179   }
 180 }
 181 
 182 //------------------------------early unrolling analysis------------------------------
 183 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
 184   bool is_slp = true;
 185   ResourceMark rm;
 186   size_t ignored_size = lpt()->_body.size();
 187   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 188   Node_Stack nstack((int)ignored_size);
 189   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 190   Node *cl_exit = cl->loopexit();
 191   int rpo_idx = _post_block.length();
 192 
 193   assert(rpo_idx == 0, "post loop block is empty");
 194 
 195   // First clear the entries
 196   for (uint i = 0; i < lpt()->_body.size(); i++) {
 197     ignored_loop_nodes[i] = -1;
 198   }
 199 
 200   int max_vector = Matcher::max_vector_size(T_INT);
 201   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 202 
 203   // Process the loop, some/all of the stack entries will not be in order, ergo
 204   // need to preprocess the ignored initial state before we process the loop
 205   for (uint i = 0; i < lpt()->_body.size(); i++) {
 206     Node* n = lpt()->_body.at(i);
 207     if (n == cl->incr() ||
 208       n->is_reduction() ||
 209       n->is_AddP() ||
 210       n->is_Cmp() ||
 211       n->is_IfTrue() ||
 212       n->is_CountedLoop() ||
 213       (n == cl_exit)) {
 214       ignored_loop_nodes[i] = n->_idx;
 215       continue;
 216     }
 217 
 218     if (n->is_If()) {
 219       IfNode *iff = n->as_If();
 220       if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
 221         if (lpt()->is_loop_exit(iff)) {
 222           ignored_loop_nodes[i] = n->_idx;
 223           continue;
 224         }
 225       }
 226     }
 227 
 228     if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
 229       Node* n_tail = n->in(LoopNode::LoopBackControl);
 230       if (n_tail != n->in(LoopNode::EntryControl)) {
 231         if (!n_tail->is_Mem()) {
 232           is_slp = false;
 233           break;
 234         }
 235       }
 236     }
 237 
 238     // This must happen after check of phi/if
 239     if (n->is_Phi() || n->is_If()) {
 240       ignored_loop_nodes[i] = n->_idx;
 241       continue;
 242     }
 243 
 244     if (n->is_LoadStore() || n->is_MergeMem() ||
 245       (n->is_Proj() && !n->as_Proj()->is_CFG())) {
 246       is_slp = false;
 247       break;
 248     }
 249 
 250     // Ignore nodes with non-primitive type.
 251     BasicType bt;
 252     if (n->is_Mem()) {
 253       bt = n->as_Mem()->memory_type();
 254     } else {
 255       bt = n->bottom_type()->basic_type();
 256     }
 257     if (is_java_primitive(bt) == false) {
 258       ignored_loop_nodes[i] = n->_idx;
 259       continue;
 260     }
 261 
 262     if (n->is_Mem()) {
 263       MemNode* current = n->as_Mem();
 264       Node* adr = n->in(MemNode::Address);
 265       Node* n_ctrl = _phase->get_ctrl(adr);
 266 
 267       // save a queue of post process nodes
 268       if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
 269         // Process the memory expression
 270         int stack_idx = 0;
 271         bool have_side_effects = true;
 272         if (adr->is_AddP() == false) {
 273           nstack.push(adr, stack_idx++);
 274         } else {
 275           // Mark the components of the memory operation in nstack
 276           SWPointer p1(current, this, &nstack, true);
 277           have_side_effects = p1.node_stack()->is_nonempty();
 278         }
 279 
 280         // Process the pointer stack
 281         while (have_side_effects) {
 282           Node* pointer_node = nstack.node();
 283           for (uint j = 0; j < lpt()->_body.size(); j++) {
 284             Node* cur_node = lpt()->_body.at(j);
 285             if (cur_node == pointer_node) {
 286               ignored_loop_nodes[j] = cur_node->_idx;
 287               break;
 288             }
 289           }
 290           nstack.pop();
 291           have_side_effects = nstack.is_nonempty();
 292         }
 293       }
 294     }
 295   }
 296 
 297   if (is_slp) {
 298     // Now we try to find the maximum supported consistent vector which the machine
 299     // description can use
 300     bool small_basic_type = false;
 301     for (uint i = 0; i < lpt()->_body.size(); i++) {
 302       if (ignored_loop_nodes[i] != -1) continue;
 303 
 304       BasicType bt;
 305       Node* n = lpt()->_body.at(i);
 306       if (n->is_Mem()) {
 307         bt = n->as_Mem()->memory_type();
 308       } else {
 309         bt = n->bottom_type()->basic_type();
 310       }
 311 
 312       if (post_loop_allowed) {
 313         if (!small_basic_type) {
 314           switch (bt) {
 315           case T_CHAR:
 316           case T_BYTE:
 317           case T_SHORT:
 318             small_basic_type = true;
 319             break;
 320           case T_LONG:
 321             // TODO: Remove when support completed for mask context with LONG.
 322             //       The ad files need to be augmented.
 323             small_basic_type = true;
 324             break;
 325           }
 326         }
 327       }
 328 
 329       if (is_java_primitive(bt) == false) continue;
 330 
 331       int cur_max_vector = Matcher::max_vector_size(bt);
 332 
 333       // If a max vector exists which is not larger than _local_loop_unroll_factor
 334       // stop looking, we already have the max vector to map to.
 335       if (cur_max_vector < local_loop_unroll_factor) {
 336         is_slp = false;
 337         if (TraceSuperWordLoopUnrollAnalysis) {
 338           tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
 339         }
 340         break;
 341       }
 342 
 343       // Map the maximal common vector
 344       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 345         if (cur_max_vector < max_vector) {
 346           max_vector = cur_max_vector;
 347         }
 348 
 349         // We only process post loops on predicated targets where we want to
 350         // mask map the loop to a single iteration
 351         if (post_loop_allowed) {
 352           _post_block.at_put_grow(rpo_idx++, n);
 353         }
 354       }
 355     }
 356     if (is_slp) {
 357       local_loop_unroll_factor = max_vector;
 358       cl->mark_passed_slp();
 359     }
 360     cl->mark_was_slp();
 361     if (cl->is_main_loop()) {
 362       cl->set_slp_max_unroll(local_loop_unroll_factor);
 363     } else if (post_loop_allowed) {
 364       if (!small_basic_type) {
 365         // avoid replication context for small basic types in programmable masked loops
 366         cl->set_slp_max_unroll(local_loop_unroll_factor);
 367       }
 368     }
 369   }
 370 }
 371 
 372 //------------------------------SLP_extract---------------------------
 373 // Extract the superword level parallelism
 374 //
 375 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 376 //    this list from first to last, all definitions are visited before their uses.
 377 //
 378 // 2) A point-to-point dependence graph is constructed between memory references.
 379 //    This simplies the upcoming "independence" checker.
 380 //
 381 // 3) The maximum depth in the node graph from the beginning of the block
 382 //    to each node is computed.  This is used to prune the graph search
 383 //    in the independence checker.
 384 //
 385 // 4) For integer types, the necessary bit width is propagated backwards
 386 //    from stores to allow packed operations on byte, char, and short
 387 //    integers.  This reverses the promotion to type "int" that javac
 388 //    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
 389 //
 390 // 5) One of the memory references is picked to be an aligned vector reference.
 391 //    The pre-loop trip count is adjusted to align this reference in the
 392 //    unrolled body.
 393 //
 394 // 6) The initial set of pack pairs is seeded with memory references.
 395 //
 396 // 7) The set of pack pairs is extended by following use->def and def->use links.
 397 //
 398 // 8) The pairs are combined into vector sized packs.
 399 //
 400 // 9) Reorder the memory slices to co-locate members of the memory packs.
 401 //
 402 // 10) Generate ideal vector nodes for the final set of packs and where necessary,
 403 //    inserting scalar promotion, vector creation from multiple scalars, and
 404 //    extraction of scalar values from vectors.
 405 //
 406 void SuperWord::SLP_extract() {
 407 
 408 #ifndef PRODUCT
 409   if (_do_vector_loop && TraceSuperWord) {
 410     tty->print("SuperWord::SLP_extract\n");
 411     tty->print("input loop\n");
 412     _lpt->dump_head();
 413     _lpt->dump();
 414     for (uint i = 0; i < _lpt->_body.size(); i++) {
 415       _lpt->_body.at(i)->dump();
 416     }
 417   }
 418 #endif
 419   // Ready the block
 420   if (!construct_bb()) {
 421     return; // Exit if no interesting nodes or complex graph.
 422   }
 423 
 424   // build    _dg, _disjoint_ptrs
 425   dependence_graph();
 426 
 427   // compute function depth(Node*)
 428   compute_max_depth();
 429 
 430   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 431   bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 432   if (cl->is_main_loop()) {
 433     if (_do_vector_loop) {
 434       if (mark_generations() != -1) {
 435         hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
 436 
 437         if (!construct_bb()) {
 438           return; // Exit if no interesting nodes or complex graph.
 439         }
 440         dependence_graph();
 441         compute_max_depth();
 442       }
 443 
 444 #ifndef PRODUCT
 445       if (TraceSuperWord) {
 446         tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
 447         _lpt->dump_head();
 448         for (int j = 0; j < _block.length(); j++) {
 449           Node* n = _block.at(j);
 450           int d = depth(n);
 451           for (int i = 0; i < d; i++) tty->print("%s", "  ");
 452           tty->print("%d :", d);
 453           n->dump();
 454         }
 455       }
 456 #endif
 457     }
 458 
 459     compute_vector_element_type();
 460 
 461     // Attempt vectorization
 462 
 463     find_adjacent_refs();
 464 
 465     extend_packlist();
 466 
 467     if (_do_vector_loop) {
 468       if (_packset.length() == 0) {
 469         if (TraceSuperWord) {
 470           tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
 471         }
 472         pack_parallel();
 473       }
 474     }
 475 
 476     combine_packs();
 477 
 478     construct_my_pack_map();
 479 
 480     if (_do_vector_loop) {
 481       merge_packs_to_cmovd();
 482     }
 483 
 484     filter_packs();
 485 
 486     schedule();
 487   } else if (post_loop_allowed) {
 488     int saved_mapped_unroll_factor = cl->slp_max_unroll();
 489     if (saved_mapped_unroll_factor) {
 490       int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
 491 
 492       // now reset the slp_unroll_factor so that we can check the analysis mapped
 493       // what the vector loop was mapped to
 494       cl->set_slp_max_unroll(0);
 495 
 496       // do the analysis on the post loop
 497       unrolling_analysis(vector_mapped_unroll_factor);
 498 
 499       // if our analyzed loop is a canonical fit, start processing it
 500       if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
 501         // now add the vector nodes to packsets
 502         for (int i = 0; i < _post_block.length(); i++) {
 503           Node* n = _post_block.at(i);
 504           Node_List* singleton = new Node_List();
 505           singleton->push(n);
 506           _packset.append(singleton);
 507           set_my_pack(n, singleton);
 508         }
 509 
 510         // map base types for vector usage
 511         compute_vector_element_type();
 512       } else {
 513         return;
 514       }
 515     } else {
 516       // for some reason we could not map the slp analysis state of the vectorized loop
 517       return;
 518     }
 519   }
 520 
 521   output();
 522 }
 523 
 524 //------------------------------find_adjacent_refs---------------------------
 525 // Find the adjacent memory references and create pack pairs for them.
 526 // This is the initial set of packs that will then be extended by
 527 // following use->def and def->use links.  The align positions are
 528 // assigned relative to the reference "align_to_ref"
 529 void SuperWord::find_adjacent_refs() {
 530   // Get list of memory operations
 531   Node_List memops;
 532   for (int i = 0; i < _block.length(); i++) {
 533     Node* n = _block.at(i);
 534     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
 535         is_java_primitive(n->as_Mem()->memory_type())) {
 536       int align = memory_alignment(n->as_Mem(), 0);
 537       if (align != bottom_align) {
 538         memops.push(n);
 539       }
 540     }
 541   }
 542 
 543   Node_List align_to_refs;
 544   int best_iv_adjustment = 0;
 545   MemNode* best_align_to_mem_ref = NULL;
 546 
 547   while (memops.size() != 0) {
 548     // Find a memory reference to align to.
 549     MemNode* mem_ref = find_align_to_ref(memops);
 550     if (mem_ref == NULL) break;
 551     align_to_refs.push(mem_ref);
 552     int iv_adjustment = get_iv_adjustment(mem_ref);
 553 
 554     if (best_align_to_mem_ref == NULL) {
 555       // Set memory reference which is the best from all memory operations
 556       // to be used for alignment. The pre-loop trip count is modified to align
 557       // this reference to a vector-aligned address.
 558       best_align_to_mem_ref = mem_ref;
 559       best_iv_adjustment = iv_adjustment;
 560       NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
 561     }
 562 
 563     SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 564     // Set alignment relative to "align_to_ref" for all related memory operations.
 565     for (int i = memops.size() - 1; i >= 0; i--) {
 566       MemNode* s = memops.at(i)->as_Mem();
 567       if (isomorphic(s, mem_ref) &&
 568            (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
 569         SWPointer p2(s, this, NULL, false);
 570         if (p2.comparable(align_to_ref_p)) {
 571           int align = memory_alignment(s, iv_adjustment);
 572           set_alignment(s, align);
 573         }
 574       }
 575     }
 576 
 577     // Create initial pack pairs of memory operations for which
 578     // alignment is set and vectors will be aligned.
 579     bool create_pack = true;
 580     if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
 581       if (!Matcher::misaligned_vectors_ok()) {
 582         int vw = vector_width(mem_ref);
 583         int vw_best = vector_width(best_align_to_mem_ref);
 584         if (vw > vw_best) {
 585           // Do not vectorize a memory access with more elements per vector
 586           // if unaligned memory access is not allowed because number of
 587           // iterations in pre-loop will be not enough to align it.
 588           create_pack = false;
 589         } else {
 590           SWPointer p2(best_align_to_mem_ref, this, NULL, false);
 591           if (align_to_ref_p.invar() != p2.invar()) {
 592             // Do not vectorize memory accesses with different invariants
 593             // if unaligned memory accesses are not allowed.
 594             create_pack = false;
 595           }
 596         }
 597       }
 598     } else {
 599       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 600         // Can't allow vectorization of unaligned memory accesses with the
 601         // same type since it could be overlapped accesses to the same array.
 602         create_pack = false;
 603       } else {
 604         // Allow independent (different type) unaligned memory operations
 605         // if HW supports them.
 606         if (!Matcher::misaligned_vectors_ok()) {
 607           create_pack = false;
 608         } else {
 609           // Check if packs of the same memory type but
 610           // with a different alignment were created before.
 611           for (uint i = 0; i < align_to_refs.size(); i++) {
 612             MemNode* mr = align_to_refs.at(i)->as_Mem();
 613             if (same_velt_type(mr, mem_ref) &&
 614                 memory_alignment(mr, iv_adjustment) != 0)
 615               create_pack = false;
 616           }
 617         }
 618       }
 619     }
 620     if (create_pack) {
 621       for (uint i = 0; i < memops.size(); i++) {
 622         Node* s1 = memops.at(i);
 623         int align = alignment(s1);
 624         if (align == top_align) continue;
 625         for (uint j = 0; j < memops.size(); j++) {
 626           Node* s2 = memops.at(j);
 627           if (alignment(s2) == top_align) continue;
 628           if (s1 != s2 && are_adjacent_refs(s1, s2)) {
 629             if (stmts_can_pack(s1, s2, align)) {
 630               Node_List* pair = new Node_List();
 631               pair->push(s1);
 632               pair->push(s2);
 633               if (!_do_vector_loop || same_origin_idx(s1, s2)) {
 634                 _packset.append(pair);
 635               }
 636             }
 637           }
 638         }
 639       }
 640     } else { // Don't create unaligned pack
 641       // First, remove remaining memory ops of the same type from the list.
 642       for (int i = memops.size() - 1; i >= 0; i--) {
 643         MemNode* s = memops.at(i)->as_Mem();
 644         if (same_velt_type(s, mem_ref)) {
 645           memops.remove(i);
 646         }
 647       }
 648 
 649       // Second, remove already constructed packs of the same type.
 650       for (int i = _packset.length() - 1; i >= 0; i--) {
 651         Node_List* p = _packset.at(i);
 652         MemNode* s = p->at(0)->as_Mem();
 653         if (same_velt_type(s, mem_ref)) {
 654           remove_pack_at(i);
 655         }
 656       }
 657 
 658       // If needed find the best memory reference for loop alignment again.
 659       if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
 660         // Put memory ops from remaining packs back on memops list for
 661         // the best alignment search.
 662         uint orig_msize = memops.size();
 663         for (int i = 0; i < _packset.length(); i++) {
 664           Node_List* p = _packset.at(i);
 665           MemNode* s = p->at(0)->as_Mem();
 666           assert(!same_velt_type(s, mem_ref), "sanity");
 667           memops.push(s);
 668         }
 669         best_align_to_mem_ref = find_align_to_ref(memops);
 670         if (best_align_to_mem_ref == NULL) {
 671           if (TraceSuperWord) {
 672             tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
 673           }
 674           break;
 675         }
 676         best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
 677         NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
 678         // Restore list.
 679         while (memops.size() > orig_msize)
 680           (void)memops.pop();
 681       }
 682     } // unaligned memory accesses
 683 
 684     // Remove used mem nodes.
 685     for (int i = memops.size() - 1; i >= 0; i--) {
 686       MemNode* m = memops.at(i)->as_Mem();
 687       if (alignment(m) != top_align) {
 688         memops.remove(i);
 689       }
 690     }
 691 
 692   } // while (memops.size() != 0
 693   set_align_to_ref(best_align_to_mem_ref);
 694 
 695   if (TraceSuperWord) {
 696     tty->print_cr("\nAfter find_adjacent_refs");
 697     print_packset();
 698   }
 699 }
 700 
 701 #ifndef PRODUCT
 702 void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
 703   if (is_trace_adjacent()) {
 704     tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
 705        best_align_to_mem_ref->_idx, best_iv_adjustment);
 706        best_align_to_mem_ref->dump();
 707   }
 708 }
 709 #endif
 710 
 711 //------------------------------find_align_to_ref---------------------------
 712 // Find a memory reference to align the loop induction variable to.
 713 // Looks first at stores then at loads, looking for a memory reference
 714 // with the largest number of references similar to it.
 715 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
 716   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 717 
 718   // Count number of comparable memory ops
 719   for (uint i = 0; i < memops.size(); i++) {
 720     MemNode* s1 = memops.at(i)->as_Mem();
 721     SWPointer p1(s1, this, NULL, false);
 722     // Discard if pre loop can't align this reference
 723     if (!ref_is_alignable(p1)) {
 724       *cmp_ct.adr_at(i) = 0;
 725       continue;
 726     }
 727     for (uint j = i+1; j < memops.size(); j++) {
 728       MemNode* s2 = memops.at(j)->as_Mem();
 729       if (isomorphic(s1, s2)) {
 730         SWPointer p2(s2, this, NULL, false);
 731         if (p1.comparable(p2)) {
 732           (*cmp_ct.adr_at(i))++;
 733           (*cmp_ct.adr_at(j))++;
 734         }
 735       }
 736     }
 737   }
 738 
 739   // Find Store (or Load) with the greatest number of "comparable" references,
 740   // biggest vector size, smallest data size and smallest iv offset.
 741   int max_ct        = 0;
 742   int max_vw        = 0;
 743   int max_idx       = -1;
 744   int min_size      = max_jint;
 745   int min_iv_offset = max_jint;
 746   for (uint j = 0; j < memops.size(); j++) {
 747     MemNode* s = memops.at(j)->as_Mem();
 748     if (s->is_Store()) {
 749       int vw = vector_width_in_bytes(s);
 750       assert(vw > 1, "sanity");
 751       SWPointer p(s, this, NULL, false);
 752       if (cmp_ct.at(j) >  max_ct ||
 753           cmp_ct.at(j) == max_ct &&
 754             (vw >  max_vw ||
 755              vw == max_vw &&
 756               (data_size(s) <  min_size ||
 757                data_size(s) == min_size &&
 758                  (p.offset_in_bytes() < min_iv_offset)))) {
 759         max_ct = cmp_ct.at(j);
 760         max_vw = vw;
 761         max_idx = j;
 762         min_size = data_size(s);
 763         min_iv_offset = p.offset_in_bytes();
 764       }
 765     }
 766   }
 767   // If no stores, look at loads
 768   if (max_ct == 0) {
 769     for (uint j = 0; j < memops.size(); j++) {
 770       MemNode* s = memops.at(j)->as_Mem();
 771       if (s->is_Load()) {
 772         int vw = vector_width_in_bytes(s);
 773         assert(vw > 1, "sanity");
 774         SWPointer p(s, this, NULL, false);
 775         if (cmp_ct.at(j) >  max_ct ||
 776             cmp_ct.at(j) == max_ct &&
 777               (vw >  max_vw ||
 778                vw == max_vw &&
 779                 (data_size(s) <  min_size ||
 780                  data_size(s) == min_size &&
 781                    (p.offset_in_bytes() < min_iv_offset)))) {
 782           max_ct = cmp_ct.at(j);
 783           max_vw = vw;
 784           max_idx = j;
 785           min_size = data_size(s);
 786           min_iv_offset = p.offset_in_bytes();
 787         }
 788       }
 789     }
 790   }
 791 
 792 #ifdef ASSERT
 793   if (TraceSuperWord && Verbose) {
 794     tty->print_cr("\nVector memops after find_align_to_ref");
 795     for (uint i = 0; i < memops.size(); i++) {
 796       MemNode* s = memops.at(i)->as_Mem();
 797       s->dump();
 798     }
 799   }
 800 #endif
 801 
 802   if (max_ct > 0) {
 803 #ifdef ASSERT
 804     if (TraceSuperWord) {
 805       tty->print("\nVector align to node: ");
 806       memops.at(max_idx)->as_Mem()->dump();
 807     }
 808 #endif
 809     return memops.at(max_idx)->as_Mem();
 810   }
 811   return NULL;
 812 }
 813 
 814 //------------------------------ref_is_alignable---------------------------
 815 // Can the preloop align the reference to position zero in the vector?
 816 bool SuperWord::ref_is_alignable(SWPointer& p) {
 817   if (!p.has_iv()) {
 818     return true;   // no induction variable
 819   }
 820   CountedLoopEndNode* pre_end = get_pre_loop_end(lp()->as_CountedLoop());
 821   assert(pre_end != NULL, "we must have a correct pre-loop");
 822   assert(pre_end->stride_is_con(), "pre loop stride is constant");
 823   int preloop_stride = pre_end->stride_con();
 824 
 825   int span = preloop_stride * p.scale_in_bytes();
 826   int mem_size = p.memory_size();
 827   int offset   = p.offset_in_bytes();
 828   // Stride one accesses are alignable if offset is aligned to memory operation size.
 829   // Offset can be unaligned when UseUnalignedAccesses is used.
 830   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
 831     return true;
 832   }
 833   // If the initial offset from start of the object is computable,
 834   // check if the pre-loop can align the final offset accordingly.
 835   //
 836   // In other words: Can we find an i such that the offset
 837   // after i pre-loop iterations is aligned to vw?
 838   //   (init_offset + pre_loop) % vw == 0              (1)
 839   // where
 840   //   pre_loop = i * span
 841   // is the number of bytes added to the offset by i pre-loop iterations.
 842   //
 843   // For this to hold we need pre_loop to increase init_offset by
 844   //   pre_loop = vw - (init_offset % vw)
 845   //
 846   // This is only possible if pre_loop is divisible by span because each
 847   // pre-loop iteration increases the initial offset by 'span' bytes:
 848   //   (vw - (init_offset % vw)) % span == 0
 849   //
 850   int vw = vector_width_in_bytes(p.mem());
 851   assert(vw > 1, "sanity");
 852   Node* init_nd = pre_end->init_trip();
 853   if (init_nd->is_Con() && p.invar() == NULL) {
 854     int init = init_nd->bottom_type()->is_int()->get_con();
 855     int init_offset = init * p.scale_in_bytes() + offset;
 856     assert(init_offset >= 0, "positive offset from object start");
 857     if (vw % span == 0) {
 858       // If vm is a multiple of span, we use formula (1).
 859       if (span > 0) {
 860         return (vw - (init_offset % vw)) % span == 0;
 861       } else {
 862         assert(span < 0, "nonzero stride * scale");
 863         return (init_offset % vw) % -span == 0;
 864       }
 865     } else if (span % vw == 0) {
 866       // If span is a multiple of vw, we can simplify formula (1) to:
 867       //   (init_offset + i * span) % vw == 0
 868       //     =>
 869       //   (init_offset % vw) + ((i * span) % vw) == 0
 870       //     =>
 871       //   init_offset % vw == 0
 872       //
 873       // Because we add a multiple of vw to the initial offset, the final
 874       // offset is a multiple of vw if and only if init_offset is a multiple.
 875       //
 876       return (init_offset % vw) == 0;
 877     }
 878   }
 879   return false;
 880 }
 881 
 882 //---------------------------get_iv_adjustment---------------------------
 883 // Calculate loop's iv adjustment for this memory ops.
 884 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
 885   SWPointer align_to_ref_p(mem_ref, this, NULL, false);
 886   int offset = align_to_ref_p.offset_in_bytes();
 887   int scale  = align_to_ref_p.scale_in_bytes();
 888   int elt_size = align_to_ref_p.memory_size();
 889   int vw       = vector_width_in_bytes(mem_ref);
 890   assert(vw > 1, "sanity");
 891   int iv_adjustment;
 892   if (scale != 0) {
 893     int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
 894     // At least one iteration is executed in pre-loop by default. As result
 895     // several iterations are needed to align memory operations in main-loop even
 896     // if offset is 0.
 897     int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
 898     assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
 899            "(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
 900     iv_adjustment = iv_adjustment_in_bytes/elt_size;
 901   } else {
 902     // This memory op is not dependent on iv (scale == 0)
 903     iv_adjustment = 0;
 904   }
 905 
 906 #ifndef PRODUCT
 907   if (TraceSuperWord) {
 908     tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
 909       mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
 910     mem_ref->dump();
 911   }
 912 #endif
 913   return iv_adjustment;
 914 }
 915 
 916 //---------------------------dependence_graph---------------------------
 917 // Construct dependency graph.
 918 // Add dependence edges to load/store nodes for memory dependence
 919 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 920 void SuperWord::dependence_graph() {
 921   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 922   // First, assign a dependence node to each memory node
 923   for (int i = 0; i < _block.length(); i++ ) {
 924     Node *n = _block.at(i);
 925     if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
 926       _dg.make_node(n);
 927     }
 928   }
 929 
 930   // For each memory slice, create the dependences
 931   for (int i = 0; i < _mem_slice_head.length(); i++) {
 932     Node* n      = _mem_slice_head.at(i);
 933     Node* n_tail = _mem_slice_tail.at(i);
 934 
 935     // Get slice in predecessor order (last is first)
 936     if (cl->is_main_loop()) {
 937       mem_slice_preds(n_tail, n, _nlist);
 938     }
 939 
 940 #ifndef PRODUCT
 941     if(TraceSuperWord && Verbose) {
 942       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
 943       for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 944         _nlist.at(j)->dump();
 945       }
 946     }
 947 #endif
 948     // Make the slice dependent on the root
 949     DepMem* slice = _dg.dep(n);
 950     _dg.make_edge(_dg.root(), slice);
 951 
 952     // Create a sink for the slice
 953     DepMem* slice_sink = _dg.make_node(NULL);
 954     _dg.make_edge(slice_sink, _dg.tail());
 955 
 956     // Now visit each pair of memory ops, creating the edges
 957     for (int j = _nlist.length() - 1; j >= 0 ; j--) {
 958       Node* s1 = _nlist.at(j);
 959 
 960       // If no dependency yet, use slice
 961       if (_dg.dep(s1)->in_cnt() == 0) {
 962         _dg.make_edge(slice, s1);
 963       }
 964       SWPointer p1(s1->as_Mem(), this, NULL, false);
 965       bool sink_dependent = true;
 966       for (int k = j - 1; k >= 0; k--) {
 967         Node* s2 = _nlist.at(k);
 968         if (s1->is_Load() && s2->is_Load())
 969           continue;
 970         SWPointer p2(s2->as_Mem(), this, NULL, false);
 971 
 972         int cmp = p1.cmp(p2);
 973         if (SuperWordRTDepCheck &&
 974             p1.base() != p2.base() && p1.valid() && p2.valid()) {
 975           // Create a runtime check to disambiguate
 976           OrderedPair pp(p1.base(), p2.base());
 977           _disjoint_ptrs.append_if_missing(pp);
 978         } else if (!SWPointer::not_equal(cmp)) {
 979           // Possibly same address
 980           _dg.make_edge(s1, s2);
 981           sink_dependent = false;
 982         }
 983       }
 984       if (sink_dependent) {
 985         _dg.make_edge(s1, slice_sink);
 986       }
 987     }
 988 
 989     if (TraceSuperWord) {
 990       tty->print_cr("\nDependence graph for slice: %d", n->_idx);
 991       for (int q = 0; q < _nlist.length(); q++) {
 992         _dg.print(_nlist.at(q));
 993       }
 994       tty->cr();
 995     }
 996 
 997     _nlist.clear();
 998   }
 999 
1000   if (TraceSuperWord) {
1001     tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
1002     for (int r = 0; r < _disjoint_ptrs.length(); r++) {
1003       _disjoint_ptrs.at(r).print();
1004       tty->cr();
1005     }
1006     tty->cr();
1007   }
1008 
1009 }
1010 
1011 //---------------------------mem_slice_preds---------------------------
1012 // Return a memory slice (node list) in predecessor order starting at "start"
1013 void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
1014   assert(preds.length() == 0, "start empty");
1015   Node* n = start;
1016   Node* prev = NULL;
1017   while (true) {
1018     NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
1019     assert(in_bb(n), "must be in block");
1020     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
1021       Node* out = n->fast_out(i);
1022       if (out->is_Load()) {
1023         if (in_bb(out)) {
1024           preds.push(out);
1025           if (TraceSuperWord && Verbose) {
1026             tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
1027           }
1028         }
1029       } else {
1030         // FIXME
1031         if (out->is_MergeMem() && !in_bb(out)) {
1032           // Either unrolling is causing a memory edge not to disappear,
1033           // or need to run igvn.optimize() again before SLP
1034         } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
1035           // Ditto.  Not sure what else to check further.
1036         } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
1037           // StoreCM has an input edge used as a precedence edge.
1038           // Maybe an issue when oop stores are vectorized.
1039         } else {
1040           assert(out == prev || prev == NULL, "no branches off of store slice");
1041         }
1042       }//else
1043     }//for
1044     if (n == stop) break;
1045     preds.push(n);
1046     if (TraceSuperWord && Verbose) {
1047       tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
1048     }
1049     prev = n;
1050     assert(n->is_Mem(), "unexpected node %s", n->Name());
1051     n = n->in(MemNode::Memory);
1052   }
1053 }
1054 
1055 //------------------------------stmts_can_pack---------------------------
1056 // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
1057 // s1 aligned at "align"
1058 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
1059 
1060   // Do not use superword for non-primitives
1061   BasicType bt1 = velt_basic_type(s1);
1062   BasicType bt2 = velt_basic_type(s2);
1063   if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
1064     return false;
1065   if (Matcher::max_vector_size(bt1) < 2) {
1066     return false; // No vectors for this type
1067   }
1068 
1069   if (isomorphic(s1, s2)) {
1070     if (independent(s1, s2) || reduction(s1, s2)) {
1071       if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
1072         if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
1073           int s1_align = alignment(s1);
1074           int s2_align = alignment(s2);
1075           if (s1_align == top_align || s1_align == align) {
1076             if (s2_align == top_align || s2_align == align + data_size(s1)) {
1077               return true;
1078             }
1079           }
1080         }
1081       }
1082     }
1083   }
1084   return false;
1085 }
1086 
1087 //------------------------------exists_at---------------------------
1088 // Does s exist in a pack at position pos?
1089 bool SuperWord::exists_at(Node* s, uint pos) {
1090   for (int i = 0; i < _packset.length(); i++) {
1091     Node_List* p = _packset.at(i);
1092     if (p->at(pos) == s) {
1093       return true;
1094     }
1095   }
1096   return false;
1097 }
1098 
1099 //------------------------------are_adjacent_refs---------------------------
1100 // Is s1 immediately before s2 in memory?
1101 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
1102   if (!s1->is_Mem() || !s2->is_Mem()) return false;
1103   if (!in_bb(s1)    || !in_bb(s2))    return false;
1104 
1105   // Do not use superword for non-primitives
1106   if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
1107       !is_java_primitive(s2->as_Mem()->memory_type())) {
1108     return false;
1109   }
1110 
1111   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
1112   // only pack memops that are in the same alias set until that's fixed.
1113   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
1114       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
1115     return false;
1116   SWPointer p1(s1->as_Mem(), this, NULL, false);
1117   SWPointer p2(s2->as_Mem(), this, NULL, false);
1118   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
1119   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
1120   return diff == data_size(s1);
1121 }
1122 
1123 //------------------------------isomorphic---------------------------
1124 // Are s1 and s2 similar?
1125 bool SuperWord::isomorphic(Node* s1, Node* s2) {
1126   if (s1->Opcode() != s2->Opcode()) return false;
1127   if (s1->req() != s2->req()) return false;
1128   if (s1->in(0) != s2->in(0)) return false;
1129   if (!same_velt_type(s1, s2)) return false;
1130   return true;
1131 }
1132 
1133 //------------------------------independent---------------------------
1134 // Is there no data path from s1 to s2 or s2 to s1?
1135 bool SuperWord::independent(Node* s1, Node* s2) {
1136   //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
1137   int d1 = depth(s1);
1138   int d2 = depth(s2);
1139   if (d1 == d2) return s1 != s2;
1140   Node* deep    = d1 > d2 ? s1 : s2;
1141   Node* shallow = d1 > d2 ? s2 : s1;
1142 
1143   visited_clear();
1144 
1145   return independent_path(shallow, deep);
1146 }
1147 
1148 //------------------------------reduction---------------------------
1149 // Is there a data path between s1 and s2 and the nodes reductions?
1150 bool SuperWord::reduction(Node* s1, Node* s2) {
1151   bool retValue = false;
1152   int d1 = depth(s1);
1153   int d2 = depth(s2);
1154   if (d1 + 1 == d2) {
1155     if (s1->is_reduction() && s2->is_reduction()) {
1156       // This is an ordered set, so s1 should define s2
1157       for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1158         Node* t1 = s1->fast_out(i);
1159         if (t1 == s2) {
1160           // both nodes are reductions and connected
1161           retValue = true;
1162         }
1163       }
1164     }
1165   }
1166 
1167   return retValue;
1168 }
1169 
1170 //------------------------------independent_path------------------------------
1171 // Helper for independent
1172 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1173   if (dp >= 1000) return false; // stop deep recursion
1174   visited_set(deep);
1175   int shal_depth = depth(shallow);
1176   assert(shal_depth <= depth(deep), "must be");
1177   for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1178     Node* pred = preds.current();
1179     if (in_bb(pred) && !visited_test(pred)) {
1180       if (shallow == pred) {
1181         return false;
1182       }
1183       if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
1184         return false;
1185       }
1186     }
1187   }
1188   return true;
1189 }
1190 
1191 //------------------------------set_alignment---------------------------
1192 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1193   set_alignment(s1, align);
1194   if (align == top_align || align == bottom_align) {
1195     set_alignment(s2, align);
1196   } else {
1197     set_alignment(s2, align + data_size(s1));
1198   }
1199 }
1200 
1201 //------------------------------data_size---------------------------
1202 int SuperWord::data_size(Node* s) {
1203   Node* use = NULL; //test if the node is a candidate for CMoveVD optimization, then return the size of CMov
1204   if (_do_vector_loop) {
1205     use = _cmovev_kit.is_Bool_candidate(s);
1206     if (use != NULL) {
1207       return data_size(use);
1208     }
1209     use = _cmovev_kit.is_CmpD_candidate(s);
1210     if (use != NULL) {
1211       return data_size(use);
1212     }
1213   }
1214   int bsize = type2aelembytes(velt_basic_type(s));
1215   assert(bsize != 0, "valid size");
1216   return bsize;
1217 }
1218 
1219 //------------------------------extend_packlist---------------------------
1220 // Extend packset by following use->def and def->use links from pack members.
1221 void SuperWord::extend_packlist() {
1222   bool changed;
1223   do {
1224     packset_sort(_packset.length());
1225     changed = false;
1226     for (int i = 0; i < _packset.length(); i++) {
1227       Node_List* p = _packset.at(i);
1228       changed |= follow_use_defs(p);
1229       changed |= follow_def_uses(p);
1230     }
1231   } while (changed);
1232 
1233   if (_race_possible) {
1234     for (int i = 0; i < _packset.length(); i++) {
1235       Node_List* p = _packset.at(i);
1236       order_def_uses(p);
1237     }
1238   }
1239 
1240   if (TraceSuperWord) {
1241     tty->print_cr("\nAfter extend_packlist");
1242     print_packset();
1243   }
1244 }
1245 
1246 //------------------------------follow_use_defs---------------------------
1247 // Extend the packset by visiting operand definitions of nodes in pack p
1248 bool SuperWord::follow_use_defs(Node_List* p) {
1249   assert(p->size() == 2, "just checking");
1250   Node* s1 = p->at(0);
1251   Node* s2 = p->at(1);
1252   assert(s1->req() == s2->req(), "just checking");
1253   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1254 
1255   if (s1->is_Load()) return false;
1256 
1257   int align = alignment(s1);
1258   NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
1259   bool changed = false;
1260   int start = s1->is_Store() ? MemNode::ValueIn   : 1;
1261   int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
1262   for (int j = start; j < end; j++) {
1263     Node* t1 = s1->in(j);
1264     Node* t2 = s2->in(j);
1265     if (!in_bb(t1) || !in_bb(t2))
1266       continue;
1267     if (stmts_can_pack(t1, t2, align)) {
1268       if (est_savings(t1, t2) >= 0) {
1269         Node_List* pair = new Node_List();
1270         pair->push(t1);
1271         pair->push(t2);
1272         _packset.append(pair);
1273         NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
1274         set_alignment(t1, t2, align);
1275         changed = true;
1276       }
1277     }
1278   }
1279   return changed;
1280 }
1281 
1282 //------------------------------follow_def_uses---------------------------
1283 // Extend the packset by visiting uses of nodes in pack p
1284 bool SuperWord::follow_def_uses(Node_List* p) {
1285   bool changed = false;
1286   Node* s1 = p->at(0);
1287   Node* s2 = p->at(1);
1288   assert(p->size() == 2, "just checking");
1289   assert(s1->req() == s2->req(), "just checking");
1290   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1291 
1292   if (s1->is_Store()) return false;
1293 
1294   int align = alignment(s1);
1295   NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
1296   int savings = -1;
1297   int num_s1_uses = 0;
1298   Node* u1 = NULL;
1299   Node* u2 = NULL;
1300   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1301     Node* t1 = s1->fast_out(i);
1302     num_s1_uses++;
1303     if (!in_bb(t1)) continue;
1304     for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
1305       Node* t2 = s2->fast_out(j);
1306       if (!in_bb(t2)) continue;
1307       if (!opnd_positions_match(s1, t1, s2, t2))
1308         continue;
1309       if (stmts_can_pack(t1, t2, align)) {
1310         int my_savings = est_savings(t1, t2);
1311         if (my_savings > savings) {
1312           savings = my_savings;
1313           u1 = t1;
1314           u2 = t2;
1315         }
1316       }
1317     }
1318   }
1319   if (num_s1_uses > 1) {
1320     _race_possible = true;
1321   }
1322   if (savings >= 0) {
1323     Node_List* pair = new Node_List();
1324     pair->push(u1);
1325     pair->push(u2);
1326     _packset.append(pair);
1327     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
1328     set_alignment(u1, u2, align);
1329     changed = true;
1330   }
1331   return changed;
1332 }
1333 
1334 //------------------------------order_def_uses---------------------------
1335 // For extended packsets, ordinally arrange uses packset by major component
1336 void SuperWord::order_def_uses(Node_List* p) {
1337   Node* s1 = p->at(0);
1338 
1339   if (s1->is_Store()) return;
1340 
1341   // reductions are always managed beforehand
1342   if (s1->is_reduction()) return;
1343 
1344   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1345     Node* t1 = s1->fast_out(i);
1346 
1347     // Only allow operand swap on commuting operations
1348     if (!t1->is_Add() && !t1->is_Mul()) {
1349       break;
1350     }
1351 
1352     // Now find t1's packset
1353     Node_List* p2 = NULL;
1354     for (int j = 0; j < _packset.length(); j++) {
1355       p2 = _packset.at(j);
1356       Node* first = p2->at(0);
1357       if (t1 == first) {
1358         break;
1359       }
1360       p2 = NULL;
1361     }
1362     // Arrange all sub components by the major component
1363     if (p2 != NULL) {
1364       for (uint j = 1; j < p->size(); j++) {
1365         Node* d1 = p->at(j);
1366         Node* u1 = p2->at(j);
1367         opnd_positions_match(s1, t1, d1, u1);
1368       }
1369     }
1370   }
1371 }
1372 
1373 //---------------------------opnd_positions_match-------------------------
1374 // Is the use of d1 in u1 at the same operand position as d2 in u2?
1375 bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
1376   // check reductions to see if they are marshalled to represent the reduction
1377   // operator in a specified opnd
1378   if (u1->is_reduction() && u2->is_reduction()) {
1379     // ensure reductions have phis and reduction definitions feeding the 1st operand
1380     Node* first = u1->in(2);
1381     if (first->is_Phi() || first->is_reduction()) {
1382       u1->swap_edges(1, 2);
1383     }
1384     // ensure reductions have phis and reduction definitions feeding the 1st operand
1385     first = u2->in(2);
1386     if (first->is_Phi() || first->is_reduction()) {
1387       u2->swap_edges(1, 2);
1388     }
1389     return true;
1390   }
1391 
1392   uint ct = u1->req();
1393   if (ct != u2->req()) return false;
1394   uint i1 = 0;
1395   uint i2 = 0;
1396   do {
1397     for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
1398     for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
1399     if (i1 != i2) {
1400       if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
1401         // Further analysis relies on operands position matching.
1402         u2->swap_edges(i1, i2);
1403       } else {
1404         return false;
1405       }
1406     }
1407   } while (i1 < ct);
1408   return true;
1409 }
1410 
1411 //------------------------------est_savings---------------------------
1412 // Estimate the savings from executing s1 and s2 as a pack
1413 int SuperWord::est_savings(Node* s1, Node* s2) {
1414   int save_in = 2 - 1; // 2 operations per instruction in packed form
1415 
1416   // inputs
1417   for (uint i = 1; i < s1->req(); i++) {
1418     Node* x1 = s1->in(i);
1419     Node* x2 = s2->in(i);
1420     if (x1 != x2) {
1421       if (are_adjacent_refs(x1, x2)) {
1422         save_in += adjacent_profit(x1, x2);
1423       } else if (!in_packset(x1, x2)) {
1424         save_in -= pack_cost(2);
1425       } else {
1426         save_in += unpack_cost(2);
1427       }
1428     }
1429   }
1430 
1431   // uses of result
1432   uint ct = 0;
1433   int save_use = 0;
1434   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1435     Node* s1_use = s1->fast_out(i);
1436     for (int j = 0; j < _packset.length(); j++) {
1437       Node_List* p = _packset.at(j);
1438       if (p->at(0) == s1_use) {
1439         for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
1440           Node* s2_use = s2->fast_out(k);
1441           if (p->at(p->size()-1) == s2_use) {
1442             ct++;
1443             if (are_adjacent_refs(s1_use, s2_use)) {
1444               save_use += adjacent_profit(s1_use, s2_use);
1445             }
1446           }
1447         }
1448       }
1449     }
1450   }
1451 
1452   if (ct < s1->outcnt()) save_use += unpack_cost(1);
1453   if (ct < s2->outcnt()) save_use += unpack_cost(1);
1454 
1455   return MAX2(save_in, save_use);
1456 }
1457 
1458 //------------------------------costs---------------------------
1459 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
1460 int SuperWord::pack_cost(int ct)   { return ct; }
1461 int SuperWord::unpack_cost(int ct) { return ct; }
1462 
1463 //------------------------------combine_packs---------------------------
1464 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
1465 void SuperWord::combine_packs() {
1466   bool changed = true;
1467   // Combine packs regardless max vector size.
1468   while (changed) {
1469     changed = false;
1470     for (int i = 0; i < _packset.length(); i++) {
1471       Node_List* p1 = _packset.at(i);
1472       if (p1 == NULL) continue;
1473       // Because of sorting we can start at i + 1
1474       for (int j = i + 1; j < _packset.length(); j++) {
1475         Node_List* p2 = _packset.at(j);
1476         if (p2 == NULL) continue;
1477         if (i == j) continue;
1478         if (p1->at(p1->size()-1) == p2->at(0)) {
1479           for (uint k = 1; k < p2->size(); k++) {
1480             p1->push(p2->at(k));
1481           }
1482           _packset.at_put(j, NULL);
1483           changed = true;
1484         }
1485       }
1486     }
1487   }
1488 
1489   // Split packs which have size greater then max vector size.
1490   for (int i = 0; i < _packset.length(); i++) {
1491     Node_List* p1 = _packset.at(i);
1492     if (p1 != NULL) {
1493       BasicType bt = velt_basic_type(p1->at(0));
1494       uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1495       assert(is_power_of_2(max_vlen), "sanity");
1496       uint psize = p1->size();
1497       if (!is_power_of_2(psize)) {
1498         // Skip pack which can't be vector.
1499         // case1: for(...) { a[i] = i; }    elements values are different (i+x)
1500         // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
1501         _packset.at_put(i, NULL);
1502         continue;
1503       }
1504       if (psize > max_vlen) {
1505         Node_List* pack = new Node_List();
1506         for (uint j = 0; j < psize; j++) {
1507           pack->push(p1->at(j));
1508           if (pack->size() >= max_vlen) {
1509             assert(is_power_of_2(pack->size()), "sanity");
1510             _packset.append(pack);
1511             pack = new Node_List();
1512           }
1513         }
1514         _packset.at_put(i, NULL);
1515       }
1516     }
1517   }
1518 
1519   // Compress list.
1520   for (int i = _packset.length() - 1; i >= 0; i--) {
1521     Node_List* p1 = _packset.at(i);
1522     if (p1 == NULL) {
1523       _packset.remove_at(i);
1524     }
1525   }
1526 
1527   if (TraceSuperWord) {
1528     tty->print_cr("\nAfter combine_packs");
1529     print_packset();
1530   }
1531 }
1532 
1533 //-----------------------------construct_my_pack_map--------------------------
1534 // Construct the map from nodes to packs.  Only valid after the
1535 // point where a node is only in one pack (after combine_packs).
1536 void SuperWord::construct_my_pack_map() {
1537   Node_List* rslt = NULL;
1538   for (int i = 0; i < _packset.length(); i++) {
1539     Node_List* p = _packset.at(i);
1540     for (uint j = 0; j < p->size(); j++) {
1541       Node* s = p->at(j);
1542       assert(my_pack(s) == NULL, "only in one pack");
1543       set_my_pack(s, p);
1544     }
1545   }
1546 }
1547 
1548 //------------------------------filter_packs---------------------------
1549 // Remove packs that are not implemented or not profitable.
1550 void SuperWord::filter_packs() {
1551   // Remove packs that are not implemented
1552   for (int i = _packset.length() - 1; i >= 0; i--) {
1553     Node_List* pk = _packset.at(i);
1554     bool impl = implemented(pk);
1555     if (!impl) {
1556 #ifndef PRODUCT
1557       if (TraceSuperWord && Verbose) {
1558         tty->print_cr("Unimplemented");
1559         pk->at(0)->dump();
1560       }
1561 #endif
1562       remove_pack_at(i);
1563     }
1564     Node *n = pk->at(0);
1565     if (n->is_reduction()) {
1566       _num_reductions++;
1567     } else {
1568       _num_work_vecs++;
1569     }
1570   }
1571 
1572   // Remove packs that are not profitable
1573   bool changed;
1574   do {
1575     changed = false;
1576     for (int i = _packset.length() - 1; i >= 0; i--) {
1577       Node_List* pk = _packset.at(i);
1578       bool prof = profitable(pk);
1579       if (!prof) {
1580 #ifndef PRODUCT
1581         if (TraceSuperWord && Verbose) {
1582           tty->print_cr("Unprofitable");
1583           pk->at(0)->dump();
1584         }
1585 #endif
1586         remove_pack_at(i);
1587         changed = true;
1588       }
1589     }
1590   } while (changed);
1591 
1592 #ifndef PRODUCT
1593   if (TraceSuperWord) {
1594     tty->print_cr("\nAfter filter_packs");
1595     print_packset();
1596     tty->cr();
1597   }
1598 #endif
1599 }
1600 
1601 //------------------------------merge_packs_to_cmovd---------------------------
1602 // Merge CMoveD into new vector-nodes
1603 // We want to catch this pattern and subsume CmpD and Bool into CMoveD
1604 //
1605 //                   SubD             ConD
1606 //                  /  |               /
1607 //                 /   |           /   /
1608 //                /    |       /      /
1609 //               /     |   /         /
1610 //              /      /            /
1611 //             /    /  |           /
1612 //            v /      |          /
1613 //         CmpD        |         /
1614 //          |          |        /
1615 //          v          |       /
1616 //         Bool        |      /
1617 //           \         |     /
1618 //             \       |    /
1619 //               \     |   /
1620 //                 \   |  /
1621 //                   \ v /
1622 //                   CMoveD
1623 //
1624 
1625 void SuperWord::merge_packs_to_cmovd() {
1626   for (int i = _packset.length() - 1; i >= 0; i--) {
1627     _cmovev_kit.make_cmovevd_pack(_packset.at(i));
1628   }
1629   #ifndef PRODUCT
1630     if (TraceSuperWord) {
1631       tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
1632       print_packset();
1633       tty->cr();
1634     }
1635   #endif
1636 }
1637 
1638 Node* CMoveKit::is_Bool_candidate(Node* def) const {
1639   Node* use = NULL;
1640   if (!def->is_Bool() || def->in(0) != NULL || def->outcnt() != 1) {
1641     return NULL;
1642   }
1643   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1644     use = def->fast_out(j);
1645     if (!_sw->same_generation(def, use) || !use->is_CMove()) {
1646       return NULL;
1647     }
1648   }
1649   return use;
1650 }
1651 
1652 Node* CMoveKit::is_CmpD_candidate(Node* def) const {
1653   Node* use = NULL;
1654   if (!def->is_Cmp() || def->in(0) != NULL || def->outcnt() != 1) {
1655     return NULL;
1656   }
1657   for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1658     use = def->fast_out(j);
1659     if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == NULL || !_sw->same_generation(def, use)) {
1660       return NULL;
1661     }
1662   }
1663   return use;
1664 }
1665 
1666 Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
1667   Node *cmovd = cmovd_pk->at(0);
1668   if (!cmovd->is_CMove()) {
1669     return NULL;
1670   }
1671   if (pack(cmovd) != NULL) { // already in the cmov pack
1672     return NULL;
1673   }
1674   if (cmovd->in(0) != NULL) {
1675     NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
1676     return NULL;
1677   }
1678 
1679   Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
1680   if (!bol->is_Bool()
1681       || bol->outcnt() != 1
1682       || !_sw->same_generation(bol, cmovd)
1683       || bol->in(0) != NULL  // BoolNode has control flow!!
1684       || _sw->my_pack(bol) == NULL) {
1685       NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
1686       return NULL;
1687   }
1688   Node_List* bool_pk = _sw->my_pack(bol);
1689   if (bool_pk->size() != cmovd_pk->size() ) {
1690     return NULL;
1691   }
1692 
1693   Node* cmpd = bol->in(1);
1694   if (!cmpd->is_Cmp()
1695       || cmpd->outcnt() != 1
1696       || !_sw->same_generation(cmpd, cmovd)
1697       || cmpd->in(0) != NULL  // CmpDNode has control flow!!
1698       || _sw->my_pack(cmpd) == NULL) {
1699       NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
1700       return NULL;
1701   }
1702   Node_List* cmpd_pk = _sw->my_pack(cmpd);
1703   if (cmpd_pk->size() != cmovd_pk->size() ) {
1704     return NULL;
1705   }
1706 
1707   if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
1708     NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
1709     return NULL;
1710   }
1711 
1712   Node_List* new_cmpd_pk = new Node_List();
1713   uint sz = cmovd_pk->size() - 1;
1714   for (uint i = 0; i <= sz; ++i) {
1715     Node* cmov = cmovd_pk->at(i);
1716     Node* bol  = bool_pk->at(i);
1717     Node* cmp  = cmpd_pk->at(i);
1718 
1719     new_cmpd_pk->insert(i, cmov);
1720 
1721     map(cmov, new_cmpd_pk);
1722     map(bol, new_cmpd_pk);
1723     map(cmp, new_cmpd_pk);
1724 
1725     _sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
1726   }
1727   _sw->_packset.remove(cmovd_pk);
1728   _sw->_packset.remove(bool_pk);
1729   _sw->_packset.remove(cmpd_pk);
1730   _sw->_packset.append(new_cmpd_pk);
1731   NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
1732   return new_cmpd_pk;
1733 }
1734 
1735 bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
1736   Node* cmpd0 = cmpd_pk->at(0);
1737   assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
1738   assert(cmovd_pk->at(0)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
1739   assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
1740   Node* in1 = cmpd0->in(1);
1741   Node* in2 = cmpd0->in(2);
1742   Node_List* in1_pk = _sw->my_pack(in1);
1743   Node_List* in2_pk = _sw->my_pack(in2);
1744 
1745   if (in1_pk != NULL && in1_pk->size() != cmpd_pk->size()
1746     || in2_pk != NULL && in2_pk->size() != cmpd_pk->size() ) {
1747     return false;
1748   }
1749 
1750   // test if "all" in1 are in the same pack or the same node
1751   if (in1_pk == NULL) {
1752     for (uint j = 1; j < cmpd_pk->size(); j++) {
1753       if (cmpd_pk->at(j)->in(1) != in1) {
1754         return false;
1755       }
1756     }//for: in1_pk is not pack but all CmpD nodes in the pack have the same in(1)
1757   }
1758   // test if "all" in2 are in the same pack or the same node
1759   if (in2_pk == NULL) {
1760     for (uint j = 1; j < cmpd_pk->size(); j++) {
1761       if (cmpd_pk->at(j)->in(2) != in2) {
1762         return false;
1763       }
1764     }//for: in2_pk is not pack but all CmpD nodes in the pack have the same in(2)
1765   }
1766   //now check if cmpd_pk may be subsumed in vector built for cmovd_pk
1767   int cmovd_ind1, cmovd_ind2;
1768   if (cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
1769    && cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
1770       cmovd_ind1 = CMoveNode::IfFalse;
1771       cmovd_ind2 = CMoveNode::IfTrue;
1772   } else if (cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
1773           && cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
1774       cmovd_ind2 = CMoveNode::IfFalse;
1775       cmovd_ind1 = CMoveNode::IfTrue;
1776   }
1777   else {
1778     return false;
1779   }
1780 
1781   for (uint j = 1; j < cmpd_pk->size(); j++) {
1782     if (cmpd_pk->at(j)->in(1) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
1783         || cmpd_pk->at(j)->in(2) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
1784         return false;
1785     }//if
1786   }
1787   NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
1788   return true;
1789 }
1790 
1791 //------------------------------implemented---------------------------
1792 // Can code be generated for pack p?
1793 bool SuperWord::implemented(Node_List* p) {
1794   bool retValue = false;
1795   Node* p0 = p->at(0);
1796   if (p0 != NULL) {
1797     int opc = p0->Opcode();
1798     uint size = p->size();
1799     if (p0->is_reduction()) {
1800       const Type *arith_type = p0->bottom_type();
1801       // Length 2 reductions of INT/LONG do not offer performance benefits
1802       if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
1803         retValue = false;
1804       } else {
1805         retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
1806       }
1807     } else {
1808       retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
1809     }
1810     if (!retValue) {
1811       if (is_cmov_pack(p)) {
1812         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmpd pack"); print_pack(p);})
1813         return true;
1814       }
1815     }
1816   }
1817   return retValue;
1818 }
1819 
1820 bool SuperWord::is_cmov_pack(Node_List* p) {
1821   return _cmovev_kit.pack(p->at(0)) != NULL;
1822 }
1823 //------------------------------same_inputs--------------------------
1824 // For pack p, are all idx operands the same?
1825 bool SuperWord::same_inputs(Node_List* p, int idx) {
1826   Node* p0 = p->at(0);
1827   uint vlen = p->size();
1828   Node* p0_def = p0->in(idx);
1829   for (uint i = 1; i < vlen; i++) {
1830     Node* pi = p->at(i);
1831     Node* pi_def = pi->in(idx);
1832     if (p0_def != pi_def) {
1833       return false;
1834     }
1835   }
1836   return true;
1837 }
1838 
1839 //------------------------------profitable---------------------------
1840 // For pack p, are all operands and all uses (with in the block) vector?
1841 bool SuperWord::profitable(Node_List* p) {
1842   Node* p0 = p->at(0);
1843   uint start, end;
1844   VectorNode::vector_operands(p0, &start, &end);
1845 
1846   // Return false if some inputs are not vectors or vectors with different
1847   // size or alignment.
1848   // Also, for now, return false if not scalar promotion case when inputs are
1849   // the same. Later, implement PackNode and allow differing, non-vector inputs
1850   // (maybe just the ones from outside the block.)
1851   for (uint i = start; i < end; i++) {
1852     if (!is_vector_use(p0, i)) {
1853       return false;
1854     }
1855   }
1856   // Check if reductions are connected
1857   if (p0->is_reduction()) {
1858     Node* second_in = p0->in(2);
1859     Node_List* second_pk = my_pack(second_in);
1860     if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
1861       // Remove reduction flag if no parent pack or if not enough work
1862       // to cover reduction expansion overhead
1863       p0->remove_flag(Node::Flag_is_reduction);
1864       return false;
1865     } else if (second_pk->size() != p->size()) {
1866       return false;
1867     }
1868   }
1869   if (VectorNode::is_shift(p0)) {
1870     // For now, return false if shift count is vector or not scalar promotion
1871     // case (different shift counts) because it is not supported yet.
1872     Node* cnt = p0->in(2);
1873     Node_List* cnt_pk = my_pack(cnt);
1874     if (cnt_pk != NULL)
1875       return false;
1876     if (!same_inputs(p, 2))
1877       return false;
1878   }
1879   if (!p0->is_Store()) {
1880     // For now, return false if not all uses are vector.
1881     // Later, implement ExtractNode and allow non-vector uses (maybe
1882     // just the ones outside the block.)
1883     for (uint i = 0; i < p->size(); i++) {
1884       Node* def = p->at(i);
1885       if (is_cmov_pack_internal_node(p, def)) {
1886         continue;
1887       }
1888       for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1889         Node* use = def->fast_out(j);
1890         for (uint k = 0; k < use->req(); k++) {
1891           Node* n = use->in(k);
1892           if (def == n) {
1893             // reductions can be loop carried dependences
1894             if (def->is_reduction() && use->is_Phi())
1895               continue;
1896             if (!is_vector_use(use, k)) {
1897               return false;
1898             }
1899           }
1900         }
1901       }
1902     }
1903   }
1904   return true;
1905 }
1906 
1907 //------------------------------schedule---------------------------
1908 // Adjust the memory graph for the packed operations
1909 void SuperWord::schedule() {
1910 
1911   // Co-locate in the memory graph the members of each memory pack
1912   for (int i = 0; i < _packset.length(); i++) {
1913     co_locate_pack(_packset.at(i));
1914   }
1915 }
1916 
1917 //-------------------------------remove_and_insert-------------------
1918 // Remove "current" from its current position in the memory graph and insert
1919 // it after the appropriate insertion point (lip or uip).
1920 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
1921                                   Node *uip, Unique_Node_List &sched_before) {
1922   Node* my_mem = current->in(MemNode::Memory);
1923   bool sched_up = sched_before.member(current);
1924 
1925   // remove current_store from its current position in the memmory graph
1926   for (DUIterator i = current->outs(); current->has_out(i); i++) {
1927     Node* use = current->out(i);
1928     if (use->is_Mem()) {
1929       assert(use->in(MemNode::Memory) == current, "must be");
1930       if (use == prev) { // connect prev to my_mem
1931           _igvn.replace_input_of(use, MemNode::Memory, my_mem);
1932           --i; //deleted this edge; rescan position
1933       } else if (sched_before.member(use)) {
1934         if (!sched_up) { // Will be moved together with current
1935           _igvn.replace_input_of(use, MemNode::Memory, uip);
1936           --i; //deleted this edge; rescan position
1937         }
1938       } else {
1939         if (sched_up) { // Will be moved together with current
1940           _igvn.replace_input_of(use, MemNode::Memory, lip);
1941           --i; //deleted this edge; rescan position
1942         }
1943       }
1944     }
1945   }
1946 
1947   Node *insert_pt =  sched_up ?  uip : lip;
1948 
1949   // all uses of insert_pt's memory state should use current's instead
1950   for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
1951     Node* use = insert_pt->out(i);
1952     if (use->is_Mem()) {
1953       assert(use->in(MemNode::Memory) == insert_pt, "must be");
1954       _igvn.replace_input_of(use, MemNode::Memory, current);
1955       --i; //deleted this edge; rescan position
1956     } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
1957       uint pos; //lip (lower insert point) must be the last one in the memory slice
1958       for (pos=1; pos < use->req(); pos++) {
1959         if (use->in(pos) == insert_pt) break;
1960       }
1961       _igvn.replace_input_of(use, pos, current);
1962       --i;
1963     }
1964   }
1965 
1966   //connect current to insert_pt
1967   _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
1968 }
1969 
1970 //------------------------------co_locate_pack----------------------------------
1971 // To schedule a store pack, we need to move any sandwiched memory ops either before
1972 // or after the pack, based upon dependence information:
1973 // (1) If any store in the pack depends on the sandwiched memory op, the
1974 //     sandwiched memory op must be scheduled BEFORE the pack;
1975 // (2) If a sandwiched memory op depends on any store in the pack, the
1976 //     sandwiched memory op must be scheduled AFTER the pack;
1977 // (3) If a sandwiched memory op (say, memA) depends on another sandwiched
1978 //     memory op (say memB), memB must be scheduled before memA. So, if memA is
1979 //     scheduled before the pack, memB must also be scheduled before the pack;
1980 // (4) If there is no dependence restriction for a sandwiched memory op, we simply
1981 //     schedule this store AFTER the pack
1982 // (5) We know there is no dependence cycle, so there in no other case;
1983 // (6) Finally, all memory ops in another single pack should be moved in the same direction.
1984 //
1985 // To schedule a load pack, we use the memory state of either the first or the last load in
1986 // the pack, based on the dependence constraint.
1987 void SuperWord::co_locate_pack(Node_List* pk) {
1988   if (pk->at(0)->is_Store()) {
1989     MemNode* first     = executed_first(pk)->as_Mem();
1990     MemNode* last      = executed_last(pk)->as_Mem();
1991     Unique_Node_List schedule_before_pack;
1992     Unique_Node_List memops;
1993 
1994     MemNode* current   = last->in(MemNode::Memory)->as_Mem();
1995     MemNode* previous  = last;
1996     while (true) {
1997       assert(in_bb(current), "stay in block");
1998       memops.push(previous);
1999       for (DUIterator i = current->outs(); current->has_out(i); i++) {
2000         Node* use = current->out(i);
2001         if (use->is_Mem() && use != previous)
2002           memops.push(use);
2003       }
2004       if (current == first) break;
2005       previous = current;
2006       current  = current->in(MemNode::Memory)->as_Mem();
2007     }
2008 
2009     // determine which memory operations should be scheduled before the pack
2010     for (uint i = 1; i < memops.size(); i++) {
2011       Node *s1 = memops.at(i);
2012       if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
2013         for (uint j = 0; j< i; j++) {
2014           Node *s2 = memops.at(j);
2015           if (!independent(s1, s2)) {
2016             if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
2017               schedule_before_pack.push(s1); // s1 must be scheduled before
2018               Node_List* mem_pk = my_pack(s1);
2019               if (mem_pk != NULL) {
2020                 for (uint ii = 0; ii < mem_pk->size(); ii++) {
2021                   Node* s = mem_pk->at(ii);  // follow partner
2022                   if (memops.member(s) && !schedule_before_pack.member(s))
2023                     schedule_before_pack.push(s);
2024                 }
2025               }
2026               break;
2027             }
2028           }
2029         }
2030       }
2031     }
2032 
2033     Node*    upper_insert_pt = first->in(MemNode::Memory);
2034     // Following code moves loads connected to upper_insert_pt below aliased stores.
2035     // Collect such loads here and reconnect them back to upper_insert_pt later.
2036     memops.clear();
2037     for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
2038       Node* use = upper_insert_pt->out(i);
2039       if (use->is_Mem() && !use->is_Store()) {
2040         memops.push(use);
2041       }
2042     }
2043 
2044     MemNode* lower_insert_pt = last;
2045     previous                 = last; //previous store in pk
2046     current                  = last->in(MemNode::Memory)->as_Mem();
2047 
2048     // start scheduling from "last" to "first"
2049     while (true) {
2050       assert(in_bb(current), "stay in block");
2051       assert(in_pack(previous, pk), "previous stays in pack");
2052       Node* my_mem = current->in(MemNode::Memory);
2053 
2054       if (in_pack(current, pk)) {
2055         // Forward users of my memory state (except "previous) to my input memory state
2056         for (DUIterator i = current->outs(); current->has_out(i); i++) {
2057           Node* use = current->out(i);
2058           if (use->is_Mem() && use != previous) {
2059             assert(use->in(MemNode::Memory) == current, "must be");
2060             if (schedule_before_pack.member(use)) {
2061               _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
2062             } else {
2063               _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
2064             }
2065             --i; // deleted this edge; rescan position
2066           }
2067         }
2068         previous = current;
2069       } else { // !in_pack(current, pk) ==> a sandwiched store
2070         remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
2071       }
2072 
2073       if (current == first) break;
2074       current = my_mem->as_Mem();
2075     } // end while
2076 
2077     // Reconnect loads back to upper_insert_pt.
2078     for (uint i = 0; i < memops.size(); i++) {
2079       Node *ld = memops.at(i);
2080       if (ld->in(MemNode::Memory) != upper_insert_pt) {
2081         _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
2082       }
2083     }
2084   } else if (pk->at(0)->is_Load()) { //load
2085     // all loads in the pack should have the same memory state. By default,
2086     // we use the memory state of the last load. However, if any load could
2087     // not be moved down due to the dependence constraint, we use the memory
2088     // state of the first load.
2089     Node* last_mem  = executed_last(pk)->in(MemNode::Memory);
2090     Node* first_mem = executed_first(pk)->in(MemNode::Memory);
2091     bool schedule_last = true;
2092     for (uint i = 0; i < pk->size(); i++) {
2093       Node* ld = pk->at(i);
2094       for (Node* current = last_mem; current != ld->in(MemNode::Memory);
2095            current=current->in(MemNode::Memory)) {
2096         assert(current != first_mem, "corrupted memory graph");
2097         if(current->is_Mem() && !independent(current, ld)){
2098           schedule_last = false; // a later store depends on this load
2099           break;
2100         }
2101       }
2102     }
2103 
2104     Node* mem_input = schedule_last ? last_mem : first_mem;
2105     _igvn.hash_delete(mem_input);
2106     // Give each load the same memory state
2107     for (uint i = 0; i < pk->size(); i++) {
2108       LoadNode* ld = pk->at(i)->as_Load();
2109       _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
2110     }
2111   }
2112 }
2113 
2114 #ifndef PRODUCT
2115 void SuperWord::print_loop(bool whole) {
2116   Node_Stack stack(_arena, _phase->C->unique() >> 2);
2117   Node_List rpo_list;
2118   VectorSet visited(_arena);
2119   visited.set(lpt()->_head->_idx);
2120   _phase->rpo(lpt()->_head, stack, visited, rpo_list);
2121   _phase->dump(lpt(), rpo_list.size(), rpo_list );
2122   if(whole) {
2123     tty->print_cr("\n Whole loop tree");
2124     _phase->dump();
2125     tty->print_cr(" End of whole loop tree\n");
2126   }
2127 }
2128 #endif
2129 
2130 //------------------------------output---------------------------
2131 // Convert packs into vector node operations
2132 void SuperWord::output() {
2133   if (_packset.length() == 0) return;
2134 
2135 #ifndef PRODUCT
2136   if (TraceLoopOpts) {
2137     tty->print("SuperWord::output    ");
2138     lpt()->dump_head();
2139   }
2140 #endif
2141 
2142   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2143   if (cl->is_main_loop()) {
2144     // MUST ENSURE main loop's initial value is properly aligned:
2145     //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2146 
2147     align_initial_loop_index(align_to_ref());
2148 
2149     // Insert extract (unpack) operations for scalar uses
2150     for (int i = 0; i < _packset.length(); i++) {
2151       insert_extracts(_packset.at(i));
2152     }
2153   }
2154 
2155   Compile* C = _phase->C;
2156   uint max_vlen_in_bytes = 0;
2157   uint max_vlen = 0;
2158   bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
2159 
2160   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2161 
2162   CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2163 
2164   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2165 
2166   if (do_reserve_copy() && !make_reversable.has_reserved()) {
2167     NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
2168     return;
2169   }
2170 
2171   for (int i = 0; i < _block.length(); i++) {
2172     Node* n = _block.at(i);
2173     Node_List* p = my_pack(n);
2174     if (p && n == executed_last(p)) {
2175       uint vlen = p->size();
2176       uint vlen_in_bytes = 0;
2177       Node* vn = NULL;
2178       Node* low_adr = p->at(0);
2179       Node* first   = executed_first(p);
2180       if (can_process_post_loop) {
2181         // override vlen with the main loops vector length
2182         vlen = cl->slp_max_unroll();
2183       }
2184       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2185       int   opc = n->Opcode();
2186       if (n->is_Load()) {
2187         Node* ctl = n->in(MemNode::Control);
2188         Node* mem = first->in(MemNode::Memory);
2189         SWPointer p1(n->as_Mem(), this, NULL, false);
2190         // Identify the memory dependency for the new loadVector node by
2191         // walking up through memory chain.
2192         // This is done to give flexibility to the new loadVector node so that
2193         // it can move above independent storeVector nodes.
2194         while (mem->is_StoreVector()) {
2195           SWPointer p2(mem->as_Mem(), this, NULL, false);
2196           int cmp = p1.cmp(p2);
2197           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
2198             mem = mem->in(MemNode::Memory);
2199           } else {
2200             break; // dependent memory
2201           }
2202         }
2203         Node* adr = low_adr->in(MemNode::Address);
2204         const TypePtr* atyp = n->adr_type();
2205         vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
2206         vlen_in_bytes = vn->as_LoadVector()->memory_size();
2207       } else if (n->is_Store()) {
2208         // Promote value to be stored to vector
2209         Node* val = vector_opd(p, MemNode::ValueIn);
2210         if (val == NULL) {
2211           if (do_reserve_copy()) {
2212             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: val should not be NULL, exiting SuperWord");})
2213             return; //and reverse to backup IG
2214           }
2215           ShouldNotReachHere();
2216         }
2217 
2218         Node* ctl = n->in(MemNode::Control);
2219         Node* mem = first->in(MemNode::Memory);
2220         Node* adr = low_adr->in(MemNode::Address);
2221         const TypePtr* atyp = n->adr_type();
2222         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
2223         vlen_in_bytes = vn->as_StoreVector()->memory_size();
2224       } else if (n->req() == 3 && !is_cmov_pack(p)) {
2225         // Promote operands to vector
2226         Node* in1 = NULL;
2227         bool node_isa_reduction = n->is_reduction();
2228         if (node_isa_reduction) {
2229           // the input to the first reduction operation is retained
2230           in1 = low_adr->in(1);
2231         } else {
2232           in1 = vector_opd(p, 1);
2233           if (in1 == NULL) {
2234             if (do_reserve_copy()) {
2235               NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in1 should not be NULL, exiting SuperWord");})
2236               return; //and reverse to backup IG
2237             }
2238             ShouldNotReachHere();
2239           }
2240         }
2241         Node* in2 = vector_opd(p, 2);
2242         if (in2 == NULL) {
2243           if (do_reserve_copy()) {
2244             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in2 should not be NULL, exiting SuperWord");})
2245             return; //and reverse to backup IG
2246           }
2247           ShouldNotReachHere();
2248         }
2249         if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
2250           // Move invariant vector input into second position to avoid register spilling.
2251           Node* tmp = in1;
2252           in1 = in2;
2253           in2 = tmp;
2254         }
2255         if (node_isa_reduction) {
2256           const Type *arith_type = n->bottom_type();
2257           vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2258           if (in2->is_Load()) {
2259             vlen_in_bytes = in2->as_LoadVector()->memory_size();
2260           } else {
2261             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2262           }
2263         } else {
2264           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2265           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2266         }
2267       } else if (opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
2268         // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
2269         Node* in = vector_opd(p, 1);
2270         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2271         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2272       } else if (is_cmov_pack(p)) {
2273         if (can_process_post_loop) {
2274           // do not refactor of flow in post loop context
2275           return;
2276         }
2277         if (!n->is_CMove()) {
2278           continue;
2279         }
2280         // place here CMoveVDNode
2281         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
2282         Node* bol = n->in(CMoveNode::Condition);
2283         if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
2284           NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
2285           bol = bol->in(1); //may be ExtractNode
2286         }
2287 
2288         assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2289         if (!bol->is_Bool()) {
2290           if (do_reserve_copy()) {
2291             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
2292             return; //and reverse to backup IG
2293           }
2294           ShouldNotReachHere();
2295         }
2296 
2297         int cond = (int)bol->as_Bool()->_test._test;
2298         Node* in_cc  = _igvn.intcon(cond);
2299         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
2300         Node* cc = bol->clone();
2301         cc->set_req(1, in_cc);
2302         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created bool cc node %d", cc->_idx); cc->dump();})
2303 
2304         Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
2305         if (src1 == NULL) {
2306           if (do_reserve_copy()) {
2307             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be NULL, exiting SuperWord");})
2308             return; //and reverse to backup IG
2309           }
2310           ShouldNotReachHere();
2311         }
2312         Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
2313         if (src2 == NULL) {
2314           if (do_reserve_copy()) {
2315             NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be NULL, exiting SuperWord");})
2316             return; //and reverse to backup IG
2317           }
2318           ShouldNotReachHere();
2319         }
2320         BasicType bt = velt_basic_type(n);
2321         const TypeVect* vt = TypeVect::make(bt, vlen);
2322         vn = new CMoveVDNode(cc, src1, src2, vt);
2323         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2324       } else {
2325         if (do_reserve_copy()) {
2326           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
2327           return; //and reverse to backup IG
2328         }
2329         ShouldNotReachHere();
2330       }
2331 
2332       assert(vn != NULL, "sanity");
2333       if (vn == NULL) {
2334         if (do_reserve_copy()){
2335           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
2336           return; //and reverse to backup IG
2337         }
2338         ShouldNotReachHere();
2339       }
2340 
2341       _block.at_put(i, vn);
2342       _igvn.register_new_node_with_optimizer(vn);
2343       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
2344       for (uint j = 0; j < p->size(); j++) {
2345         Node* pm = p->at(j);
2346         _igvn.replace_node(pm, vn);
2347       }
2348       _igvn._worklist.push(vn);
2349 
2350       if (can_process_post_loop) {
2351         // first check if the vector size if the maximum vector which we can use on the machine,
2352         // other vector size have reduced values for predicated data mapping.
2353         if (vlen_in_bytes != (uint)MaxVectorSize) {
2354           return;
2355         }
2356       }
2357 
2358       if (vlen_in_bytes > max_vlen_in_bytes) {
2359         max_vlen = vlen;
2360         max_vlen_in_bytes = vlen_in_bytes;
2361       }
2362 #ifdef ASSERT
2363       if (TraceNewVectors) {
2364         tty->print("new Vector node: ");
2365         vn->dump();
2366       }
2367 #endif
2368     }
2369   }//for (int i = 0; i < _block.length(); i++)
2370 
2371   C->set_max_vector_size(max_vlen_in_bytes);
2372 
2373   if (SuperWordLoopUnrollAnalysis) {
2374     if (cl->has_passed_slp()) {
2375       uint slp_max_unroll_factor = cl->slp_max_unroll();
2376       if (slp_max_unroll_factor == max_vlen) {
2377         if (TraceSuperWordLoopUnrollAnalysis) {
2378           tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
2379         }
2380 
2381         // For atomic unrolled loops which are vector mapped, instigate more unrolling
2382         cl->set_notpassed_slp();
2383         if (cl->is_main_loop()) {
2384           // if vector resources are limited, do not allow additional unrolling, also
2385           // do not unroll more on pure vector loops which were not reduced so that we can
2386           // program the post loop to single iteration execution.
2387           if (FLOATPRESSURE > 8) {
2388             C->set_major_progress();
2389             cl->mark_do_unroll_only();
2390           }
2391         }
2392 
2393         if (do_reserve_copy()) {
2394           cl->mark_loop_vectorized();
2395           if (can_process_post_loop) {
2396             // Now create the difference of trip and limit and use it as our mask index.
2397             // Note: We limited the unroll of the vectorized loop so that
2398             //       only vlen-1 size iterations can remain to be mask programmed.
2399             Node *incr = cl->incr();
2400             SubINode *index = new SubINode(cl->limit(), cl->init_trip());
2401             _igvn.register_new_node_with_optimizer(index);
2402             MaskCreateINode *mask = new MaskCreateINode(_phase->get_ctrl(cl->init_trip()), index);
2403             _igvn.register_new_node_with_optimizer(mask);
2404             // make this a single iteration loop
2405             AddINode *new_incr = new AddINode(incr->in(1), mask);
2406             _igvn.register_new_node_with_optimizer(new_incr);
2407             _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
2408             _igvn.replace_node(incr, new_incr);
2409             cl->mark_is_multiversioned();
2410           }
2411         }
2412       }
2413     }
2414   }
2415 
2416   if (do_reserve_copy()) {
2417     make_reversable.use_new();
2418   }
2419   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2420   return;
2421 }
2422 
2423 //------------------------------vector_opd---------------------------
2424 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
2425 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
2426   Node* p0 = p->at(0);
2427   uint vlen = p->size();
2428   Node* opd = p0->in(opd_idx);
2429   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2430 
2431   if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
2432     // override vlen with the main loops vector length
2433     vlen = cl->slp_max_unroll();
2434   }
2435 
2436   if (same_inputs(p, opd_idx)) {
2437     if (opd->is_Vector() || opd->is_LoadVector()) {
2438       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
2439       if (opd_idx == 2 && VectorNode::is_shift(p0)) {
2440         NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
2441         return NULL;
2442       }
2443       return opd; // input is matching vector
2444     }
2445     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
2446       Compile* C = _phase->C;
2447       Node* cnt = opd;
2448       // Vector instructions do not mask shift count, do it here.
2449       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
2450       const TypeInt* t = opd->find_int_type();
2451       if (t != NULL && t->is_con()) {
2452         juint shift = t->get_con();
2453         if (shift > mask) { // Unsigned cmp
2454           cnt = ConNode::make(TypeInt::make(shift & mask));
2455         }
2456       } else {
2457         if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
2458           cnt = ConNode::make(TypeInt::make(mask));
2459           _igvn.register_new_node_with_optimizer(cnt);
2460           cnt = new AndINode(opd, cnt);
2461           _igvn.register_new_node_with_optimizer(cnt);
2462           _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2463         }
2464         assert(opd->bottom_type()->isa_int(), "int type only");
2465         if (!opd->bottom_type()->isa_int()) {
2466           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should be int type only");})
2467           return NULL;
2468         }
2469         // Move non constant shift count into vector register.
2470         cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
2471       }
2472       if (cnt != opd) {
2473         _igvn.register_new_node_with_optimizer(cnt);
2474         _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2475       }
2476       return cnt;
2477     }
2478     assert(!opd->is_StoreVector(), "such vector is not expected here");
2479     if (opd->is_StoreVector()) {
2480       NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
2481       return NULL;
2482     }
2483     // Convert scalar input to vector with the same number of elements as
2484     // p0's vector. Use p0's type because size of operand's container in
2485     // vector should match p0's size regardless operand's size.
2486     const Type* p0_t = velt_type(p0);
2487     VectorNode* vn = VectorNode::scalar2vector(opd, vlen, p0_t);
2488 
2489     _igvn.register_new_node_with_optimizer(vn);
2490     _phase->set_ctrl(vn, _phase->get_ctrl(opd));
2491 #ifdef ASSERT
2492     if (TraceNewVectors) {
2493       tty->print("new Vector node: ");
2494       vn->dump();
2495     }
2496 #endif
2497     return vn;
2498   }
2499 
2500   // Insert pack operation
2501   BasicType bt = velt_basic_type(p0);
2502   PackNode* pk = PackNode::make(opd, vlen, bt);
2503   DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
2504 
2505   for (uint i = 1; i < vlen; i++) {
2506     Node* pi = p->at(i);
2507     Node* in = pi->in(opd_idx);
2508     assert(my_pack(in) == NULL, "Should already have been unpacked");
2509     if (my_pack(in) != NULL) {
2510       NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
2511       return NULL;
2512     }
2513     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
2514     pk->add_opd(in);
2515   }
2516   _igvn.register_new_node_with_optimizer(pk);
2517   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
2518 #ifdef ASSERT
2519   if (TraceNewVectors) {
2520     tty->print("new Vector node: ");
2521     pk->dump();
2522   }
2523 #endif
2524   return pk;
2525 }
2526 
2527 //------------------------------insert_extracts---------------------------
2528 // If a use of pack p is not a vector use, then replace the
2529 // use with an extract operation.
2530 void SuperWord::insert_extracts(Node_List* p) {
2531   if (p->at(0)->is_Store()) return;
2532   assert(_n_idx_list.is_empty(), "empty (node,index) list");
2533 
2534   // Inspect each use of each pack member.  For each use that is
2535   // not a vector use, replace the use with an extract operation.
2536 
2537   for (uint i = 0; i < p->size(); i++) {
2538     Node* def = p->at(i);
2539     for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
2540       Node* use = def->fast_out(j);
2541       for (uint k = 0; k < use->req(); k++) {
2542         Node* n = use->in(k);
2543         if (def == n) {
2544           Node_List* u_pk = my_pack(use);
2545           if ((u_pk == NULL || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
2546               _n_idx_list.push(use, k);
2547           }
2548         }
2549       }
2550     }
2551   }
2552 
2553   while (_n_idx_list.is_nonempty()) {
2554     Node* use = _n_idx_list.node();
2555     int   idx = _n_idx_list.index();
2556     _n_idx_list.pop();
2557     Node* def = use->in(idx);
2558 
2559     if (def->is_reduction()) continue;
2560 
2561     // Insert extract operation
2562     _igvn.hash_delete(def);
2563     int def_pos = alignment(def) / data_size(def);
2564 
2565     Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
2566     _igvn.register_new_node_with_optimizer(ex);
2567     _phase->set_ctrl(ex, _phase->get_ctrl(def));
2568     _igvn.replace_input_of(use, idx, ex);
2569     _igvn._worklist.push(def);
2570 
2571     bb_insert_after(ex, bb_idx(def));
2572     set_velt_type(ex, velt_type(def));
2573   }
2574 }
2575 
2576 //------------------------------is_vector_use---------------------------
2577 // Is use->in(u_idx) a vector use?
2578 bool SuperWord::is_vector_use(Node* use, int u_idx) {
2579   Node_List* u_pk = my_pack(use);
2580   if (u_pk == NULL) return false;
2581   if (use->is_reduction()) return true;
2582   Node* def = use->in(u_idx);
2583   Node_List* d_pk = my_pack(def);
2584   if (d_pk == NULL) {
2585     // check for scalar promotion
2586     Node* n = u_pk->at(0)->in(u_idx);
2587     for (uint i = 1; i < u_pk->size(); i++) {
2588       if (u_pk->at(i)->in(u_idx) != n) return false;
2589     }
2590     return true;
2591   }
2592   if (u_pk->size() != d_pk->size())
2593     return false;
2594   for (uint i = 0; i < u_pk->size(); i++) {
2595     Node* ui = u_pk->at(i);
2596     Node* di = d_pk->at(i);
2597     if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
2598       return false;
2599   }
2600   return true;
2601 }
2602 
2603 //------------------------------construct_bb---------------------------
2604 // Construct reverse postorder list of block members
2605 bool SuperWord::construct_bb() {
2606   Node* entry = bb();
2607 
2608   assert(_stk.length() == 0,            "stk is empty");
2609   assert(_block.length() == 0,          "block is empty");
2610   assert(_data_entry.length() == 0,     "data_entry is empty");
2611   assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
2612   assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
2613 
2614   // Find non-control nodes with no inputs from within block,
2615   // create a temporary map from node _idx to bb_idx for use
2616   // by the visited and post_visited sets,
2617   // and count number of nodes in block.
2618   int bb_ct = 0;
2619   for (uint i = 0; i < lpt()->_body.size(); i++) {
2620     Node *n = lpt()->_body.at(i);
2621     set_bb_idx(n, i); // Create a temporary map
2622     if (in_bb(n)) {
2623       if (n->is_LoadStore() || n->is_MergeMem() ||
2624           (n->is_Proj() && !n->as_Proj()->is_CFG())) {
2625         // Bailout if the loop has LoadStore, MergeMem or data Proj
2626         // nodes. Superword optimization does not work with them.
2627         return false;
2628       }
2629       bb_ct++;
2630       if (!n->is_CFG()) {
2631         bool found = false;
2632         for (uint j = 0; j < n->req(); j++) {
2633           Node* def = n->in(j);
2634           if (def && in_bb(def)) {
2635             found = true;
2636             break;
2637           }
2638         }
2639         if (!found) {
2640           assert(n != entry, "can't be entry");
2641           _data_entry.push(n);
2642         }
2643       }
2644     }
2645   }
2646 
2647   // Find memory slices (head and tail)
2648   for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
2649     Node *n = lp()->fast_out(i);
2650     if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
2651       Node* n_tail  = n->in(LoopNode::LoopBackControl);
2652       if (n_tail != n->in(LoopNode::EntryControl)) {
2653         if (!n_tail->is_Mem()) {
2654           assert(n_tail->is_Mem(), "unexpected node for memory slice: %s", n_tail->Name());
2655           return false; // Bailout
2656         }
2657         _mem_slice_head.push(n);
2658         _mem_slice_tail.push(n_tail);
2659       }
2660     }
2661   }
2662 
2663   // Create an RPO list of nodes in block
2664 
2665   visited_clear();
2666   post_visited_clear();
2667 
2668   // Push all non-control nodes with no inputs from within block, then control entry
2669   for (int j = 0; j < _data_entry.length(); j++) {
2670     Node* n = _data_entry.at(j);
2671     visited_set(n);
2672     _stk.push(n);
2673   }
2674   visited_set(entry);
2675   _stk.push(entry);
2676 
2677   // Do a depth first walk over out edges
2678   int rpo_idx = bb_ct - 1;
2679   int size;
2680   int reduction_uses = 0;
2681   while ((size = _stk.length()) > 0) {
2682     Node* n = _stk.top(); // Leave node on stack
2683     if (!visited_test_set(n)) {
2684       // forward arc in graph
2685     } else if (!post_visited_test(n)) {
2686       // cross or back arc
2687       for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2688         Node *use = n->fast_out(i);
2689         if (in_bb(use) && !visited_test(use) &&
2690             // Don't go around backedge
2691             (!use->is_Phi() || n == entry)) {
2692           if (use->is_reduction()) {
2693             // First see if we can map the reduction on the given system we are on, then
2694             // make a data entry operation for each reduction we see.
2695             BasicType bt = use->bottom_type()->basic_type();
2696             if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
2697               reduction_uses++;
2698             }
2699           }
2700           _stk.push(use);
2701         }
2702       }
2703       if (_stk.length() == size) {
2704         // There were no additional uses, post visit node now
2705         _stk.pop(); // Remove node from stack
2706         assert(rpo_idx >= 0, "");
2707         _block.at_put_grow(rpo_idx, n);
2708         rpo_idx--;
2709         post_visited_set(n);
2710         assert(rpo_idx >= 0 || _stk.is_empty(), "");
2711       }
2712     } else {
2713       _stk.pop(); // Remove post-visited node from stack
2714     }
2715   }//while
2716 
2717   int ii_current = -1;
2718   unsigned int load_idx = (unsigned int)-1;
2719   _ii_order.clear();
2720   // Create real map of block indices for nodes
2721   for (int j = 0; j < _block.length(); j++) {
2722     Node* n = _block.at(j);
2723     set_bb_idx(n, j);
2724     if (_do_vector_loop && n->is_Load()) {
2725       if (ii_current == -1) {
2726         ii_current = _clone_map.gen(n->_idx);
2727         _ii_order.push(ii_current);
2728         load_idx = _clone_map.idx(n->_idx);
2729       } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
2730         ii_current = _clone_map.gen(n->_idx);
2731         _ii_order.push(ii_current);
2732       }
2733     }
2734   }//for
2735 
2736   // Ensure extra info is allocated.
2737   initialize_bb();
2738 
2739 #ifndef PRODUCT
2740   if (_vector_loop_debug && _ii_order.length() > 0) {
2741     tty->print("SuperWord::construct_bb: List of generations: ");
2742     for (int jj = 0; jj < _ii_order.length(); ++jj) {
2743       tty->print("  %d:%d", jj, _ii_order.at(jj));
2744     }
2745     tty->print_cr(" ");
2746   }
2747   if (TraceSuperWord) {
2748     print_bb();
2749     tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
2750     for (int m = 0; m < _data_entry.length(); m++) {
2751       tty->print("%3d ", m);
2752       _data_entry.at(m)->dump();
2753     }
2754     tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
2755     for (int m = 0; m < _mem_slice_head.length(); m++) {
2756       tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
2757       tty->print("    ");    _mem_slice_tail.at(m)->dump();
2758     }
2759   }
2760 #endif
2761   assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
2762   return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
2763 }
2764 
2765 //------------------------------initialize_bb---------------------------
2766 // Initialize per node info
2767 void SuperWord::initialize_bb() {
2768   Node* last = _block.at(_block.length() - 1);
2769   grow_node_info(bb_idx(last));
2770 }
2771 
2772 //------------------------------bb_insert_after---------------------------
2773 // Insert n into block after pos
2774 void SuperWord::bb_insert_after(Node* n, int pos) {
2775   int n_pos = pos + 1;
2776   // Make room
2777   for (int i = _block.length() - 1; i >= n_pos; i--) {
2778     _block.at_put_grow(i+1, _block.at(i));
2779   }
2780   for (int j = _node_info.length() - 1; j >= n_pos; j--) {
2781     _node_info.at_put_grow(j+1, _node_info.at(j));
2782   }
2783   // Set value
2784   _block.at_put_grow(n_pos, n);
2785   _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
2786   // Adjust map from node->_idx to _block index
2787   for (int i = n_pos; i < _block.length(); i++) {
2788     set_bb_idx(_block.at(i), i);
2789   }
2790 }
2791 
2792 //------------------------------compute_max_depth---------------------------
2793 // Compute max depth for expressions from beginning of block
2794 // Use to prune search paths during test for independence.
2795 void SuperWord::compute_max_depth() {
2796   int ct = 0;
2797   bool again;
2798   do {
2799     again = false;
2800     for (int i = 0; i < _block.length(); i++) {
2801       Node* n = _block.at(i);
2802       if (!n->is_Phi()) {
2803         int d_orig = depth(n);
2804         int d_in   = 0;
2805         for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
2806           Node* pred = preds.current();
2807           if (in_bb(pred)) {
2808             d_in = MAX2(d_in, depth(pred));
2809           }
2810         }
2811         if (d_in + 1 != d_orig) {
2812           set_depth(n, d_in + 1);
2813           again = true;
2814         }
2815       }
2816     }
2817     ct++;
2818   } while (again);
2819 
2820   if (TraceSuperWord && Verbose) {
2821     tty->print_cr("compute_max_depth iterated: %d times", ct);
2822   }
2823 }
2824 
2825 //-------------------------compute_vector_element_type-----------------------
2826 // Compute necessary vector element type for expressions
2827 // This propagates backwards a narrower integer type when the
2828 // upper bits of the value are not needed.
2829 // Example:  char a,b,c;  a = b + c;
2830 // Normally the type of the add is integer, but for packed character
2831 // operations the type of the add needs to be char.
2832 void SuperWord::compute_vector_element_type() {
2833   if (TraceSuperWord && Verbose) {
2834     tty->print_cr("\ncompute_velt_type:");
2835   }
2836 
2837   // Initial type
2838   for (int i = 0; i < _block.length(); i++) {
2839     Node* n = _block.at(i);
2840     set_velt_type(n, container_type(n));
2841   }
2842 
2843   // Propagate integer narrowed type backwards through operations
2844   // that don't depend on higher order bits
2845   for (int i = _block.length() - 1; i >= 0; i--) {
2846     Node* n = _block.at(i);
2847     // Only integer types need be examined
2848     const Type* vtn = velt_type(n);
2849     if (vtn->basic_type() == T_INT) {
2850       uint start, end;
2851       VectorNode::vector_operands(n, &start, &end);
2852 
2853       for (uint j = start; j < end; j++) {
2854         Node* in  = n->in(j);
2855         // Don't propagate through a memory
2856         if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
2857             data_size(n) < data_size(in)) {
2858           bool same_type = true;
2859           for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
2860             Node *use = in->fast_out(k);
2861             if (!in_bb(use) || !same_velt_type(use, n)) {
2862               same_type = false;
2863               break;
2864             }
2865           }
2866           if (same_type) {
2867             // For right shifts of small integer types (bool, byte, char, short)
2868             // we need precise information about sign-ness. Only Load nodes have
2869             // this information because Store nodes are the same for signed and
2870             // unsigned values. And any arithmetic operation after a load may
2871             // expand a value to signed Int so such right shifts can't be used
2872             // because vector elements do not have upper bits of Int.
2873             const Type* vt = vtn;
2874             if (VectorNode::is_shift(in)) {
2875               Node* load = in->in(1);
2876               if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
2877                 vt = velt_type(load);
2878               } else if (in->Opcode() != Op_LShiftI) {
2879                 // Widen type to Int to avoid creation of right shift vector
2880                 // (align + data_size(s1) check in stmts_can_pack() will fail).
2881                 // Note, left shifts work regardless type.
2882                 vt = TypeInt::INT;
2883               }
2884             }
2885             set_velt_type(in, vt);
2886           }
2887         }
2888       }
2889     }
2890   }
2891 #ifndef PRODUCT
2892   if (TraceSuperWord && Verbose) {
2893     for (int i = 0; i < _block.length(); i++) {
2894       Node* n = _block.at(i);
2895       velt_type(n)->dump();
2896       tty->print("\t");
2897       n->dump();
2898     }
2899   }
2900 #endif
2901 }
2902 
2903 //------------------------------memory_alignment---------------------------
2904 // Alignment within a vector memory reference
2905 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2906   #ifndef PRODUCT
2907     if(TraceSuperWord && Verbose) {
2908       tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
2909     }
2910   #endif
2911   NOT_PRODUCT(SWPointer::Tracer::Depth ddd(0);)
2912   SWPointer p(s, this, NULL, false);
2913   if (!p.valid()) {
2914     NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
2915     return bottom_align;
2916   }
2917   int vw = vector_width_in_bytes(s);
2918   if (vw < 2) {
2919     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
2920     return bottom_align; // No vectors for this type
2921   }
2922   int offset  = p.offset_in_bytes();
2923   offset     += iv_adjust*p.memory_size();
2924   int off_rem = offset % vw;
2925   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2926   if (TraceSuperWord && Verbose) {
2927     tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
2928   }
2929   return off_mod;
2930 }
2931 
2932 //---------------------------container_type---------------------------
2933 // Smallest type containing range of values
2934 const Type* SuperWord::container_type(Node* n) {
2935   if (n->is_Mem()) {
2936     BasicType bt = n->as_Mem()->memory_type();
2937     if (n->is_Store() && (bt == T_CHAR)) {
2938       // Use T_SHORT type instead of T_CHAR for stored values because any
2939       // preceding arithmetic operation extends values to signed Int.
2940       bt = T_SHORT;
2941     }
2942     if (n->Opcode() == Op_LoadUB) {
2943       // Adjust type for unsigned byte loads, it is important for right shifts.
2944       // T_BOOLEAN is used because there is no basic type representing type
2945       // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
2946       // size (one byte) and sign is important.
2947       bt = T_BOOLEAN;
2948     }
2949     return Type::get_const_basic_type(bt);
2950   }
2951   const Type* t = _igvn.type(n);
2952   if (t->basic_type() == T_INT) {
2953     // A narrow type of arithmetic operations will be determined by
2954     // propagating the type of memory operations.
2955     return TypeInt::INT;
2956   }
2957   return t;
2958 }
2959 
2960 bool SuperWord::same_velt_type(Node* n1, Node* n2) {
2961   const Type* vt1 = velt_type(n1);
2962   const Type* vt2 = velt_type(n2);
2963   if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
2964     // Compare vectors element sizes for integer types.
2965     return data_size(n1) == data_size(n2);
2966   }
2967   return vt1 == vt2;
2968 }
2969 
2970 //------------------------------in_packset---------------------------
2971 // Are s1 and s2 in a pack pair and ordered as s1,s2?
2972 bool SuperWord::in_packset(Node* s1, Node* s2) {
2973   for (int i = 0; i < _packset.length(); i++) {
2974     Node_List* p = _packset.at(i);
2975     assert(p->size() == 2, "must be");
2976     if (p->at(0) == s1 && p->at(p->size()-1) == s2) {
2977       return true;
2978     }
2979   }
2980   return false;
2981 }
2982 
2983 //------------------------------in_pack---------------------------
2984 // Is s in pack p?
2985 Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
2986   for (uint i = 0; i < p->size(); i++) {
2987     if (p->at(i) == s) {
2988       return p;
2989     }
2990   }
2991   return NULL;
2992 }
2993 
2994 //------------------------------remove_pack_at---------------------------
2995 // Remove the pack at position pos in the packset
2996 void SuperWord::remove_pack_at(int pos) {
2997   Node_List* p = _packset.at(pos);
2998   for (uint i = 0; i < p->size(); i++) {
2999     Node* s = p->at(i);
3000     set_my_pack(s, NULL);
3001   }
3002   _packset.remove_at(pos);
3003 }
3004 
3005 void SuperWord::packset_sort(int n) {
3006   // simple bubble sort so that we capitalize with O(n) when its already sorted
3007   while (n != 0) {
3008     bool swapped = false;
3009     for (int i = 1; i < n; i++) {
3010       Node_List* q_low = _packset.at(i-1);
3011       Node_List* q_i = _packset.at(i);
3012 
3013       // only swap when we find something to swap
3014       if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
3015         Node_List* t = q_i;
3016         *(_packset.adr_at(i)) = q_low;
3017         *(_packset.adr_at(i-1)) = q_i;
3018         swapped = true;
3019       }
3020     }
3021     if (swapped == false) break;
3022     n--;
3023   }
3024 }
3025 
3026 //------------------------------executed_first---------------------------
3027 // Return the node executed first in pack p.  Uses the RPO block list
3028 // to determine order.
3029 Node* SuperWord::executed_first(Node_List* p) {
3030   Node* n = p->at(0);
3031   int n_rpo = bb_idx(n);
3032   for (uint i = 1; i < p->size(); i++) {
3033     Node* s = p->at(i);
3034     int s_rpo = bb_idx(s);
3035     if (s_rpo < n_rpo) {
3036       n = s;
3037       n_rpo = s_rpo;
3038     }
3039   }
3040   return n;
3041 }
3042 
3043 //------------------------------executed_last---------------------------
3044 // Return the node executed last in pack p.
3045 Node* SuperWord::executed_last(Node_List* p) {
3046   Node* n = p->at(0);
3047   int n_rpo = bb_idx(n);
3048   for (uint i = 1; i < p->size(); i++) {
3049     Node* s = p->at(i);
3050     int s_rpo = bb_idx(s);
3051     if (s_rpo > n_rpo) {
3052       n = s;
3053       n_rpo = s_rpo;
3054     }
3055   }
3056   return n;
3057 }
3058 
3059 LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
3060   LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
3061   for (uint i = 0; i < p->size(); i++) {
3062     Node* n = p->at(i);
3063     assert(n->is_Load(), "only meaningful for loads");
3064     if (!n->depends_only_on_test()) {
3065       dep = LoadNode::Pinned;
3066     }
3067   }
3068   return dep;
3069 }
3070 
3071 
3072 //----------------------------align_initial_loop_index---------------------------
3073 // Adjust pre-loop limit so that in main loop, a load/store reference
3074 // to align_to_ref will be a position zero in the vector.
3075 //   (iv + k) mod vector_align == 0
3076 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
3077   CountedLoopNode *main_head = lp()->as_CountedLoop();
3078   assert(main_head->is_main_loop(), "");
3079   CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
3080   assert(pre_end != NULL, "we must have a correct pre-loop");
3081   Node *pre_opaq1 = pre_end->limit();
3082   assert(pre_opaq1->Opcode() == Op_Opaque1, "");
3083   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
3084   Node *lim0 = pre_opaq->in(1);
3085 
3086   // Where we put new limit calculations
3087   Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
3088 
3089   // Ensure the original loop limit is available from the
3090   // pre-loop Opaque1 node.
3091   Node *orig_limit = pre_opaq->original_loop_limit();
3092   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
3093 
3094   SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
3095   assert(align_to_ref_p.valid(), "sanity");
3096 
3097   // Given:
3098   //     lim0 == original pre loop limit
3099   //     V == v_align (power of 2)
3100   //     invar == extra invariant piece of the address expression
3101   //     e == offset [ +/- invar ]
3102   //
3103   // When reassociating expressions involving '%' the basic rules are:
3104   //     (a - b) % k == 0   =>  a % k == b % k
3105   // and:
3106   //     (a + b) % k == 0   =>  a % k == (k - b) % k
3107   //
3108   // For stride > 0 && scale > 0,
3109   //   Derive the new pre-loop limit "lim" such that the two constraints:
3110   //     (1) lim = lim0 + N           (where N is some positive integer < V)
3111   //     (2) (e + lim) % V == 0
3112   //   are true.
3113   //
3114   //   Substituting (1) into (2),
3115   //     (e + lim0 + N) % V == 0
3116   //   solve for N:
3117   //     N = (V - (e + lim0)) % V
3118   //   substitute back into (1), so that new limit
3119   //     lim = lim0 + (V - (e + lim0)) % V
3120   //
3121   // For stride > 0 && scale < 0
3122   //   Constraints:
3123   //     lim = lim0 + N
3124   //     (e - lim) % V == 0
3125   //   Solving for lim:
3126   //     (e - lim0 - N) % V == 0
3127   //     N = (e - lim0) % V
3128   //     lim = lim0 + (e - lim0) % V
3129   //
3130   // For stride < 0 && scale > 0
3131   //   Constraints:
3132   //     lim = lim0 - N
3133   //     (e + lim) % V == 0
3134   //   Solving for lim:
3135   //     (e + lim0 - N) % V == 0
3136   //     N = (e + lim0) % V
3137   //     lim = lim0 - (e + lim0) % V
3138   //
3139   // For stride < 0 && scale < 0
3140   //   Constraints:
3141   //     lim = lim0 - N
3142   //     (e - lim) % V == 0
3143   //   Solving for lim:
3144   //     (e - lim0 + N) % V == 0
3145   //     N = (V - (e - lim0)) % V
3146   //     lim = lim0 - (V - (e - lim0)) % V
3147 
3148   int vw = vector_width_in_bytes(align_to_ref);
3149   int stride   = iv_stride();
3150   int scale    = align_to_ref_p.scale_in_bytes();
3151   int elt_size = align_to_ref_p.memory_size();
3152   int v_align  = vw / elt_size;
3153   assert(v_align > 1, "sanity");
3154   int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
3155   Node *offsn  = _igvn.intcon(offset);
3156 
3157   Node *e = offsn;
3158   if (align_to_ref_p.invar() != NULL) {
3159     // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
3160     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3161     Node* aref     = new URShiftINode(align_to_ref_p.invar(), log2_elt);
3162     _igvn.register_new_node_with_optimizer(aref);
3163     _phase->set_ctrl(aref, pre_ctrl);
3164     if (align_to_ref_p.negate_invar()) {
3165       e = new SubINode(e, aref);
3166     } else {
3167       e = new AddINode(e, aref);
3168     }
3169     _igvn.register_new_node_with_optimizer(e);
3170     _phase->set_ctrl(e, pre_ctrl);
3171   }
3172   if (vw > ObjectAlignmentInBytes) {
3173     // incorporate base e +/- base && Mask >>> log2(elt)
3174     Node* xbase = new CastP2XNode(NULL, align_to_ref_p.base());
3175     _igvn.register_new_node_with_optimizer(xbase);
3176 #ifdef _LP64
3177     xbase  = new ConvL2INode(xbase);
3178     _igvn.register_new_node_with_optimizer(xbase);
3179 #endif
3180     Node* mask = _igvn.intcon(vw-1);
3181     Node* masked_xbase  = new AndINode(xbase, mask);
3182     _igvn.register_new_node_with_optimizer(masked_xbase);
3183     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3184     Node* bref     = new URShiftINode(masked_xbase, log2_elt);
3185     _igvn.register_new_node_with_optimizer(bref);
3186     _phase->set_ctrl(bref, pre_ctrl);
3187     e = new AddINode(e, bref);
3188     _igvn.register_new_node_with_optimizer(e);
3189     _phase->set_ctrl(e, pre_ctrl);
3190   }
3191 
3192   // compute e +/- lim0
3193   if (scale < 0) {
3194     e = new SubINode(e, lim0);
3195   } else {
3196     e = new AddINode(e, lim0);
3197   }
3198   _igvn.register_new_node_with_optimizer(e);
3199   _phase->set_ctrl(e, pre_ctrl);
3200 
3201   if (stride * scale > 0) {
3202     // compute V - (e +/- lim0)
3203     Node* va  = _igvn.intcon(v_align);
3204     e = new SubINode(va, e);
3205     _igvn.register_new_node_with_optimizer(e);
3206     _phase->set_ctrl(e, pre_ctrl);
3207   }
3208   // compute N = (exp) % V
3209   Node* va_msk = _igvn.intcon(v_align - 1);
3210   Node* N = new AndINode(e, va_msk);
3211   _igvn.register_new_node_with_optimizer(N);
3212   _phase->set_ctrl(N, pre_ctrl);
3213 
3214   //   substitute back into (1), so that new limit
3215   //     lim = lim0 + N
3216   Node* lim;
3217   if (stride < 0) {
3218     lim = new SubINode(lim0, N);
3219   } else {
3220     lim = new AddINode(lim0, N);
3221   }
3222   _igvn.register_new_node_with_optimizer(lim);
3223   _phase->set_ctrl(lim, pre_ctrl);
3224   Node* constrained =
3225     (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
3226                  : (Node*) new MaxINode(lim, orig_limit);
3227   _igvn.register_new_node_with_optimizer(constrained);
3228   _phase->set_ctrl(constrained, pre_ctrl);
3229   _igvn.hash_delete(pre_opaq);
3230   pre_opaq->set_req(1, constrained);
3231 }
3232 
3233 //----------------------------get_pre_loop_end---------------------------
3234 // Find pre loop end from main loop.  Returns null if none.
3235 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
3236   // The loop cannot be optimized if the graph shape at
3237   // the loop entry is inappropriate.
3238   if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3239     return NULL;
3240   }
3241 
3242   Node* p_f = cl->in(LoopNode::EntryControl)->in(0)->in(0);
3243   if (!p_f->is_IfFalse()) return NULL;
3244   if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
3245   CountedLoopEndNode* pre_end = p_f->in(0)->as_CountedLoopEnd();
3246   CountedLoopNode* loop_node = pre_end->loopnode();
3247   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
3248   return pre_end;
3249 }
3250 
3251 //------------------------------init---------------------------
3252 void SuperWord::init() {
3253   _dg.init();
3254   _packset.clear();
3255   _disjoint_ptrs.clear();
3256   _block.clear();
3257   _post_block.clear();
3258   _data_entry.clear();
3259   _mem_slice_head.clear();
3260   _mem_slice_tail.clear();
3261   _iteration_first.clear();
3262   _iteration_last.clear();
3263   _node_info.clear();
3264   _align_to_ref = NULL;
3265   _lpt = NULL;
3266   _lp = NULL;
3267   _bb = NULL;
3268   _iv = NULL;
3269   _race_possible = 0;
3270   _early_return = false;
3271   _num_work_vecs = 0;
3272   _num_reductions = 0;
3273 }
3274 
3275 //------------------------------restart---------------------------
3276 void SuperWord::restart() {
3277   _dg.init();
3278   _packset.clear();
3279   _disjoint_ptrs.clear();
3280   _block.clear();
3281   _post_block.clear();
3282   _data_entry.clear();
3283   _mem_slice_head.clear();
3284   _mem_slice_tail.clear();
3285   _node_info.clear();
3286 }
3287 
3288 //------------------------------print_packset---------------------------
3289 void SuperWord::print_packset() {
3290 #ifndef PRODUCT
3291   tty->print_cr("packset");
3292   for (int i = 0; i < _packset.length(); i++) {
3293     tty->print_cr("Pack: %d", i);
3294     Node_List* p = _packset.at(i);
3295     print_pack(p);
3296   }
3297 #endif
3298 }
3299 
3300 //------------------------------print_pack---------------------------
3301 void SuperWord::print_pack(Node_List* p) {
3302   for (uint i = 0; i < p->size(); i++) {
3303     print_stmt(p->at(i));
3304   }
3305 }
3306 
3307 //------------------------------print_bb---------------------------
3308 void SuperWord::print_bb() {
3309 #ifndef PRODUCT
3310   tty->print_cr("\nBlock");
3311   for (int i = 0; i < _block.length(); i++) {
3312     Node* n = _block.at(i);
3313     tty->print("%d ", i);
3314     if (n) {
3315       n->dump();
3316     }
3317   }
3318 #endif
3319 }
3320 
3321 //------------------------------print_stmt---------------------------
3322 void SuperWord::print_stmt(Node* s) {
3323 #ifndef PRODUCT
3324   tty->print(" align: %d \t", alignment(s));
3325   s->dump();
3326 #endif
3327 }
3328 
3329 //------------------------------blank---------------------------
3330 char* SuperWord::blank(uint depth) {
3331   static char blanks[101];
3332   assert(depth < 101, "too deep");
3333   for (uint i = 0; i < depth; i++) blanks[i] = ' ';
3334   blanks[depth] = '\0';
3335   return blanks;
3336 }
3337 
3338 
3339 //==============================SWPointer===========================
3340 #ifndef PRODUCT
3341 int SWPointer::Tracer::_depth = 0;
3342 #endif
3343 //----------------------------SWPointer------------------------
3344 SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
3345   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
3346   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
3347   _nstack(nstack), _analyze_only(analyze_only),
3348   _stack_idx(0)
3349 #ifndef PRODUCT
3350   , _tracer(slp)
3351 #endif
3352 {
3353   NOT_PRODUCT(_tracer.ctor_1(mem);)
3354 
3355   Node* adr = mem->in(MemNode::Address);
3356   if (!adr->is_AddP()) {
3357     assert(!valid(), "too complex");
3358     return;
3359   }
3360   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
3361   Node* base = adr->in(AddPNode::Base);
3362   // The base address should be loop invariant
3363   if (!invariant(base)) {
3364     assert(!valid(), "base address is loop variant");
3365     return;
3366   }
3367   //unsafe reference could not be aligned appropriately without runtime checking
3368   if (base == NULL || base->bottom_type() == Type::TOP) {
3369     assert(!valid(), "unsafe access");
3370     return;
3371   }
3372 
3373   NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
3374   NOT_PRODUCT(_tracer.ctor_2(adr);)
3375 
3376   int i;
3377   for (i = 0; i < 3; i++) {
3378     NOT_PRODUCT(_tracer.ctor_3(adr, i);)
3379 
3380     if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
3381       assert(!valid(), "too complex");
3382       return;
3383     }
3384     adr = adr->in(AddPNode::Address);
3385     NOT_PRODUCT(_tracer.ctor_4(adr, i);)
3386 
3387     if (base == adr || !adr->is_AddP()) {
3388       NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
3389       break; // stop looking at addp's
3390     }
3391   }
3392   NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
3393   NOT_PRODUCT(_tracer.ctor_6(mem);)
3394 
3395   _base = base;
3396   _adr  = adr;
3397   assert(valid(), "Usable");
3398 }
3399 
3400 // Following is used to create a temporary object during
3401 // the pattern match of an address expression.
3402 SWPointer::SWPointer(SWPointer* p) :
3403   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
3404   _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
3405   _nstack(p->_nstack), _analyze_only(p->_analyze_only),
3406   _stack_idx(p->_stack_idx)
3407   #ifndef PRODUCT
3408   , _tracer(p->_slp)
3409   #endif
3410 {}
3411 
3412 
3413 bool SWPointer::invariant(Node* n) {
3414   NOT_PRODUCT(Tracer::Depth dd;)
3415   Node *n_c = phase()->get_ctrl(n);
3416   NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
3417   return !lpt()->is_member(phase()->get_loop(n_c));
3418 }
3419 //------------------------scaled_iv_plus_offset--------------------
3420 // Match: k*iv + offset
3421 // where: k is a constant that maybe zero, and
3422 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
3423 bool SWPointer::scaled_iv_plus_offset(Node* n) {
3424   NOT_PRODUCT(Tracer::Depth ddd;)
3425   NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)
3426 
3427   if (scaled_iv(n)) {
3428     NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
3429     return true;
3430   }
3431 
3432   if (offset_plus_k(n)) {
3433     NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
3434     return true;
3435   }
3436 
3437   int opc = n->Opcode();
3438   if (opc == Op_AddI) {
3439     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
3440       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
3441       return true;
3442     }
3443     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
3444       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
3445       return true;
3446     }
3447   } else if (opc == Op_SubI) {
3448     if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2), true)) {
3449       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
3450       return true;
3451     }
3452     if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
3453       _scale *= -1;
3454       NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
3455       return true;
3456     }
3457   }
3458 
3459   NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
3460   return false;
3461 }
3462 
3463 //----------------------------scaled_iv------------------------
3464 // Match: k*iv where k is a constant that's not zero
3465 bool SWPointer::scaled_iv(Node* n) {
3466   NOT_PRODUCT(Tracer::Depth ddd;)
3467   NOT_PRODUCT(_tracer.scaled_iv_1(n);)
3468 
3469   if (_scale != 0) { // already found a scale
3470     NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
3471     return false;
3472   }
3473 
3474   if (n == iv()) {
3475     _scale = 1;
3476     NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
3477     return true;
3478   }
3479   if (_analyze_only && (invariant(n) == false)) {
3480     _nstack->push(n, _stack_idx++);
3481   }
3482 
3483   int opc = n->Opcode();
3484   if (opc == Op_MulI) {
3485     if (n->in(1) == iv() && n->in(2)->is_Con()) {
3486       _scale = n->in(2)->get_int();
3487       NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
3488       return true;
3489     } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
3490       _scale = n->in(1)->get_int();
3491       NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
3492       return true;
3493     }
3494   } else if (opc == Op_LShiftI) {
3495     if (n->in(1) == iv() && n->in(2)->is_Con()) {
3496       _scale = 1 << n->in(2)->get_int();
3497       NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
3498       return true;
3499     }
3500   } else if (opc == Op_ConvI2L) {
3501     if (n->in(1)->Opcode() == Op_CastII &&
3502         n->in(1)->as_CastII()->has_range_check()) {
3503       // Skip range check dependent CastII nodes
3504       n = n->in(1);
3505     }
3506     if (scaled_iv_plus_offset(n->in(1))) {
3507       NOT_PRODUCT(_tracer.scaled_iv_7(n);)
3508       return true;
3509     }
3510   } else if (opc == Op_LShiftL) {
3511     if (!has_iv() && _invar == NULL) {
3512       // Need to preserve the current _offset value, so
3513       // create a temporary object for this expression subtree.
3514       // Hacky, so should re-engineer the address pattern match.
3515       NOT_PRODUCT(Tracer::Depth dddd;)
3516       SWPointer tmp(this);
3517       NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
3518 
3519       if (tmp.scaled_iv_plus_offset(n->in(1))) {
3520         if (tmp._invar == NULL || _slp->do_vector_loop()) {
3521           int mult = 1 << n->in(2)->get_int();
3522           _scale   = tmp._scale  * mult;
3523           _offset += tmp._offset * mult;
3524           NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, mult);)
3525           return true;
3526         }
3527       }
3528     }
3529   }
3530   NOT_PRODUCT(_tracer.scaled_iv_10(n);)
3531   return false;
3532 }
3533 
3534 //----------------------------offset_plus_k------------------------
3535 // Match: offset is (k [+/- invariant])
3536 // where k maybe zero and invariant is optional, but not both.
3537 bool SWPointer::offset_plus_k(Node* n, bool negate) {
3538   NOT_PRODUCT(Tracer::Depth ddd;)
3539   NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
3540 
3541   int opc = n->Opcode();
3542   if (opc == Op_ConI) {
3543     _offset += negate ? -(n->get_int()) : n->get_int();
3544     NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
3545     return true;
3546   } else if (opc == Op_ConL) {
3547     // Okay if value fits into an int
3548     const TypeLong* t = n->find_long_type();
3549     if (t->higher_equal(TypeLong::INT)) {
3550       jlong loff = n->get_long();
3551       jint  off  = (jint)loff;
3552       _offset += negate ? -off : loff;
3553       NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
3554       return true;
3555     }
3556     NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
3557     return false;
3558   }
3559   if (_invar != NULL) { // already has an invariant
3560     NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
3561     return false;
3562   }
3563 
3564   if (_analyze_only && (invariant(n) == false)) {
3565     _nstack->push(n, _stack_idx++);
3566   }
3567   if (opc == Op_AddI) {
3568     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3569       _negate_invar = negate;
3570       _invar = n->in(1);
3571       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3572       NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
3573       return true;
3574     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3575       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3576       _negate_invar = negate;
3577       _invar = n->in(2);
3578       NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
3579       return true;
3580     }
3581   }
3582   if (opc == Op_SubI) {
3583     if (n->in(2)->is_Con() && invariant(n->in(1))) {
3584       _negate_invar = negate;
3585       _invar = n->in(1);
3586       _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
3587       NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
3588       return true;
3589     } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
3590       _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
3591       _negate_invar = !negate;
3592       _invar = n->in(2);
3593       NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
3594       return true;
3595     }
3596   }
3597   if (invariant(n)) {
3598     if (opc == Op_ConvI2L) {
3599       n = n->in(1);
3600       if (n->Opcode() == Op_CastII &&
3601           n->as_CastII()->has_range_check()) {
3602         // Skip range check dependent CastII nodes
3603         assert(invariant(n), "sanity");
3604         n = n->in(1);
3605       }
3606     }
3607     if (n->bottom_type()->isa_int()) {
3608       _negate_invar = negate;
3609       _invar = n;
3610       NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
3611       return true;
3612     }
3613   }
3614 
3615   NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
3616   return false;
3617 }
3618 
3619 //----------------------------print------------------------
3620 void SWPointer::print() {
3621 #ifndef PRODUCT
3622   tty->print("base: %d  adr: %d  scale: %d  offset: %d  invar: %c%d\n",
3623              _base != NULL ? _base->_idx : 0,
3624              _adr  != NULL ? _adr->_idx  : 0,
3625              _scale, _offset,
3626              _negate_invar?'-':'+',
3627              _invar != NULL ? _invar->_idx : 0);
3628 #endif
3629 }
3630 
3631 //----------------------------tracing------------------------
3632 #ifndef PRODUCT
3633 void SWPointer::Tracer::print_depth() {
3634   for (int ii = 0; ii<_depth; ++ii) tty->print("  ");
3635 }
3636 
3637 void SWPointer::Tracer::ctor_1 (Node* mem) {
3638   if(_slp->is_trace_alignment()) {
3639     print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
3640   }
3641 }
3642 
3643 void SWPointer::Tracer::ctor_2(Node* adr) {
3644   if(_slp->is_trace_alignment()) {
3645     //store_depth();
3646     inc_depth();
3647     print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
3648     inc_depth();
3649     print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
3650   }
3651 }
3652 
3653 void SWPointer::Tracer::ctor_3(Node* adr, int i) {
3654   if(_slp->is_trace_alignment()) {
3655     inc_depth();
3656     Node* offset = adr->in(AddPNode::Offset);
3657     print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
3658   }
3659 }
3660 
3661 void SWPointer::Tracer::ctor_4(Node* adr, int i) {
3662   if(_slp->is_trace_alignment()) {
3663     inc_depth();
3664     print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
3665   }
3666 }
3667 
3668 void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
3669   if(_slp->is_trace_alignment()) {
3670     inc_depth();
3671     if (base == adr) {
3672       print_depth(); tty->print_cr("  \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
3673     } else if (!adr->is_AddP()) {
3674       print_depth(); tty->print_cr("  \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
3675     }
3676   }
3677 }
3678 
3679 void SWPointer::Tracer::ctor_6(Node* mem) {
3680   if(_slp->is_trace_alignment()) {
3681     //restore_depth();
3682     print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
3683   }
3684 }
3685 
3686 void SWPointer::Tracer::invariant_1(Node *n, Node *n_c) {
3687   if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
3688     int is_member =  _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
3689     int in_bb     =  _slp->in_bb(n);
3690     print_depth(); tty->print("  \\ ");  tty->print_cr(" %d SWPointer::invariant  conditions differ: n_c %d", n->_idx, n_c->_idx);
3691     print_depth(); tty->print("  \\ ");  tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
3692     print_depth(); tty->print("  \\ ");  n->dump();
3693     print_depth(); tty->print("  \\ ");  n_c->dump();
3694   }
3695 }
3696 
3697 void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
3698   if(_slp->is_trace_alignment()) {
3699     print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
3700     n->dump();
3701   }
3702 }
3703 
3704 void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
3705   if(_slp->is_trace_alignment()) {
3706     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3707   }
3708 }
3709 
3710 void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
3711   if(_slp->is_trace_alignment()) {
3712     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3713   }
3714 }
3715 
3716 void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
3717   if(_slp->is_trace_alignment()) {
3718     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3719     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
3720     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
3721   }
3722 }
3723 
3724 void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
3725   if(_slp->is_trace_alignment()) {
3726     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3727     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
3728     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
3729   }
3730 }
3731 
3732 void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
3733   if(_slp->is_trace_alignment()) {
3734     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3735     print_depth(); tty->print("  \\  %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump();
3736     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump();
3737   }
3738 }
3739 
3740 void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
3741   if(_slp->is_trace_alignment()) {
3742     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3743     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump();
3744     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump();
3745   }
3746 }
3747 
3748 void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
3749   if(_slp->is_trace_alignment()) {
3750     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
3751   }
3752 }
3753 
3754 void SWPointer::Tracer::scaled_iv_1(Node* n) {
3755   if(_slp->is_trace_alignment()) {
3756     print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
3757   }
3758 }
3759 
3760 void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
3761   if(_slp->is_trace_alignment()) {
3762     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
3763     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
3764   }
3765 }
3766 
3767 void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
3768   if(_slp->is_trace_alignment()) {
3769     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
3770   }
3771 }
3772 
3773 void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
3774   if(_slp->is_trace_alignment()) {
3775     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
3776     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
3777     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3778   }
3779 }
3780 
3781 void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
3782   if(_slp->is_trace_alignment()) {
3783     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
3784     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump();
3785     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3786   }
3787 }
3788 
3789 void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
3790   if(_slp->is_trace_alignment()) {
3791     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
3792     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump();
3793     print_depth(); tty->print("  \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3794   }
3795 }
3796 
3797 void SWPointer::Tracer::scaled_iv_7(Node* n) {
3798   if(_slp->is_trace_alignment()) {
3799     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
3800     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx);
3801     inc_depth(); inc_depth();
3802     print_depth(); n->in(1)->dump();
3803     dec_depth(); dec_depth();
3804   }
3805 }
3806 
3807 void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
3808   if(_slp->is_trace_alignment()) {
3809     print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
3810   }
3811 }
3812 
3813 void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int _offset, int mult) {
3814   if(_slp->is_trace_alignment()) {
3815     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, _offset);
3816     print_depth(); tty->print_cr("  \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset, in(2) %d used to get mult = %d: _scale = %d, _offset = %d",
3817     n->in(1)->_idx, n->in(2)->_idx, mult, scale, _offset);
3818     inc_depth(); inc_depth();
3819     print_depth(); n->in(1)->dump();
3820     print_depth(); n->in(2)->dump();
3821     dec_depth(); dec_depth();
3822   }
3823 }
3824 
3825 void SWPointer::Tracer::scaled_iv_10(Node* n) {
3826   if(_slp->is_trace_alignment()) {
3827     print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
3828   }
3829 }
3830 
3831 void SWPointer::Tracer::offset_plus_k_1(Node* n) {
3832   if(_slp->is_trace_alignment()) {
3833     print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
3834   }
3835 }
3836 
3837 void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
3838   if(_slp->is_trace_alignment()) {
3839     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
3840   }
3841 }
3842 
3843 void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
3844   if(_slp->is_trace_alignment()) {
3845     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
3846   }
3847 }
3848 
3849 void SWPointer::Tracer::offset_plus_k_4(Node* n) {
3850   if(_slp->is_trace_alignment()) {
3851     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
3852     print_depth(); tty->print_cr("  \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
3853   }
3854 }
3855 
3856 void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
3857   if(_slp->is_trace_alignment()) {
3858     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
3859     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
3860   }
3861 }
3862 
3863 void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3864   if(_slp->is_trace_alignment()) {
3865     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3866     n->_idx, _negate_invar, _invar->_idx, _offset);
3867     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3868     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
3869   }
3870 }
3871 
3872 void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3873   if(_slp->is_trace_alignment()) {
3874     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3875     n->_idx, _negate_invar, _invar->_idx, _offset);
3876     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3877     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
3878   }
3879 }
3880 
3881 void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3882   if(_slp->is_trace_alignment()) {
3883     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
3884     n->_idx, _negate_invar, _invar->_idx, _offset);
3885     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump();
3886     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
3887   }
3888 }
3889 
3890 void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3891   if(_slp->is_trace_alignment()) {
3892     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
3893     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump();
3894     print_depth(); tty->print("  \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
3895   }
3896 }
3897 
3898 void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
3899   if(_slp->is_trace_alignment()) {
3900     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
3901     print_depth(); tty->print_cr("  \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
3902   }
3903 }
3904 
3905 void SWPointer::Tracer::offset_plus_k_11(Node* n) {
3906   if(_slp->is_trace_alignment()) {
3907     print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
3908   }
3909 }
3910 
3911 #endif
3912 // ========================= OrderedPair =====================
3913 
3914 const OrderedPair OrderedPair::initial;
3915 
3916 // ========================= SWNodeInfo =====================
3917 
3918 const SWNodeInfo SWNodeInfo::initial;
3919 
3920 
3921 // ============================ DepGraph ===========================
3922 
3923 //------------------------------make_node---------------------------
3924 // Make a new dependence graph node for an ideal node.
3925 DepMem* DepGraph::make_node(Node* node) {
3926   DepMem* m = new (_arena) DepMem(node);
3927   if (node != NULL) {
3928     assert(_map.at_grow(node->_idx) == NULL, "one init only");
3929     _map.at_put_grow(node->_idx, m);
3930   }
3931   return m;
3932 }
3933 
3934 //------------------------------make_edge---------------------------
3935 // Make a new dependence graph edge from dpred -> dsucc
3936 DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
3937   DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head());
3938   dpred->set_out_head(e);
3939   dsucc->set_in_head(e);
3940   return e;
3941 }
3942 
3943 // ========================== DepMem ========================
3944 
3945 //------------------------------in_cnt---------------------------
3946 int DepMem::in_cnt() {
3947   int ct = 0;
3948   for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
3949   return ct;
3950 }
3951 
3952 //------------------------------out_cnt---------------------------
3953 int DepMem::out_cnt() {
3954   int ct = 0;
3955   for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
3956   return ct;
3957 }
3958 
3959 //------------------------------print-----------------------------
3960 void DepMem::print() {
3961 #ifndef PRODUCT
3962   tty->print("  DepNode %d (", _node->_idx);
3963   for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
3964     Node* pred = p->pred()->node();
3965     tty->print(" %d", pred != NULL ? pred->_idx : 0);
3966   }
3967   tty->print(") [");
3968   for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
3969     Node* succ = s->succ()->node();
3970     tty->print(" %d", succ != NULL ? succ->_idx : 0);
3971   }
3972   tty->print_cr(" ]");
3973 #endif
3974 }
3975 
3976 // =========================== DepEdge =========================
3977 
3978 //------------------------------DepPreds---------------------------
3979 void DepEdge::print() {
3980 #ifndef PRODUCT
3981   tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
3982 #endif
3983 }
3984 
3985 // =========================== DepPreds =========================
3986 // Iterator over predecessor edges in the dependence graph.
3987 
3988 //------------------------------DepPreds---------------------------
3989 DepPreds::DepPreds(Node* n, DepGraph& dg) {
3990   _n = n;
3991   _done = false;
3992   if (_n->is_Store() || _n->is_Load()) {
3993     _next_idx = MemNode::Address;
3994     _end_idx  = n->req();
3995     _dep_next = dg.dep(_n)->in_head();
3996   } else if (_n->is_Mem()) {
3997     _next_idx = 0;
3998     _end_idx  = 0;
3999     _dep_next = dg.dep(_n)->in_head();
4000   } else {
4001     _next_idx = 1;
4002     _end_idx  = _n->req();
4003     _dep_next = NULL;
4004   }
4005   next();
4006 }
4007 
4008 //------------------------------next---------------------------
4009 void DepPreds::next() {
4010   if (_dep_next != NULL) {
4011     _current  = _dep_next->pred()->node();
4012     _dep_next = _dep_next->next_in();
4013   } else if (_next_idx < _end_idx) {
4014     _current  = _n->in(_next_idx++);
4015   } else {
4016     _done = true;
4017   }
4018 }
4019 
4020 // =========================== DepSuccs =========================
4021 // Iterator over successor edges in the dependence graph.
4022 
4023 //------------------------------DepSuccs---------------------------
4024 DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
4025   _n = n;
4026   _done = false;
4027   if (_n->is_Load()) {
4028     _next_idx = 0;
4029     _end_idx  = _n->outcnt();
4030     _dep_next = dg.dep(_n)->out_head();
4031   } else if (_n->is_Mem() || _n->is_Phi() && _n->bottom_type() == Type::MEMORY) {
4032     _next_idx = 0;
4033     _end_idx  = 0;
4034     _dep_next = dg.dep(_n)->out_head();
4035   } else {
4036     _next_idx = 0;
4037     _end_idx  = _n->outcnt();
4038     _dep_next = NULL;
4039   }
4040   next();
4041 }
4042 
4043 //-------------------------------next---------------------------
4044 void DepSuccs::next() {
4045   if (_dep_next != NULL) {
4046     _current  = _dep_next->succ()->node();
4047     _dep_next = _dep_next->next_out();
4048   } else if (_next_idx < _end_idx) {
4049     _current  = _n->raw_out(_next_idx++);
4050   } else {
4051     _done = true;
4052   }
4053 }
4054 
4055 //
4056 // --------------------------------- vectorization/simd -----------------------------------
4057 //
4058 bool SuperWord::same_origin_idx(Node* a, Node* b) const {
4059   return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
4060 }
4061 bool SuperWord::same_generation(Node* a, Node* b) const {
4062   return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
4063 }
4064 
4065 Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
4066   assert(in_bb(ld), "must be in block");
4067   if (_clone_map.gen(ld->_idx) == _ii_first) {
4068 #ifndef PRODUCT
4069     if (_vector_loop_debug) {
4070       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
4071         _clone_map.gen(ld->_idx));
4072     }
4073 #endif
4074     return NULL; //we think that any ld in the first gen being vectorizable
4075   }
4076 
4077   Node* mem = ld->in(MemNode::Memory);
4078   if (mem->outcnt() <= 1) {
4079     // we don't want to remove the only edge from mem node to load
4080 #ifndef PRODUCT
4081     if (_vector_loop_debug) {
4082       tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
4083         mem->_idx, ld->_idx);
4084       ld->dump();
4085       mem->dump();
4086     }
4087 #endif
4088     return NULL;
4089   }
4090   if (!in_bb(mem) || same_generation(mem, ld)) {
4091 #ifndef PRODUCT
4092     if (_vector_loop_debug) {
4093       tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
4094         _clone_map.gen(mem->_idx));
4095     }
4096 #endif
4097     return NULL; // does not depend on loop volatile node or depends on the same generation
4098   }
4099 
4100   //otherwise first node should depend on mem-phi
4101   Node* first = first_node(ld);
4102   assert(first->is_Load(), "must be Load");
4103   Node* phi = first->as_Load()->in(MemNode::Memory);
4104   if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
4105 #ifndef PRODUCT
4106     if (_vector_loop_debug) {
4107       tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
4108       ld->dump();
4109       first->dump();
4110     }
4111 #endif
4112     return NULL;
4113   }
4114 
4115   Node* tail = 0;
4116   for (int m = 0; m < _mem_slice_head.length(); m++) {
4117     if (_mem_slice_head.at(m) == phi) {
4118       tail = _mem_slice_tail.at(m);
4119     }
4120   }
4121   if (tail == 0) { //test that found phi is in the list  _mem_slice_head
4122 #ifndef PRODUCT
4123     if (_vector_loop_debug) {
4124       tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
4125         ld->_idx, phi->_idx);
4126       ld->dump();
4127       phi->dump();
4128     }
4129 #endif
4130     return NULL;
4131   }
4132 
4133   // now all conditions are met
4134   return phi;
4135 }
4136 
4137 Node* SuperWord::first_node(Node* nd) {
4138   for (int ii = 0; ii < _iteration_first.length(); ii++) {
4139     Node* nnn = _iteration_first.at(ii);
4140     if (same_origin_idx(nnn, nd)) {
4141 #ifndef PRODUCT
4142       if (_vector_loop_debug) {
4143         tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
4144           nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
4145       }
4146 #endif
4147       return nnn;
4148     }
4149   }
4150 
4151 #ifndef PRODUCT
4152   if (_vector_loop_debug) {
4153     tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
4154       nd->_idx, _clone_map.idx(nd->_idx));
4155   }
4156 #endif
4157   return 0;
4158 }
4159 
4160 Node* SuperWord::last_node(Node* nd) {
4161   for (int ii = 0; ii < _iteration_last.length(); ii++) {
4162     Node* nnn = _iteration_last.at(ii);
4163     if (same_origin_idx(nnn, nd)) {
4164 #ifndef PRODUCT
4165       if (_vector_loop_debug) {
4166         tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
4167           _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
4168       }
4169 #endif
4170       return nnn;
4171     }
4172   }
4173   return 0;
4174 }
4175 
4176 int SuperWord::mark_generations() {
4177   Node *ii_err = NULL, *tail_err = NULL;
4178   for (int i = 0; i < _mem_slice_head.length(); i++) {
4179     Node* phi  = _mem_slice_head.at(i);
4180     assert(phi->is_Phi(), "must be phi");
4181 
4182     Node* tail = _mem_slice_tail.at(i);
4183     if (_ii_last == -1) {
4184       tail_err = tail;
4185       _ii_last = _clone_map.gen(tail->_idx);
4186     }
4187     else if (_ii_last != _clone_map.gen(tail->_idx)) {
4188 #ifndef PRODUCT
4189       if (TraceSuperWord && Verbose) {
4190         tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
4191         tail->dump();
4192         tail_err->dump();
4193       }
4194 #endif
4195       return -1;
4196     }
4197 
4198     // find first iteration in the loop
4199     for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
4200       Node* ii = phi->fast_out(i);
4201       if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
4202         if (_ii_first == -1) {
4203           ii_err = ii;
4204           _ii_first = _clone_map.gen(ii->_idx);
4205         } else if (_ii_first != _clone_map.gen(ii->_idx)) {
4206 #ifndef PRODUCT
4207           if (TraceSuperWord && Verbose) {
4208             tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
4209             ii->dump();
4210             if (ii_err!= 0) {
4211               ii_err->dump();
4212             }
4213           }
4214 #endif
4215           return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
4216         }
4217       }
4218     }//for (DUIterator_Fast imax,
4219   }//for (int i...
4220 
4221   if (_ii_first == -1 || _ii_last == -1) {
4222     if (TraceSuperWord && Verbose) {
4223       tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
4224     }
4225     return -1; // something vent wrong
4226   }
4227   // collect nodes in the first and last generations
4228   assert(_iteration_first.length() == 0, "_iteration_first must be empty");
4229   assert(_iteration_last.length() == 0, "_iteration_last must be empty");
4230   for (int j = 0; j < _block.length(); j++) {
4231     Node* n = _block.at(j);
4232     node_idx_t gen = _clone_map.gen(n->_idx);
4233     if ((signed)gen == _ii_first) {
4234       _iteration_first.push(n);
4235     } else if ((signed)gen == _ii_last) {
4236       _iteration_last.push(n);
4237     }
4238   }
4239 
4240   // building order of iterations
4241   if (_ii_order.length() == 0 && ii_err != 0) {
4242     assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
4243     Node* nd = ii_err;
4244     while(_clone_map.gen(nd->_idx) != _ii_last) {
4245       _ii_order.push(_clone_map.gen(nd->_idx));
4246       bool found = false;
4247       for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
4248         Node* use = nd->fast_out(i);
4249         if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
4250           found = true;
4251           nd = use;
4252           break;
4253         }
4254       }//for
4255 
4256       if (found == false) {
4257         if (TraceSuperWord && Verbose) {
4258           tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
4259         }
4260         _ii_order.clear();
4261         return -1;
4262       }
4263     } //while
4264     _ii_order.push(_clone_map.gen(nd->_idx));
4265   }
4266 
4267 #ifndef PRODUCT
4268   if (_vector_loop_debug) {
4269     tty->print_cr("SuperWord::mark_generations");
4270     tty->print_cr("First generation (%d) nodes:", _ii_first);
4271     for (int ii = 0; ii < _iteration_first.length(); ii++)  _iteration_first.at(ii)->dump();
4272     tty->print_cr("Last generation (%d) nodes:", _ii_last);
4273     for (int ii = 0; ii < _iteration_last.length(); ii++)  _iteration_last.at(ii)->dump();
4274     tty->print_cr(" ");
4275 
4276     tty->print("SuperWord::List of generations: ");
4277     for (int jj = 0; jj < _ii_order.length(); ++jj) {
4278       tty->print("%d:%d ", jj, _ii_order.at(jj));
4279     }
4280     tty->print_cr(" ");
4281   }
4282 #endif
4283 
4284   return _ii_first;
4285 }
4286 
4287 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
4288   assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
4289   assert(same_origin_idx(gold, fix), "should be clones of the same node");
4290   Node* gin1 = gold->in(1);
4291   Node* gin2 = gold->in(2);
4292   Node* fin1 = fix->in(1);
4293   Node* fin2 = fix->in(2);
4294   bool swapped = false;
4295 
4296   if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
4297     if (same_origin_idx(gin1, fin1) &&
4298         same_origin_idx(gin2, fin2)) {
4299       return true; // nothing to fix
4300     }
4301     if (same_origin_idx(gin1, fin2) &&
4302         same_origin_idx(gin2, fin1)) {
4303       fix->swap_edges(1, 2);
4304       swapped = true;
4305     }
4306   }
4307   // at least one input comes from outside of bb
4308   if (gin1->_idx == fin1->_idx)  {
4309     return true; // nothing to fix
4310   }
4311   if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx))  { //swapping is expensive, check condition first
4312     fix->swap_edges(1, 2);
4313     swapped = true;
4314   }
4315 
4316   if (swapped) {
4317 #ifndef PRODUCT
4318     if (_vector_loop_debug) {
4319       tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
4320     }
4321 #endif
4322     return true;
4323   }
4324 
4325   if (TraceSuperWord && Verbose) {
4326     tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
4327   }
4328 
4329   return false;
4330 }
4331 
4332 bool SuperWord::pack_parallel() {
4333 #ifndef PRODUCT
4334   if (_vector_loop_debug) {
4335     tty->print_cr("SuperWord::pack_parallel: START");
4336   }
4337 #endif
4338 
4339   _packset.clear();
4340 
4341   for (int ii = 0; ii < _iteration_first.length(); ii++) {
4342     Node* nd = _iteration_first.at(ii);
4343     if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
4344       Node_List* pk = new Node_List();
4345       pk->push(nd);
4346       for (int gen = 1; gen < _ii_order.length(); ++gen) {
4347         for (int kk = 0; kk < _block.length(); kk++) {
4348           Node* clone = _block.at(kk);
4349           if (same_origin_idx(clone, nd) &&
4350               _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
4351             if (nd->is_Add() || nd->is_Mul()) {
4352               fix_commutative_inputs(nd, clone);
4353             }
4354             pk->push(clone);
4355             if (pk->size() == 4) {
4356               _packset.append(pk);
4357 #ifndef PRODUCT
4358               if (_vector_loop_debug) {
4359                 tty->print_cr("SuperWord::pack_parallel: added pack ");
4360                 pk->dump();
4361               }
4362 #endif
4363               if (_clone_map.gen(clone->_idx) != _ii_last) {
4364                 pk = new Node_List();
4365               }
4366             }
4367             break;
4368           }
4369         }
4370       }//for
4371     }//if
4372   }//for
4373 
4374 #ifndef PRODUCT
4375   if (_vector_loop_debug) {
4376     tty->print_cr("SuperWord::pack_parallel: END");
4377   }
4378 #endif
4379 
4380   return true;
4381 }
4382 
4383 bool SuperWord::hoist_loads_in_graph() {
4384   GrowableArray<Node*> loads;
4385 
4386 #ifndef PRODUCT
4387   if (_vector_loop_debug) {
4388     tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
4389   }
4390 #endif
4391 
4392   for (int i = 0; i < _mem_slice_head.length(); i++) {
4393     Node* n = _mem_slice_head.at(i);
4394     if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
4395       if (TraceSuperWord && Verbose) {
4396         tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
4397       }
4398       continue;
4399     }
4400 
4401 #ifndef PRODUCT
4402     if (_vector_loop_debug) {
4403       tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d  = _mem_slice_head.at(%d);", n->_idx, i);
4404     }
4405 #endif
4406 
4407     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
4408       Node* ld = n->fast_out(i);
4409       if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
4410         for (int i = 0; i < _block.length(); i++) {
4411           Node* ld2 = _block.at(i);
4412           if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
4413               !same_generation(ld, ld2)) { // <= do not collect the first generation ld
4414 #ifndef PRODUCT
4415             if (_vector_loop_debug) {
4416               tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
4417                 ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
4418             }
4419 #endif
4420             // could not do on-the-fly, since iterator is immutable
4421             loads.push(ld2);
4422           }
4423         }// for
4424       }//if
4425     }//for (DUIterator_Fast imax,
4426   }//for (int i = 0; i
4427 
4428   for (int i = 0; i < loads.length(); i++) {
4429     LoadNode* ld = loads.at(i)->as_Load();
4430     Node* phi = find_phi_for_mem_dep(ld);
4431     if (phi != NULL) {
4432 #ifndef PRODUCT
4433       if (_vector_loop_debug) {
4434         tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
4435           MemNode::Memory, ld->_idx, phi->_idx);
4436       }
4437 #endif
4438       _igvn.replace_input_of(ld, MemNode::Memory, phi);
4439     }
4440   }//for
4441 
4442   restart(); // invalidate all basic structures, since we rebuilt the graph
4443 
4444   if (TraceSuperWord && Verbose) {
4445     tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
4446   }
4447 
4448   return true;
4449 }
4450