51 _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
52 _block(arena(), 8, 0, NULL), // nodes in current block
53 _data_entry(arena(), 8, 0, NULL), // nodes with all inputs from outside
54 _mem_slice_head(arena(), 8, 0, NULL), // memory slice heads
55 _mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails
56 _node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
57 _clone_map(phase->C->clone_map()), // map of nodes created in cloning
58 _align_to_ref(NULL), // memory reference to align vectors to
59 _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial), // runtime disambiguated pointer pairs
60 _dg(_arena), // dependence graph
61 _visited(arena()), // visited node set
62 _post_visited(arena()), // post visited node set
63 _n_idx_list(arena(), 8), // scratch list of (node,index) pairs
64 _stk(arena(), 8, 0, NULL), // scratch stack of nodes
65 _nlist(arena(), 8, 0, NULL), // scratch list of nodes
66 _lpt(NULL), // loop tree node
67 _lp(NULL), // LoopNode
68 _bb(NULL), // basic block
69 _iv(NULL), // induction var
70 _race_possible(false), // cases where SDMU is true
71 _num_work_vecs(0), // amount of vector work we have
72 _num_reductions(0), // amount of reduction work we have
73 _do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
74 _ii_first(-1), // first loop generation index - only if do_vector_loop()
75 _ii_last(-1), // last loop generation index - only if do_vector_loop()
76 _ii_order(arena(), 8, 0, 0),
77 _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
78 {}
79
//------------------------------transform_loop---------------------------
// Attempt SLP vectorization of a counted loop.  Runs a series of cheap
// legality checks, bailing out (returning) as soon as one fails; if they
// all pass, per-loop state is (re)initialized and SLP_extract() performs
// the actual superword transformation.
void SuperWord::transform_loop(IdealLoopTree* lpt) {
  assert(UseSuperWord, "should be");
  // Do vectors exist on this architecture?
  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop

  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl_exit->in(0) != lpt->_head) return;

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
  if (pre_end == NULL) return;
  Node *pre_opaq1 = pre_end->limit();
  if (pre_opaq1->Opcode() != Op_Opaque1) return;

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  assert(_packset.length() == 0, "packset must be empty");
  SLP_extract();
}
119
120 //------------------------------SLP_extract---------------------------
121 // Extract the superword level parallelism
122 //
123 // 1) A reverse post-order of nodes in the block is constructed. By scanning
124 // this list from first to last, all definitions are visited before their uses.
125 //
126 // 2) A point-to-point dependence graph is constructed between memory references.
// This simplifies the upcoming "independence" checker.
128 //
129 // 3) The maximum depth in the node graph from the beginning of the block
130 // to each node is computed. This is used to prune the graph search
131 // in the independence checker.
132 //
133 // 4) For integer types, the necessary bit width is propagated backwards
134 // from stores to allow packed operations on byte, char, and short
135 // integers. This reverses the promotion to type "int" that javac
136 // did for operations like: char c1,c2,c3; c1 = c2 + c3.
137 //
251
252 Node_List align_to_refs;
253 int best_iv_adjustment = 0;
254 MemNode* best_align_to_mem_ref = NULL;
255
256 while (memops.size() != 0) {
257 // Find a memory reference to align to.
258 MemNode* mem_ref = find_align_to_ref(memops);
259 if (mem_ref == NULL) break;
260 align_to_refs.push(mem_ref);
261 int iv_adjustment = get_iv_adjustment(mem_ref);
262
263 if (best_align_to_mem_ref == NULL) {
264 // Set memory reference which is the best from all memory operations
265 // to be used for alignment. The pre-loop trip count is modified to align
266 // this reference to a vector-aligned address.
267 best_align_to_mem_ref = mem_ref;
268 best_iv_adjustment = iv_adjustment;
269 }
270
271 SWPointer align_to_ref_p(mem_ref, this);
272 // Set alignment relative to "align_to_ref" for all related memory operations.
273 for (int i = memops.size() - 1; i >= 0; i--) {
274 MemNode* s = memops.at(i)->as_Mem();
275 if (isomorphic(s, mem_ref)) {
276 SWPointer p2(s, this);
277 if (p2.comparable(align_to_ref_p)) {
278 int align = memory_alignment(s, iv_adjustment);
279 set_alignment(s, align);
280 }
281 }
282 }
283
284 // Create initial pack pairs of memory operations for which
285 // alignment is set and vectors will be aligned.
286 bool create_pack = true;
287 if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
288 if (!Matcher::misaligned_vectors_ok()) {
289 int vw = vector_width(mem_ref);
290 int vw_best = vector_width(best_align_to_mem_ref);
291 if (vw > vw_best) {
292 // Do not vectorize a memory access with more elements per vector
293 // if unaligned memory access is not allowed because number of
294 // iterations in pre-loop will be not enough to align it.
295 create_pack = false;
296 } else {
297 SWPointer p2(best_align_to_mem_ref, this);
298 if (align_to_ref_p.invar() != p2.invar()) {
299 // Do not vectorize memory accesses with different invariants
300 // if unaligned memory accesses are not allowed.
301 create_pack = false;
302 }
303 }
304 }
305 } else {
306 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
307 // Can't allow vectorization of unaligned memory accesses with the
308 // same type since it could be overlapped accesses to the same array.
309 create_pack = false;
310 } else {
311 // Allow independent (different type) unaligned memory operations
312 // if HW supports them.
313 if (!Matcher::misaligned_vectors_ok()) {
314 create_pack = false;
315 } else {
316 // Check if packs of the same memory type but
317 // with a different alignment were created before.
394 set_align_to_ref(best_align_to_mem_ref);
395
396 #ifndef PRODUCT
397 if (TraceSuperWord) {
398 tty->print_cr("\nAfter find_adjacent_refs");
399 print_packset();
400 }
401 #endif
402 }
403
404 //------------------------------find_align_to_ref---------------------------
405 // Find a memory reference to align the loop induction variable to.
406 // Looks first at stores then at loads, looking for a memory reference
407 // with the largest number of references similar to it.
408 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
409 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
410
411 // Count number of comparable memory ops
412 for (uint i = 0; i < memops.size(); i++) {
413 MemNode* s1 = memops.at(i)->as_Mem();
414 SWPointer p1(s1, this);
415 // Discard if pre loop can't align this reference
416 if (!ref_is_alignable(p1)) {
417 *cmp_ct.adr_at(i) = 0;
418 continue;
419 }
420 for (uint j = i+1; j < memops.size(); j++) {
421 MemNode* s2 = memops.at(j)->as_Mem();
422 if (isomorphic(s1, s2)) {
423 SWPointer p2(s2, this);
424 if (p1.comparable(p2)) {
425 (*cmp_ct.adr_at(i))++;
426 (*cmp_ct.adr_at(j))++;
427 }
428 }
429 }
430 }
431
432 // Find Store (or Load) with the greatest number of "comparable" references,
433 // biggest vector size, smallest data size and smallest iv offset.
434 int max_ct = 0;
435 int max_vw = 0;
436 int max_idx = -1;
437 int min_size = max_jint;
438 int min_iv_offset = max_jint;
439 for (uint j = 0; j < memops.size(); j++) {
440 MemNode* s = memops.at(j)->as_Mem();
441 if (s->is_Store()) {
442 int vw = vector_width_in_bytes(s);
443 assert(vw > 1, "sanity");
444 SWPointer p(s, this);
445 if (cmp_ct.at(j) > max_ct ||
446 cmp_ct.at(j) == max_ct &&
447 (vw > max_vw ||
448 vw == max_vw &&
449 (data_size(s) < min_size ||
450 data_size(s) == min_size &&
451 (p.offset_in_bytes() < min_iv_offset)))) {
452 max_ct = cmp_ct.at(j);
453 max_vw = vw;
454 max_idx = j;
455 min_size = data_size(s);
456 min_iv_offset = p.offset_in_bytes();
457 }
458 }
459 }
460 // If no stores, look at loads
461 if (max_ct == 0) {
462 for (uint j = 0; j < memops.size(); j++) {
463 MemNode* s = memops.at(j)->as_Mem();
464 if (s->is_Load()) {
465 int vw = vector_width_in_bytes(s);
466 assert(vw > 1, "sanity");
467 SWPointer p(s, this);
468 if (cmp_ct.at(j) > max_ct ||
469 cmp_ct.at(j) == max_ct &&
470 (vw > max_vw ||
471 vw == max_vw &&
472 (data_size(s) < min_size ||
473 data_size(s) == min_size &&
474 (p.offset_in_bytes() < min_iv_offset)))) {
475 max_ct = cmp_ct.at(j);
476 max_vw = vw;
477 max_idx = j;
478 min_size = data_size(s);
479 min_iv_offset = p.offset_in_bytes();
480 }
481 }
482 }
483 }
484
485 #ifdef ASSERT
486 if (TraceSuperWord && Verbose) {
487 tty->print_cr("\nVector memops after find_align_to_ref");
558 } else if (span % vw == 0) {
559 // If span is a multiple of vw, we can simplify formula (1) to:
560 // (init_offset + i * span) % vw == 0
561 // =>
562 // (init_offset % vw) + ((i * span) % vw) == 0
563 // =>
564 // init_offset % vw == 0
565 //
566 // Because we add a multiple of vw to the initial offset, the final
567 // offset is a multiple of vw if and only if init_offset is a multiple.
568 //
569 return (init_offset % vw) == 0;
570 }
571 }
572 return false;
573 }
574
575 //---------------------------get_iv_adjustment---------------------------
576 // Calculate loop's iv adjustment for this memory ops.
577 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
578 SWPointer align_to_ref_p(mem_ref, this);
579 int offset = align_to_ref_p.offset_in_bytes();
580 int scale = align_to_ref_p.scale_in_bytes();
581 int elt_size = align_to_ref_p.memory_size();
582 int vw = vector_width_in_bytes(mem_ref);
583 assert(vw > 1, "sanity");
584 int iv_adjustment;
585 if (scale != 0) {
586 int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
587 // At least one iteration is executed in pre-loop by default. As result
588 // several iterations are needed to align memory operations in main-loop even
589 // if offset is 0.
590 int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
591 assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
592 err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
593 iv_adjustment = iv_adjustment_in_bytes/elt_size;
594 } else {
595 // This memory op is not dependent on iv (scale == 0)
596 iv_adjustment = 0;
597 }
598
632 _nlist.at(j)->dump();
633 }
634 }
635 #endif
636 // Make the slice dependent on the root
637 DepMem* slice = _dg.dep(n);
638 _dg.make_edge(_dg.root(), slice);
639
640 // Create a sink for the slice
641 DepMem* slice_sink = _dg.make_node(NULL);
642 _dg.make_edge(slice_sink, _dg.tail());
643
644 // Now visit each pair of memory ops, creating the edges
645 for (int j = _nlist.length() - 1; j >= 0 ; j--) {
646 Node* s1 = _nlist.at(j);
647
648 // If no dependency yet, use slice
649 if (_dg.dep(s1)->in_cnt() == 0) {
650 _dg.make_edge(slice, s1);
651 }
652 SWPointer p1(s1->as_Mem(), this);
653 bool sink_dependent = true;
654 for (int k = j - 1; k >= 0; k--) {
655 Node* s2 = _nlist.at(k);
656 if (s1->is_Load() && s2->is_Load())
657 continue;
658 SWPointer p2(s2->as_Mem(), this);
659
660 int cmp = p1.cmp(p2);
661 if (SuperWordRTDepCheck &&
662 p1.base() != p2.base() && p1.valid() && p2.valid()) {
663 // Create a runtime check to disambiguate
664 OrderedPair pp(p1.base(), p2.base());
665 _disjoint_ptrs.append_if_missing(pp);
666 } else if (!SWPointer::not_equal(cmp)) {
667 // Possibly same address
668 _dg.make_edge(s1, s2);
669 sink_dependent = false;
670 }
671 }
672 if (sink_dependent) {
673 _dg.make_edge(s1, slice_sink);
674 }
675 }
676 #ifndef PRODUCT
677 if (TraceSuperWord) {
678 tty->print_cr("\nDependence graph for slice: %d", n->_idx);
778 return false;
779 }
780
781 //------------------------------are_adjacent_refs---------------------------
782 // Is s1 immediately before s2 in memory?
783 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
784 if (!s1->is_Mem() || !s2->is_Mem()) return false;
785 if (!in_bb(s1) || !in_bb(s2)) return false;
786
787 // Do not use superword for non-primitives
788 if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
789 !is_java_primitive(s2->as_Mem()->memory_type())) {
790 return false;
791 }
792
793 // FIXME - co_locate_pack fails on Stores in different mem-slices, so
794 // only pack memops that are in the same alias set until that's fixed.
795 if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
796 _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
797 return false;
798 SWPointer p1(s1->as_Mem(), this);
799 SWPointer p2(s2->as_Mem(), this);
800 if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
801 int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
802 return diff == data_size(s1);
803 }
804
805 //------------------------------isomorphic---------------------------
806 // Are s1 and s2 similar?
807 bool SuperWord::isomorphic(Node* s1, Node* s2) {
808 if (s1->Opcode() != s2->Opcode()) return false;
809 if (s1->req() != s2->req()) return false;
810 if (s1->in(0) != s2->in(0)) return false;
811 if (!same_velt_type(s1, s2)) return false;
812 return true;
813 }
814
815 //------------------------------independent---------------------------
816 // Is there no data path from s1 to s2 or s2 to s1?
817 bool SuperWord::independent(Node* s1, Node* s2) {
818 // assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
819 int d1 = depth(s1);
1598 // Insert extract (unpack) operations for scalar uses
1599 for (int i = 0; i < _packset.length(); i++) {
1600 insert_extracts(_packset.at(i));
1601 }
1602
1603 Compile* C = _phase->C;
1604 uint max_vlen_in_bytes = 0;
1605 for (int i = 0; i < _block.length(); i++) {
1606 Node* n = _block.at(i);
1607 Node_List* p = my_pack(n);
1608 if (p && n == executed_last(p)) {
1609 uint vlen = p->size();
1610 uint vlen_in_bytes = 0;
1611 Node* vn = NULL;
1612 Node* low_adr = p->at(0);
1613 Node* first = executed_first(p);
1614 int opc = n->Opcode();
1615 if (n->is_Load()) {
1616 Node* ctl = n->in(MemNode::Control);
1617 Node* mem = first->in(MemNode::Memory);
1618 SWPointer p1(n->as_Mem(), this);
1619 // Identify the memory dependency for the new loadVector node by
1620 // walking up through memory chain.
1621 // This is done to give flexibility to the new loadVector node so that
1622 // it can move above independent storeVector nodes.
1623 while (mem->is_StoreVector()) {
1624 SWPointer p2(mem->as_Mem(), this);
1625 int cmp = p1.cmp(p2);
1626 if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
1627 mem = mem->in(MemNode::Memory);
1628 } else {
1629 break; // dependent memory
1630 }
1631 }
1632 Node* adr = low_adr->in(MemNode::Address);
1633 const TypePtr* atyp = n->adr_type();
1634 vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1635 vlen_in_bytes = vn->as_LoadVector()->memory_size();
1636 } else if (n->is_Store()) {
1637 // Promote value to be stored to vector
1638 Node* val = vector_opd(p, MemNode::ValueIn);
1639 Node* ctl = n->in(MemNode::Control);
1640 Node* mem = first->in(MemNode::Memory);
1641 Node* adr = low_adr->in(MemNode::Address);
1642 const TypePtr* atyp = n->adr_type();
1643 vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
1644 vlen_in_bytes = vn->as_StoreVector()->memory_size();
2121 }
2122 }
2123 }
2124 }
2125 }
2126 #ifndef PRODUCT
2127 if (TraceSuperWord && Verbose) {
2128 for (int i = 0; i < _block.length(); i++) {
2129 Node* n = _block.at(i);
2130 velt_type(n)->dump();
2131 tty->print("\t");
2132 n->dump();
2133 }
2134 }
2135 #endif
2136 }
2137
2138 //------------------------------memory_alignment---------------------------
2139 // Alignment within a vector memory reference
2140 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2141 SWPointer p(s, this);
2142 if (!p.valid()) {
2143 return bottom_align;
2144 }
2145 int vw = vector_width_in_bytes(s);
2146 if (vw < 2) {
2147 return bottom_align; // No vectors for this type
2148 }
2149 int offset = p.offset_in_bytes();
2150 offset += iv_adjust*p.memory_size();
2151 int off_rem = offset % vw;
2152 int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2153 return off_mod;
2154 }
2155
2156 //---------------------------container_type---------------------------
2157 // Smallest type containing range of values
2158 const Type* SuperWord::container_type(Node* n) {
2159 if (n->is_Mem()) {
2160 BasicType bt = n->as_Mem()->memory_type();
2161 if (n->is_Store() && (bt == T_CHAR)) {
2285 // to align_to_ref will be a position zero in the vector.
2286 // (iv + k) mod vector_align == 0
2287 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2288 CountedLoopNode *main_head = lp()->as_CountedLoop();
2289 assert(main_head->is_main_loop(), "");
2290 CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2291 assert(pre_end != NULL, "we must have a correct pre-loop");
2292 Node *pre_opaq1 = pre_end->limit();
2293 assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2294 Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2295 Node *lim0 = pre_opaq->in(1);
2296
2297 // Where we put new limit calculations
2298 Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2299
2300 // Ensure the original loop limit is available from the
2301 // pre-loop Opaque1 node.
2302 Node *orig_limit = pre_opaq->original_loop_limit();
2303 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2304
2305 SWPointer align_to_ref_p(align_to_ref, this);
2306 assert(align_to_ref_p.valid(), "sanity");
2307
2308 // Given:
2309 // lim0 == original pre loop limit
2310 // V == v_align (power of 2)
2311 // invar == extra invariant piece of the address expression
2312 // e == offset [ +/- invar ]
2313 //
2314 // When reassociating expressions involving '%' the basic rules are:
2315 // (a - b) % k == 0 => a % k == b % k
2316 // and:
2317 // (a + b) % k == 0 => a % k == (k - b) % k
2318 //
2319 // For stride > 0 && scale > 0,
2320 // Derive the new pre-loop limit "lim" such that the two constraints:
2321 // (1) lim = lim0 + N (where N is some positive integer < V)
2322 // (2) (e + lim) % V == 0
2323 // are true.
2324 //
2325 // Substituting (1) into (2),
2459
2460
2461 //------------------------------init---------------------------
2462 void SuperWord::init() {
2463 _dg.init();
2464 _packset.clear();
2465 _disjoint_ptrs.clear();
2466 _block.clear();
2467 _data_entry.clear();
2468 _mem_slice_head.clear();
2469 _mem_slice_tail.clear();
2470 _iteration_first.clear();
2471 _iteration_last.clear();
2472 _node_info.clear();
2473 _align_to_ref = NULL;
2474 _lpt = NULL;
2475 _lp = NULL;
2476 _bb = NULL;
2477 _iv = NULL;
2478 _race_possible = 0;
2479 _num_work_vecs = 0;
2480 _num_reductions = 0;
2481 }
2482
//------------------------------restart---------------------------
// Clear the intermediate analysis structures (dependence graph, packs,
// block contents, memory-slice lists) so analysis can be re-run.
// Unlike init(), this leaves the loop context (_lpt, _lp, _bb, _iv),
// _align_to_ref and the counters untouched.
void SuperWord::restart() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}
2494
2495 //------------------------------print_packset---------------------------
2496 void SuperWord::print_packset() {
2497 #ifndef PRODUCT
2498 tty->print_cr("packset");
// Debug-only helper: print one statement preceded by its vector alignment.
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
2535
2536 //------------------------------blank---------------------------
2537 char* SuperWord::blank(uint depth) {
2538 static char blanks[101];
2539 assert(depth < 101, "too deep");
2540 for (uint i = 0; i < depth; i++) blanks[i] = ' ';
2541 blanks[depth] = '\0';
2542 return blanks;
2543 }
2544
//==============================SWPointer===========================

//----------------------------SWPointer------------------------
// Decompose the address of memory node 'mem' into the pattern
//   base + k*iv [+/- invariant] + constant offset
// filling in _base/_adr/_scale/_offset/_invar.  If the address does not
// match this pattern, the object is left in a not-valid() state.
SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
  _mem(mem), _slp(slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {

  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (!invariant(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  // Unsafe reference could not be aligned appropriately without runtime checking
  if (base == NULL || base->bottom_type() == Type::TOP) {
    assert(!valid(), "unsafe access");
    return;
  }
  // Peel up to three levels of AddP, folding each level's offset input
  // into _scale/_offset/_invar via scaled_iv_plus_offset().
  for (int i = 0; i < 3; i++) {
    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
    if (base == adr || !adr->is_AddP()) {
      break; // stop looking at addp's
    }
  }
  // Setting _base and _adr makes the pointer valid().
  _base = base;
  _adr = adr;
  assert(valid(), "Usable");
}
2584
// Following is used to create a temporary object during
// the pattern match of an address expression.  It copies the memory
// node and SuperWord context but resets all decomposition fields.
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
2590
2591 //------------------------scaled_iv_plus_offset--------------------
2592 // Match: k*iv + offset
2593 // where: k is a constant that maybe zero, and
2594 // offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
2595 bool SWPointer::scaled_iv_plus_offset(Node* n) {
2596 if (scaled_iv(n)) {
2597 return true;
2598 }
2599 if (offset_plus_k(n)) {
2600 return true;
2601 }
2602 int opc = n->Opcode();
2603 if (opc == Op_AddI) {
2604 if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
2605 return true;
2606 }
2607 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2608 return true;
2609 }
2612 return true;
2613 }
2614 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2615 _scale *= -1;
2616 return true;
2617 }
2618 }
2619 return false;
2620 }
2621
2622 //----------------------------scaled_iv------------------------
2623 // Match: k*iv where k is a constant that's not zero
2624 bool SWPointer::scaled_iv(Node* n) {
2625 if (_scale != 0) {
2626 return false; // already found a scale
2627 }
2628 if (n == iv()) {
2629 _scale = 1;
2630 return true;
2631 }
2632 int opc = n->Opcode();
2633 if (opc == Op_MulI) {
2634 if (n->in(1) == iv() && n->in(2)->is_Con()) {
2635 _scale = n->in(2)->get_int();
2636 return true;
2637 } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
2638 _scale = n->in(1)->get_int();
2639 return true;
2640 }
2641 } else if (opc == Op_LShiftI) {
2642 if (n->in(1) == iv() && n->in(2)->is_Con()) {
2643 _scale = 1 << n->in(2)->get_int();
2644 return true;
2645 }
2646 } else if (opc == Op_ConvI2L) {
2647 if (scaled_iv_plus_offset(n->in(1))) {
2648 return true;
2649 }
2650 } else if (opc == Op_LShiftL) {
2651 if (!has_iv() && _invar == NULL) {
2669 //----------------------------offset_plus_k------------------------
2670 // Match: offset is (k [+/- invariant])
2671 // where k maybe zero and invariant is optional, but not both.
2672 bool SWPointer::offset_plus_k(Node* n, bool negate) {
2673 int opc = n->Opcode();
2674 if (opc == Op_ConI) {
2675 _offset += negate ? -(n->get_int()) : n->get_int();
2676 return true;
2677 } else if (opc == Op_ConL) {
2678 // Okay if value fits into an int
2679 const TypeLong* t = n->find_long_type();
2680 if (t->higher_equal(TypeLong::INT)) {
2681 jlong loff = n->get_long();
2682 jint off = (jint)loff;
2683 _offset += negate ? -off : loff;
2684 return true;
2685 }
2686 return false;
2687 }
2688 if (_invar != NULL) return false; // already have an invariant
2689 if (opc == Op_AddI) {
2690 if (n->in(2)->is_Con() && invariant(n->in(1))) {
2691 _negate_invar = negate;
2692 _invar = n->in(1);
2693 _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2694 return true;
2695 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2696 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2697 _negate_invar = negate;
2698 _invar = n->in(2);
2699 return true;
2700 }
2701 }
2702 if (opc == Op_SubI) {
2703 if (n->in(2)->is_Con() && invariant(n->in(1))) {
2704 _negate_invar = negate;
2705 _invar = n->in(1);
2706 _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2707 return true;
2708 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
|
51 _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
52 _block(arena(), 8, 0, NULL), // nodes in current block
53 _data_entry(arena(), 8, 0, NULL), // nodes with all inputs from outside
54 _mem_slice_head(arena(), 8, 0, NULL), // memory slice heads
55 _mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails
56 _node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
57 _clone_map(phase->C->clone_map()), // map of nodes created in cloning
58 _align_to_ref(NULL), // memory reference to align vectors to
59 _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial), // runtime disambiguated pointer pairs
60 _dg(_arena), // dependence graph
61 _visited(arena()), // visited node set
62 _post_visited(arena()), // post visited node set
63 _n_idx_list(arena(), 8), // scratch list of (node,index) pairs
64 _stk(arena(), 8, 0, NULL), // scratch stack of nodes
65 _nlist(arena(), 8, 0, NULL), // scratch list of nodes
66 _lpt(NULL), // loop tree node
67 _lp(NULL), // LoopNode
68 _bb(NULL), // basic block
69 _iv(NULL), // induction var
70 _race_possible(false), // cases where SDMU is true
71 _early_return(true), // analysis evaluations routine
72 _num_work_vecs(0), // amount of vector work we have
73 _num_reductions(0), // amount of reduction work we have
74 _do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
75 _ii_first(-1), // first loop generation index - only if do_vector_loop()
76 _ii_last(-1), // last loop generation index - only if do_vector_loop()
77 _ii_order(arena(), 8, 0, 0),
78 _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
79 {}
80
//------------------------------transform_loop---------------------------
// Attempt SLP vectorization of a counted loop.  Runs a series of cheap
// legality checks, bailing out (returning) as soon as one fails; if they
// all pass, per-loop state is (re)initialized.  The actual superword
// transformation (SLP_extract) is only run when do_optimization is true.
void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  assert(UseSuperWord, "should be");
  // Do vectors exist on this architecture?
  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop

  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl_exit->in(0) != lpt->_head) return;

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
  if (pre_end == NULL) return;
  Node *pre_opaq1 = pre_end->limit();
  if (pre_opaq1->Opcode() != Op_Opaque1) return;

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  if (do_optimization) {
    assert(_packset.length() == 0, "packset must be empty");
    SLP_extract();
  }
}
122
//------------------------------early unrolling analysis------------------------------
// Early analysis, run before full SLP extraction, that decides whether the
// loop body looks vectorizable and, if so, picks the unroll factor that
// matches the widest commonly-supported vector size.  On success the loop
// is marked as having passed SLP analysis and its max unroll is recorded
// via cl->set_slp_max_unroll().  Nodes that do not need to be vectorized
// themselves (iv updates, address arithmetic, control, etc.) are recorded
// in ignored_loop_nodes and excluded from the vector-size computation.
void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
  bool is_slp = true;
  ResourceMark rm;
  size_t ignored_size = lpt()->_body.size();
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
  Node *cl_exit = cl->loopexit();

  // First clear the entries
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

  // Start from the widest vector the machine supports for T_INT and
  // narrow it below as other types are encountered.
  int max_vector = Matcher::max_vector_size(T_INT);

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node* n = lpt()->_body.at(i);
    // Loop-control and address-arithmetic nodes are not vectorized; ignore them.
    if (n == cl->incr() ||
      n->is_reduction() ||
      n->is_AddP() ||
      n->is_Cmp() ||
      n->is_IfTrue() ||
      n->is_CountedLoop() ||
      (n == cl_exit)) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    // A profiled If that is a loop exit can also be ignored.
    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
        if (lpt()->is_loop_exit(iff)) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
      }
    }

    // A memory Phi whose backedge value is not a memory node means the
    // slice is updated by something we cannot vectorize; give up.
    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          is_slp = false;
          break;
        }
      }
    }

    // This must happen after check of phi/if
    if (n->is_Phi() || n->is_If()) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    // Atomics, memory merges and non-CFG projections are not vectorizable.
    if (n->is_LoadStore() || n->is_MergeMem() ||
      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
      is_slp = false;
      break;
    }

    if (n->is_Mem()) {
      Node* adr = n->in(MemNode::Address);
      Node* n_ctrl = _phase->get_ctrl(adr);

      // save a queue of post process nodes
      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
        MemNode* current = n->as_Mem();
        BasicType bt = current->memory_type();
        if (is_java_primitive(bt) == false) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }

        // Process the memory expression
        int stack_idx = 0;
        bool have_side_effects = true;
        if (adr->is_AddP() == false) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
          SWPointer p1(current, this, &nstack, true);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack: every node that is part of the
        // address expression is marked ignored in the loop body.
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
          for (uint j = 0; j < lpt()->_body.size(); j++) {
            Node* cur_node = lpt()->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
            }
          }
          nstack.pop();
          have_side_effects = nstack.is_nonempty();
        }
      }
    }
  }

  if (is_slp) {
    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    for (uint i = 0; i < lpt()->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
      Node* n = lpt()->_body.at(i);
      if (n->is_Store()) {
        bt = n->as_Mem()->memory_type();
      }
      else {
        bt = n->bottom_type()->basic_type();
      }

      int cur_max_vector = Matcher::max_vector_size(bt);

      // If a max vector exists which is not larger than _local_loop_unroll_factor
      // stop looking, we already have the max vector to map to.
      if (cur_max_vector <= local_loop_unroll_factor) {
        is_slp = false;
#ifndef PRODUCT
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
        }
#endif
        break;
      }

      // Map the maximal common vector
      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
        if (cur_max_vector < max_vector) {
          max_vector = cur_max_vector;
        }
      }
    }
    if (is_slp) {
      local_loop_unroll_factor = max_vector;
    }
    cl->mark_passed_slp();
    cl->set_slp_max_unroll(local_loop_unroll_factor);
  }
}
270
271 //------------------------------SLP_extract---------------------------
272 // Extract the superword level parallelism
273 //
274 // 1) A reverse post-order of nodes in the block is constructed. By scanning
275 // this list from first to last, all definitions are visited before their uses.
276 //
277 // 2) A point-to-point dependence graph is constructed between memory references.
// This simplifies the upcoming "independence" checker.
279 //
280 // 3) The maximum depth in the node graph from the beginning of the block
281 // to each node is computed. This is used to prune the graph search
282 // in the independence checker.
283 //
284 // 4) For integer types, the necessary bit width is propagated backwards
285 // from stores to allow packed operations on byte, char, and short
286 // integers. This reverses the promotion to type "int" that javac
287 // did for operations like: char c1,c2,c3; c1 = c2 + c3.
288 //
402
403 Node_List align_to_refs;
404 int best_iv_adjustment = 0;
405 MemNode* best_align_to_mem_ref = NULL;
406
407 while (memops.size() != 0) {
408 // Find a memory reference to align to.
409 MemNode* mem_ref = find_align_to_ref(memops);
410 if (mem_ref == NULL) break;
411 align_to_refs.push(mem_ref);
412 int iv_adjustment = get_iv_adjustment(mem_ref);
413
414 if (best_align_to_mem_ref == NULL) {
415 // Set memory reference which is the best from all memory operations
416 // to be used for alignment. The pre-loop trip count is modified to align
417 // this reference to a vector-aligned address.
418 best_align_to_mem_ref = mem_ref;
419 best_iv_adjustment = iv_adjustment;
420 }
421
422 SWPointer align_to_ref_p(mem_ref, this, NULL, false);
423 // Set alignment relative to "align_to_ref" for all related memory operations.
424 for (int i = memops.size() - 1; i >= 0; i--) {
425 MemNode* s = memops.at(i)->as_Mem();
426 if (isomorphic(s, mem_ref)) {
427 SWPointer p2(s, this, NULL, false);
428 if (p2.comparable(align_to_ref_p)) {
429 int align = memory_alignment(s, iv_adjustment);
430 set_alignment(s, align);
431 }
432 }
433 }
434
435 // Create initial pack pairs of memory operations for which
436 // alignment is set and vectors will be aligned.
437 bool create_pack = true;
438 if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
439 if (!Matcher::misaligned_vectors_ok()) {
440 int vw = vector_width(mem_ref);
441 int vw_best = vector_width(best_align_to_mem_ref);
442 if (vw > vw_best) {
443 // Do not vectorize a memory access with more elements per vector
444 // if unaligned memory access is not allowed because number of
445 // iterations in pre-loop will be not enough to align it.
446 create_pack = false;
447 } else {
448 SWPointer p2(best_align_to_mem_ref, this, NULL, false);
449 if (align_to_ref_p.invar() != p2.invar()) {
450 // Do not vectorize memory accesses with different invariants
451 // if unaligned memory accesses are not allowed.
452 create_pack = false;
453 }
454 }
455 }
456 } else {
457 if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
458 // Can't allow vectorization of unaligned memory accesses with the
459 // same type since it could be overlapped accesses to the same array.
460 create_pack = false;
461 } else {
462 // Allow independent (different type) unaligned memory operations
463 // if HW supports them.
464 if (!Matcher::misaligned_vectors_ok()) {
465 create_pack = false;
466 } else {
467 // Check if packs of the same memory type but
468 // with a different alignment were created before.
545 set_align_to_ref(best_align_to_mem_ref);
546
547 #ifndef PRODUCT
548 if (TraceSuperWord) {
549 tty->print_cr("\nAfter find_adjacent_refs");
550 print_packset();
551 }
552 #endif
553 }
554
555 //------------------------------find_align_to_ref---------------------------
556 // Find a memory reference to align the loop induction variable to.
557 // Looks first at stores then at loads, looking for a memory reference
558 // with the largest number of references similar to it.
559 MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
560 GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
561
562 // Count number of comparable memory ops
563 for (uint i = 0; i < memops.size(); i++) {
564 MemNode* s1 = memops.at(i)->as_Mem();
565 SWPointer p1(s1, this, NULL, false);
566 // Discard if pre loop can't align this reference
567 if (!ref_is_alignable(p1)) {
568 *cmp_ct.adr_at(i) = 0;
569 continue;
570 }
571 for (uint j = i+1; j < memops.size(); j++) {
572 MemNode* s2 = memops.at(j)->as_Mem();
573 if (isomorphic(s1, s2)) {
574 SWPointer p2(s2, this, NULL, false);
575 if (p1.comparable(p2)) {
576 (*cmp_ct.adr_at(i))++;
577 (*cmp_ct.adr_at(j))++;
578 }
579 }
580 }
581 }
582
583 // Find Store (or Load) with the greatest number of "comparable" references,
584 // biggest vector size, smallest data size and smallest iv offset.
585 int max_ct = 0;
586 int max_vw = 0;
587 int max_idx = -1;
588 int min_size = max_jint;
589 int min_iv_offset = max_jint;
590 for (uint j = 0; j < memops.size(); j++) {
591 MemNode* s = memops.at(j)->as_Mem();
592 if (s->is_Store()) {
593 int vw = vector_width_in_bytes(s);
594 assert(vw > 1, "sanity");
595 SWPointer p(s, this, NULL, false);
596 if (cmp_ct.at(j) > max_ct ||
597 cmp_ct.at(j) == max_ct &&
598 (vw > max_vw ||
599 vw == max_vw &&
600 (data_size(s) < min_size ||
601 data_size(s) == min_size &&
602 (p.offset_in_bytes() < min_iv_offset)))) {
603 max_ct = cmp_ct.at(j);
604 max_vw = vw;
605 max_idx = j;
606 min_size = data_size(s);
607 min_iv_offset = p.offset_in_bytes();
608 }
609 }
610 }
611 // If no stores, look at loads
612 if (max_ct == 0) {
613 for (uint j = 0; j < memops.size(); j++) {
614 MemNode* s = memops.at(j)->as_Mem();
615 if (s->is_Load()) {
616 int vw = vector_width_in_bytes(s);
617 assert(vw > 1, "sanity");
618 SWPointer p(s, this, NULL, false);
619 if (cmp_ct.at(j) > max_ct ||
620 cmp_ct.at(j) == max_ct &&
621 (vw > max_vw ||
622 vw == max_vw &&
623 (data_size(s) < min_size ||
624 data_size(s) == min_size &&
625 (p.offset_in_bytes() < min_iv_offset)))) {
626 max_ct = cmp_ct.at(j);
627 max_vw = vw;
628 max_idx = j;
629 min_size = data_size(s);
630 min_iv_offset = p.offset_in_bytes();
631 }
632 }
633 }
634 }
635
636 #ifdef ASSERT
637 if (TraceSuperWord && Verbose) {
638 tty->print_cr("\nVector memops after find_align_to_ref");
709 } else if (span % vw == 0) {
710 // If span is a multiple of vw, we can simplify formula (1) to:
711 // (init_offset + i * span) % vw == 0
712 // =>
713 // (init_offset % vw) + ((i * span) % vw) == 0
714 // =>
715 // init_offset % vw == 0
716 //
717 // Because we add a multiple of vw to the initial offset, the final
718 // offset is a multiple of vw if and only if init_offset is a multiple.
719 //
720 return (init_offset % vw) == 0;
721 }
722 }
723 return false;
724 }
725
726 //---------------------------get_iv_adjustment---------------------------
727 // Calculate loop's iv adjustment for this memory ops.
728 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
729 SWPointer align_to_ref_p(mem_ref, this, NULL, false);
730 int offset = align_to_ref_p.offset_in_bytes();
731 int scale = align_to_ref_p.scale_in_bytes();
732 int elt_size = align_to_ref_p.memory_size();
733 int vw = vector_width_in_bytes(mem_ref);
734 assert(vw > 1, "sanity");
735 int iv_adjustment;
736 if (scale != 0) {
737 int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
738 // At least one iteration is executed in pre-loop by default. As result
739 // several iterations are needed to align memory operations in main-loop even
740 // if offset is 0.
741 int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
742 assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
743 err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
744 iv_adjustment = iv_adjustment_in_bytes/elt_size;
745 } else {
746 // This memory op is not dependent on iv (scale == 0)
747 iv_adjustment = 0;
748 }
749
783 _nlist.at(j)->dump();
784 }
785 }
786 #endif
787 // Make the slice dependent on the root
788 DepMem* slice = _dg.dep(n);
789 _dg.make_edge(_dg.root(), slice);
790
791 // Create a sink for the slice
792 DepMem* slice_sink = _dg.make_node(NULL);
793 _dg.make_edge(slice_sink, _dg.tail());
794
795 // Now visit each pair of memory ops, creating the edges
796 for (int j = _nlist.length() - 1; j >= 0 ; j--) {
797 Node* s1 = _nlist.at(j);
798
799 // If no dependency yet, use slice
800 if (_dg.dep(s1)->in_cnt() == 0) {
801 _dg.make_edge(slice, s1);
802 }
803 SWPointer p1(s1->as_Mem(), this, NULL, false);
804 bool sink_dependent = true;
805 for (int k = j - 1; k >= 0; k--) {
806 Node* s2 = _nlist.at(k);
807 if (s1->is_Load() && s2->is_Load())
808 continue;
809 SWPointer p2(s2->as_Mem(), this, NULL, false);
810
811 int cmp = p1.cmp(p2);
812 if (SuperWordRTDepCheck &&
813 p1.base() != p2.base() && p1.valid() && p2.valid()) {
814 // Create a runtime check to disambiguate
815 OrderedPair pp(p1.base(), p2.base());
816 _disjoint_ptrs.append_if_missing(pp);
817 } else if (!SWPointer::not_equal(cmp)) {
818 // Possibly same address
819 _dg.make_edge(s1, s2);
820 sink_dependent = false;
821 }
822 }
823 if (sink_dependent) {
824 _dg.make_edge(s1, slice_sink);
825 }
826 }
827 #ifndef PRODUCT
828 if (TraceSuperWord) {
829 tty->print_cr("\nDependence graph for slice: %d", n->_idx);
929 return false;
930 }
931
932 //------------------------------are_adjacent_refs---------------------------
933 // Is s1 immediately before s2 in memory?
934 bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
935 if (!s1->is_Mem() || !s2->is_Mem()) return false;
936 if (!in_bb(s1) || !in_bb(s2)) return false;
937
938 // Do not use superword for non-primitives
939 if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
940 !is_java_primitive(s2->as_Mem()->memory_type())) {
941 return false;
942 }
943
944 // FIXME - co_locate_pack fails on Stores in different mem-slices, so
945 // only pack memops that are in the same alias set until that's fixed.
946 if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
947 _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
948 return false;
949 SWPointer p1(s1->as_Mem(), this, NULL, false);
950 SWPointer p2(s2->as_Mem(), this, NULL, false);
951 if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
952 int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
953 return diff == data_size(s1);
954 }
955
956 //------------------------------isomorphic---------------------------
957 // Are s1 and s2 similar?
958 bool SuperWord::isomorphic(Node* s1, Node* s2) {
959 if (s1->Opcode() != s2->Opcode()) return false;
960 if (s1->req() != s2->req()) return false;
961 if (s1->in(0) != s2->in(0)) return false;
962 if (!same_velt_type(s1, s2)) return false;
963 return true;
964 }
965
966 //------------------------------independent---------------------------
967 // Is there no data path from s1 to s2 or s2 to s1?
968 bool SuperWord::independent(Node* s1, Node* s2) {
969 // assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
970 int d1 = depth(s1);
1749 // Insert extract (unpack) operations for scalar uses
1750 for (int i = 0; i < _packset.length(); i++) {
1751 insert_extracts(_packset.at(i));
1752 }
1753
1754 Compile* C = _phase->C;
1755 uint max_vlen_in_bytes = 0;
1756 for (int i = 0; i < _block.length(); i++) {
1757 Node* n = _block.at(i);
1758 Node_List* p = my_pack(n);
1759 if (p && n == executed_last(p)) {
1760 uint vlen = p->size();
1761 uint vlen_in_bytes = 0;
1762 Node* vn = NULL;
1763 Node* low_adr = p->at(0);
1764 Node* first = executed_first(p);
1765 int opc = n->Opcode();
1766 if (n->is_Load()) {
1767 Node* ctl = n->in(MemNode::Control);
1768 Node* mem = first->in(MemNode::Memory);
1769 SWPointer p1(n->as_Mem(), this, NULL, false);
1770 // Identify the memory dependency for the new loadVector node by
1771 // walking up through memory chain.
1772 // This is done to give flexibility to the new loadVector node so that
1773 // it can move above independent storeVector nodes.
1774 while (mem->is_StoreVector()) {
1775 SWPointer p2(mem->as_Mem(), this, NULL, false);
1776 int cmp = p1.cmp(p2);
1777 if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
1778 mem = mem->in(MemNode::Memory);
1779 } else {
1780 break; // dependent memory
1781 }
1782 }
1783 Node* adr = low_adr->in(MemNode::Address);
1784 const TypePtr* atyp = n->adr_type();
1785 vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
1786 vlen_in_bytes = vn->as_LoadVector()->memory_size();
1787 } else if (n->is_Store()) {
1788 // Promote value to be stored to vector
1789 Node* val = vector_opd(p, MemNode::ValueIn);
1790 Node* ctl = n->in(MemNode::Control);
1791 Node* mem = first->in(MemNode::Memory);
1792 Node* adr = low_adr->in(MemNode::Address);
1793 const TypePtr* atyp = n->adr_type();
1794 vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
1795 vlen_in_bytes = vn->as_StoreVector()->memory_size();
2272 }
2273 }
2274 }
2275 }
2276 }
2277 #ifndef PRODUCT
2278 if (TraceSuperWord && Verbose) {
2279 for (int i = 0; i < _block.length(); i++) {
2280 Node* n = _block.at(i);
2281 velt_type(n)->dump();
2282 tty->print("\t");
2283 n->dump();
2284 }
2285 }
2286 #endif
2287 }
2288
2289 //------------------------------memory_alignment---------------------------
2290 // Alignment within a vector memory reference
2291 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
2292 SWPointer p(s, this, NULL, false);
2293 if (!p.valid()) {
2294 return bottom_align;
2295 }
2296 int vw = vector_width_in_bytes(s);
2297 if (vw < 2) {
2298 return bottom_align; // No vectors for this type
2299 }
2300 int offset = p.offset_in_bytes();
2301 offset += iv_adjust*p.memory_size();
2302 int off_rem = offset % vw;
2303 int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
2304 return off_mod;
2305 }
2306
2307 //---------------------------container_type---------------------------
2308 // Smallest type containing range of values
2309 const Type* SuperWord::container_type(Node* n) {
2310 if (n->is_Mem()) {
2311 BasicType bt = n->as_Mem()->memory_type();
2312 if (n->is_Store() && (bt == T_CHAR)) {
2436 // to align_to_ref will be a position zero in the vector.
2437 // (iv + k) mod vector_align == 0
2438 void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
2439 CountedLoopNode *main_head = lp()->as_CountedLoop();
2440 assert(main_head->is_main_loop(), "");
2441 CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
2442 assert(pre_end != NULL, "we must have a correct pre-loop");
2443 Node *pre_opaq1 = pre_end->limit();
2444 assert(pre_opaq1->Opcode() == Op_Opaque1, "");
2445 Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2446 Node *lim0 = pre_opaq->in(1);
2447
2448 // Where we put new limit calculations
2449 Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2450
2451 // Ensure the original loop limit is available from the
2452 // pre-loop Opaque1 node.
2453 Node *orig_limit = pre_opaq->original_loop_limit();
2454 assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
2455
2456 SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
2457 assert(align_to_ref_p.valid(), "sanity");
2458
2459 // Given:
2460 // lim0 == original pre loop limit
2461 // V == v_align (power of 2)
2462 // invar == extra invariant piece of the address expression
2463 // e == offset [ +/- invar ]
2464 //
2465 // When reassociating expressions involving '%' the basic rules are:
2466 // (a - b) % k == 0 => a % k == b % k
2467 // and:
2468 // (a + b) % k == 0 => a % k == (k - b) % k
2469 //
2470 // For stride > 0 && scale > 0,
2471 // Derive the new pre-loop limit "lim" such that the two constraints:
2472 // (1) lim = lim0 + N (where N is some positive integer < V)
2473 // (2) (e + lim) % V == 0
2474 // are true.
2475 //
2476 // Substituting (1) into (2),
2610
2611
2612 //------------------------------init---------------------------
2613 void SuperWord::init() {
2614 _dg.init();
2615 _packset.clear();
2616 _disjoint_ptrs.clear();
2617 _block.clear();
2618 _data_entry.clear();
2619 _mem_slice_head.clear();
2620 _mem_slice_tail.clear();
2621 _iteration_first.clear();
2622 _iteration_last.clear();
2623 _node_info.clear();
2624 _align_to_ref = NULL;
2625 _lpt = NULL;
2626 _lp = NULL;
2627 _bb = NULL;
2628 _iv = NULL;
2629 _race_possible = 0;
2630 _early_return = false;
2631 _num_work_vecs = 0;
2632 _num_reductions = 0;
2633 }
2634
//------------------------------restart---------------------------
// Clear the analysis structures (dependence graph, pack set, disjoint
// pointer pairs, block/slice lists, per-node info) so the analysis can
// run again. Unlike init(), the loop context fields (_lpt, _lp, _bb,
// _iv) and counters are left intact.
void SuperWord::restart() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}
2646
2647 //------------------------------print_packset---------------------------
2648 void SuperWord::print_packset() {
2649 #ifndef PRODUCT
2650 tty->print_cr("packset");
// Debug helper: print one statement preceded by its computed alignment.
// Compiled out (no-op) in PRODUCT builds.
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}
2687
2688 //------------------------------blank---------------------------
2689 char* SuperWord::blank(uint depth) {
2690 static char blanks[101];
2691 assert(depth < 101, "too deep");
2692 for (uint i = 0; i < depth; i++) blanks[i] = ' ';
2693 blanks[depth] = '\0';
2694 return blanks;
2695 }
2696
2697
//==============================SWPointer===========================

//----------------------------SWPointer------------------------
// Decompose the address of 'mem' into the form
//   base + invar + scale * iv + offset
// by pattern-matching the AddP chain. On every bail-out path _base and
// _adr remain NULL (each bail-out asserts !valid()); on success they are
// filled in and valid() is asserted. When 'analyze_only' is true, the
// helper matchers (scaled_iv / offset_plus_k) push non-invariant
// sub-expressions onto 'nstack' instead of only classifying them.
SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
  _mem(mem), _slp(slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _nstack(nstack), _analyze_only(analyze_only),
  _stack_idx(0) {

  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (!invariant(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  //unsafe reference could not be aligned appropriately without runtime checking
  if (base == NULL || base->bottom_type() == Type::TOP) {
    assert(!valid(), "unsafe access");
    return;
  }
  // Walk at most 3 levels of AddP, folding each Offset input into the
  // (scale, offset, invar) decomposition, until we reach the base or a
  // non-AddP address.
  for (int i = 0; i < 3; i++) {
    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
    if (base == adr || !adr->is_AddP()) {
      break; // stop looking at addp's
    }
  }
  // Success: record the base and the innermost address node.
  _base = base;
  _adr = adr;
  assert(valid(), "Usable");
}
2738
// Following is used to create a temporary object during
// the pattern match of an address expression.
// Copies the memory node and matching context from 'p' (including the
// current _stack_idx) but resets the decomposition fields (_base, _adr,
// _scale, _offset, _invar), so the temporary can run its own match
// without disturbing 'p'.
SWPointer::SWPointer(SWPointer* p) :
  _mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
  _nstack(p->_nstack), _analyze_only(p->_analyze_only),
  _stack_idx(p->_stack_idx) {}
2746
2747 //------------------------scaled_iv_plus_offset--------------------
2748 // Match: k*iv + offset
2749 // where: k is a constant that maybe zero, and
2750 // offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
2751 bool SWPointer::scaled_iv_plus_offset(Node* n) {
2752 if (scaled_iv(n)) {
2753 return true;
2754 }
2755 if (offset_plus_k(n)) {
2756 return true;
2757 }
2758 int opc = n->Opcode();
2759 if (opc == Op_AddI) {
2760 if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) {
2761 return true;
2762 }
2763 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2764 return true;
2765 }
2768 return true;
2769 }
2770 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) {
2771 _scale *= -1;
2772 return true;
2773 }
2774 }
2775 return false;
2776 }
2777
2778 //----------------------------scaled_iv------------------------
2779 // Match: k*iv where k is a constant that's not zero
2780 bool SWPointer::scaled_iv(Node* n) {
2781 if (_scale != 0) {
2782 return false; // already found a scale
2783 }
2784 if (n == iv()) {
2785 _scale = 1;
2786 return true;
2787 }
2788 if (_analyze_only && (invariant(n) == false)) {
2789 _nstack->push(n, _stack_idx++);
2790 }
2791 int opc = n->Opcode();
2792 if (opc == Op_MulI) {
2793 if (n->in(1) == iv() && n->in(2)->is_Con()) {
2794 _scale = n->in(2)->get_int();
2795 return true;
2796 } else if (n->in(2) == iv() && n->in(1)->is_Con()) {
2797 _scale = n->in(1)->get_int();
2798 return true;
2799 }
2800 } else if (opc == Op_LShiftI) {
2801 if (n->in(1) == iv() && n->in(2)->is_Con()) {
2802 _scale = 1 << n->in(2)->get_int();
2803 return true;
2804 }
2805 } else if (opc == Op_ConvI2L) {
2806 if (scaled_iv_plus_offset(n->in(1))) {
2807 return true;
2808 }
2809 } else if (opc == Op_LShiftL) {
2810 if (!has_iv() && _invar == NULL) {
2828 //----------------------------offset_plus_k------------------------
2829 // Match: offset is (k [+/- invariant])
2830 // where k maybe zero and invariant is optional, but not both.
2831 bool SWPointer::offset_plus_k(Node* n, bool negate) {
2832 int opc = n->Opcode();
2833 if (opc == Op_ConI) {
2834 _offset += negate ? -(n->get_int()) : n->get_int();
2835 return true;
2836 } else if (opc == Op_ConL) {
2837 // Okay if value fits into an int
2838 const TypeLong* t = n->find_long_type();
2839 if (t->higher_equal(TypeLong::INT)) {
2840 jlong loff = n->get_long();
2841 jint off = (jint)loff;
2842 _offset += negate ? -off : loff;
2843 return true;
2844 }
2845 return false;
2846 }
2847 if (_invar != NULL) return false; // already have an invariant
2848 if (_analyze_only && (invariant(n) == false)) {
2849 _nstack->push(n, _stack_idx++);
2850 }
2851 if (opc == Op_AddI) {
2852 if (n->in(2)->is_Con() && invariant(n->in(1))) {
2853 _negate_invar = negate;
2854 _invar = n->in(1);
2855 _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2856 return true;
2857 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
2858 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int();
2859 _negate_invar = negate;
2860 _invar = n->in(2);
2861 return true;
2862 }
2863 }
2864 if (opc == Op_SubI) {
2865 if (n->in(2)->is_Con() && invariant(n->in(1))) {
2866 _negate_invar = negate;
2867 _invar = n->in(1);
2868 _offset += !negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();
2869 return true;
2870 } else if (n->in(1)->is_Con() && invariant(n->in(2))) {
|