1 /* 2 * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 */

#include "precompiled.hpp"
#include "compiler/compileLog.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "opto/addnode.hpp"
#include "opto/callnode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/divnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/mulnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/opaquenode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"

//
//                  S U P E R W O R D   T R A N S F O R M
//=============================================================================

//------------------------------SuperWord---------------------------
// Constructor: wires the pass to the loop optimizer's IGVN and compilation
// arena, and creates all (initially empty) per-pass data structures.
// The actual work is driven later through transform_loop().
SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _phase(phase),
  _igvn(phase->_igvn),
  _arena(phase->C->comp_arena()),
  _packset(arena(), 8, 0, NULL),                         // packs for the current block
  _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  _block(arena(), 8, 0, NULL),                           // nodes in current block
  _data_entry(arena(), 8, 0, NULL),                      // nodes with all inputs from outside
  _mem_slice_head(arena(), 8, 0, NULL),                  // memory slice heads
  _mem_slice_tail(arena(), 8, 0, NULL),                  // memory slice tails
  _node_info(arena(), 8, 0, SWNodeInfo::initial),        // info needed per node
  _clone_map(phase->C->clone_map()),                     // map of nodes created in cloning
  _align_to_ref(NULL),                                   // memory reference to align vectors to
  _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial),   // runtime disambiguated pointer pairs
  _dg(_arena),                                           // dependence graph
  _visited(arena()),                                     // visited node set
  _post_visited(arena()),                                // post visited node set
  _n_idx_list(arena(), 8),                               // scratch list of (node,index) pairs
  _stk(arena(), 8, 0, NULL),                             // scratch stack of nodes
  _nlist(arena(), 8, 0, NULL),                           // scratch list of nodes
  _lpt(NULL),                                            // loop tree node
  _lp(NULL),                                             // LoopNode
  _bb(NULL),                                             // basic block
  _iv(NULL),                                             // induction var
  _race_possible(false),                                 // cases where SDMU is true
  _num_work_vecs(0),                                     // amount of vector work we have
  _num_reductions(0),                                    // amount of reduction work we have
  _do_vector_loop(phase->C->do_vector_loop()),           // whether to do vectorization/simd style
  _ii_first(-1),                                         // first loop generation index - only if do_vector_loop()
  _ii_last(-1),                                          // last loop generation index - only if do_vector_loop()
  _ii_order(arena(), 8, 0, 0)
{
#ifndef PRODUCT
  // Pick up per-method "VectorizeDebug" option to control trace verbosity.
  if (_phase->C->method() != NULL) {
    _phase->C->method()->has_option_value("VectorizeDebug", _vector_loop_debug);
  }
#endif
}

//------------------------------transform_loop---------------------------
// Entry point of the pass for one loop: bail out early on loops that
// cannot be vectorized (no vector hardware, malformed/non-main counted
// loop, extra control flow, missing pre-loop Opaque1 limit), then set up
// state and run SLP_extract() on the loop body.
void SuperWord::transform_loop(IdealLoopTree* lpt) {
  assert(UseSuperWord, "should be");
  // Do vectors exist on this architecture?
  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop

  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
#ifndef PRODUCT
  if (_do_vector_loop && is_debug()) {
    // Dump the loop body in reverse post-order for debugging.
    tty->print_cr("SuperWord::transform_loop: lpt->_head->_idx %d", lpt->_head->_idx);
    Node_Stack stack(_arena, _phase->C->unique() >> 2);
    Node_List rpo_list;
    VectorSet visited(_arena);
    visited.set(lpt->_head->_idx);
    _phase->rpo(lpt->_head, stack, visited, rpo_list);
    _phase->dump(lpt, rpo_list.size(), rpo_list );
    if(is_trace_loop()) {
      tty->print_cr("\nSuperWord::transform_loop: whole loop tree");
      _phase->dump();
      tty->print_cr("SuperWord::transform_loop: end of whole loop tree\n");
    }
  }
#endif
  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl_exit->in(0) != lpt->_head) {
#ifndef PRODUCT
    if (TraceSuperWord) {
      tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
      tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
      tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
      tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
      lpt->dump_head();
    }
#endif
    return;
  }

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
  // The Opaque1 is needed so the pre-loop trip count can be adjusted later
  // for vector alignment.
  CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
  if (pre_end == NULL) return;
  Node *pre_opaq1 = pre_end->limit();
  if (pre_opaq1->Opcode() != Op_Opaque1) return;

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  assert(_packset.length() == 0, "packset must be empty");
  SLP_extract();
}

//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
// 1) A reverse post-order of nodes in the block is constructed.  By scanning
//    this list from first to last, all definitions are visited before their uses.
//
// 2) A point-to-point dependence graph is constructed between memory references.
//    This simplifies the upcoming "independence" checker.
//
// 3) The maximum depth in the node graph from the beginning of the block
//    to each node is computed.  This is used to prune the graph search
//    in the independence checker.
//
// 4) For integer types, the necessary bit width is propagated backwards
//    from stores to allow packed operations on byte, char, and short
//    integers.  This reverses the promotion to type "int" that javac
//    did for operations like: char c1,c2,c3; c1 = c2 + c3.
168 // 169 // 5) One of the memory references is picked to be an aligned vector reference. 170 // The pre-loop trip count is adjusted to align this reference in the 171 // unrolled body. 172 // 173 // 6) The initial set of pack pairs is seeded with memory references. 174 // 175 // 7) The set of pack pairs is extended by following use->def and def->use links. 176 // 177 // 8) The pairs are combined into vector sized packs. 178 // 179 // 9) Reorder the memory slices to co-locate members of the memory packs. 180 // 181 // 10) Generate ideal vector nodes for the final set of packs and where necessary, 182 // inserting scalar promotion, vector creation from multiple scalars, and 183 // extraction of scalar values from vectors. 184 // 185 void SuperWord::SLP_extract() { 186 187 #ifndef PRODUCT 188 if (_do_vector_loop && TraceSuperWord) { 189 tty->print("SuperWord::SLP_extract\n"); 190 tty->print("input loop\n"); 191 _lpt->dump_head(); 192 _lpt->dump(); 193 for (uint i = 0; i < _lpt->_body.size(); i++) { 194 _lpt->_body.at(i)->dump(); 195 } 196 } 197 #endif 198 // Ready the block 199 if (!construct_bb()) { 200 return; // Exit if no interesting nodes or complex graph. 201 } 202 // build _dg, _disjoint_ptrs 203 dependence_graph(); 204 205 // compute function depth(Node*) 206 compute_max_depth(); 207 208 if (_do_vector_loop) { 209 if (mark_generations() != -1) { 210 hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly 211 212 if (!construct_bb()) { 213 return; // Exit if no interesting nodes or complex graph. 
214 } 215 dependence_graph(); 216 compute_max_depth(); 217 } 218 219 #ifndef PRODUCT 220 if (TraceSuperWord) { 221 tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph"); 222 _lpt->dump_head(); 223 for (int j = 0; j < _block.length(); j++) { 224 Node* n = _block.at(j); 225 int d = depth(n); 226 for (int i = 0; i < d; i++) tty->print("%s", " "); 227 tty->print("%d :", d); 228 n->dump(); 229 } 230 } 231 #endif 232 } 233 234 compute_vector_element_type(); 235 236 // Attempt vectorization 237 238 find_adjacent_refs(); 239 240 extend_packlist(); 241 242 if (_do_vector_loop) { 243 if (_packset.length() == 0) { 244 #ifndef PRODUCT 245 if (TraceSuperWord) { 246 tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway"); 247 } 248 #endif 249 pack_parallel(); 250 } 251 } 252 253 combine_packs(); 254 255 construct_my_pack_map(); 256 257 filter_packs(); 258 259 schedule(); 260 261 output(); 262 } 263 264 //------------------------------find_adjacent_refs--------------------------- 265 // Find the adjacent memory references and create pack pairs for them. 266 // This is the initial set of packs that will then be extended by 267 // following use->def and def->use links. The align positions are 268 // assigned relative to the reference "align_to_ref" 269 void SuperWord::find_adjacent_refs() { 270 // Get list of memory operations 271 Node_List memops; 272 for (int i = 0; i < _block.length(); i++) { 273 Node* n = _block.at(i); 274 if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && 275 is_java_primitive(n->as_Mem()->memory_type())) { 276 int align = memory_alignment(n->as_Mem(), 0); 277 if (align != bottom_align) { 278 memops.push(n); 279 } 280 } 281 } 282 283 Node_List align_to_refs; 284 int best_iv_adjustment = 0; 285 MemNode* best_align_to_mem_ref = NULL; 286 287 while (memops.size() != 0) { 288 // Find a memory reference to align to. 
289 MemNode* mem_ref = find_align_to_ref(memops); 290 if (mem_ref == NULL) break; 291 align_to_refs.push(mem_ref); 292 int iv_adjustment = get_iv_adjustment(mem_ref); 293 294 if (best_align_to_mem_ref == NULL) { 295 // Set memory reference which is the best from all memory operations 296 // to be used for alignment. The pre-loop trip count is modified to align 297 // this reference to a vector-aligned address. 298 best_align_to_mem_ref = mem_ref; 299 best_iv_adjustment = iv_adjustment; 300 #ifndef PRODUCT 301 if(is_trace_adjacent()) { 302 tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d", 303 best_align_to_mem_ref->_idx, best_iv_adjustment); 304 best_align_to_mem_ref->dump(); 305 } 306 #endif 307 } 308 309 SWPointer align_to_ref_p(mem_ref, this); 310 // Set alignment relative to "align_to_ref" for all related memory operations. 311 for (int i = memops.size() - 1; i >= 0; i--) { 312 MemNode* s = memops.at(i)->as_Mem(); 313 if (isomorphic(s, mem_ref) && 314 (!_do_vector_loop || same_origin_idx(s, mem_ref))) { 315 SWPointer p2(s, this); 316 if (p2.comparable(align_to_ref_p)) { 317 int align = memory_alignment(s, iv_adjustment); 318 set_alignment(s, align); 319 } 320 } 321 } 322 323 // Create initial pack pairs of memory operations for which 324 // alignment is set and vectors will be aligned. 325 bool create_pack = true; 326 if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) { 327 if (!Matcher::misaligned_vectors_ok()) { 328 int vw = vector_width(mem_ref); 329 int vw_best = vector_width(best_align_to_mem_ref); 330 if (vw > vw_best) { 331 // Do not vectorize a memory access with more elements per vector 332 // if unaligned memory access is not allowed because number of 333 // iterations in pre-loop will be not enough to align it. 
334 create_pack = false; 335 } else { 336 SWPointer p2(best_align_to_mem_ref, this); 337 if (align_to_ref_p.invar() != p2.invar()) { 338 // Do not vectorize memory accesses with different invariants 339 // if unaligned memory accesses are not allowed. 340 create_pack = false; 341 } 342 } 343 } 344 } else { 345 if (same_velt_type(mem_ref, best_align_to_mem_ref)) { 346 // Can't allow vectorization of unaligned memory accesses with the 347 // same type since it could be overlapped accesses to the same array. 348 create_pack = false; 349 } else { 350 // Allow independent (different type) unaligned memory operations 351 // if HW supports them. 352 if (!Matcher::misaligned_vectors_ok()) { 353 create_pack = false; 354 } else { 355 // Check if packs of the same memory type but 356 // with a different alignment were created before. 357 for (uint i = 0; i < align_to_refs.size(); i++) { 358 MemNode* mr = align_to_refs.at(i)->as_Mem(); 359 if (same_velt_type(mr, mem_ref) && 360 memory_alignment(mr, iv_adjustment) != 0) 361 create_pack = false; 362 } 363 } 364 } 365 } 366 if (create_pack) { 367 for (uint i = 0; i < memops.size(); i++) { 368 Node* s1 = memops.at(i); 369 int align = alignment(s1); 370 if (align == top_align) continue; 371 for (uint j = 0; j < memops.size(); j++) { 372 Node* s2 = memops.at(j); 373 if (alignment(s2) == top_align) continue; 374 if (s1 != s2 && are_adjacent_refs(s1, s2)) { 375 if (stmts_can_pack(s1, s2, align)) { 376 Node_List* pair = new Node_List(); 377 pair->push(s1); 378 pair->push(s2); 379 if (!_do_vector_loop || same_origin_idx(s1, s2)) { 380 _packset.append(pair); 381 } 382 } 383 } 384 } 385 } 386 } else { // Don't create unaligned pack 387 // First, remove remaining memory ops of the same type from the list. 388 for (int i = memops.size() - 1; i >= 0; i--) { 389 MemNode* s = memops.at(i)->as_Mem(); 390 if (same_velt_type(s, mem_ref)) { 391 memops.remove(i); 392 } 393 } 394 395 // Second, remove already constructed packs of the same type. 
396 for (int i = _packset.length() - 1; i >= 0; i--) { 397 Node_List* p = _packset.at(i); 398 MemNode* s = p->at(0)->as_Mem(); 399 if (same_velt_type(s, mem_ref)) { 400 remove_pack_at(i); 401 } 402 } 403 404 // If needed find the best memory reference for loop alignment again. 405 if (same_velt_type(mem_ref, best_align_to_mem_ref)) { 406 // Put memory ops from remaining packs back on memops list for 407 // the best alignment search. 408 uint orig_msize = memops.size(); 409 for (int i = 0; i < _packset.length(); i++) { 410 Node_List* p = _packset.at(i); 411 MemNode* s = p->at(0)->as_Mem(); 412 assert(!same_velt_type(s, mem_ref), "sanity"); 413 memops.push(s); 414 } 415 MemNode* best_align_to_mem_ref = find_align_to_ref(memops); 416 if (best_align_to_mem_ref == NULL) { 417 #ifndef PRODUCT 418 if (TraceSuperWord) { 419 tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL"); 420 } 421 #endif 422 break; 423 } 424 best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref); 425 #ifndef PRODUCT 426 if(is_trace_adjacent()) { 427 tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d", 428 best_align_to_mem_ref->_idx, best_iv_adjustment); 429 best_align_to_mem_ref->dump(); 430 } 431 #endif 432 // Restore list. 433 while (memops.size() > orig_msize) 434 (void)memops.pop(); 435 } 436 } // unaligned memory accesses 437 438 // Remove used mem nodes. 439 for (int i = memops.size() - 1; i >= 0; i--) { 440 MemNode* m = memops.at(i)->as_Mem(); 441 if (alignment(m) != top_align) { 442 memops.remove(i); 443 } 444 } 445 446 } // while (memops.size() != 0 447 set_align_to_ref(best_align_to_mem_ref); 448 449 #ifndef PRODUCT 450 if (TraceSuperWord) { 451 tty->print_cr("\nAfter find_adjacent_refs"); 452 print_packset(); 453 } 454 #endif 455 } 456 457 //------------------------------find_align_to_ref--------------------------- 458 // Find a memory reference to align the loop induction variable to. 
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
  // cmp_ct[i] counts how many other ops in memops are "comparable" to op i.
  GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);

  // Count number of comparable memory ops
  for (uint i = 0; i < memops.size(); i++) {
    MemNode* s1 = memops.at(i)->as_Mem();
    SWPointer p1(s1, this);
    // Discard if pre loop can't align this reference
    if (!ref_is_alignable(p1)) {
      *cmp_ct.adr_at(i) = 0;
      continue;
    }
    for (uint j = i+1; j < memops.size(); j++) {
      MemNode* s2 = memops.at(j)->as_Mem();
      if (isomorphic(s1, s2)) {
        SWPointer p2(s2, this);
        if (p1.comparable(p2)) {
          // Credit both sides of the comparable pair.
          (*cmp_ct.adr_at(i))++;
          (*cmp_ct.adr_at(j))++;
        }
      }
    }
  }

  // Find Store (or Load) with the greatest number of "comparable" references,
  // biggest vector size, smallest data size and smallest iv offset.
  // NOTE: the lexicographic comparison below deliberately relies on
  // && binding tighter than || — each "tier" is only consulted when the
  // previous tier compares equal.
  int max_ct        = 0;
  int max_vw        = 0;
  int max_idx       = -1;
  int min_size      = max_jint;
  int min_iv_offset = max_jint;
  for (uint j = 0; j < memops.size(); j++) {
    MemNode* s = memops.at(j)->as_Mem();
    if (s->is_Store()) {
      int vw = vector_width_in_bytes(s);
      assert(vw > 1, "sanity");
      SWPointer p(s, this);
      if (cmp_ct.at(j) >  max_ct ||
          cmp_ct.at(j) == max_ct &&
            (vw >  max_vw ||
             vw == max_vw &&
              (data_size(s) <  min_size ||
               data_size(s) == min_size &&
                 (p.offset_in_bytes() < min_iv_offset)))) {
        max_ct = cmp_ct.at(j);
        max_vw = vw;
        max_idx = j;
        min_size = data_size(s);
        min_iv_offset = p.offset_in_bytes();
      }
    }
  }
  // If no stores, look at loads (same lexicographic criteria as above)
  if (max_ct == 0) {
    for (uint j = 0; j < memops.size(); j++) {
      MemNode* s = memops.at(j)->as_Mem();
      if (s->is_Load()) {
        int vw = vector_width_in_bytes(s);
        assert(vw > 1, "sanity");
        SWPointer p(s, this);
        if (cmp_ct.at(j) >  max_ct ||
            cmp_ct.at(j) == max_ct &&
              (vw >  max_vw ||
               vw == max_vw &&
                (data_size(s) <  min_size ||
                 data_size(s) == min_size &&
                   (p.offset_in_bytes() < min_iv_offset)))) {
          max_ct = cmp_ct.at(j);
          max_vw = vw;
          max_idx = j;
          min_size = data_size(s);
          min_iv_offset = p.offset_in_bytes();
        }
      }
    }
  }

#ifdef ASSERT
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\nVector memops after find_align_to_ref");
    for (uint i = 0; i < memops.size(); i++) {
      MemNode* s = memops.at(i)->as_Mem();
      s->dump();
    }
  }
#endif

  // max_ct > 0 means at least one op had a comparable partner; otherwise
  // there is nothing worth aligning to and we return NULL.
  if (max_ct > 0) {
#ifdef ASSERT
    if (TraceSuperWord) {
      tty->print("\nVector align to node: ");
      memops.at(max_idx)->as_Mem()->dump();
    }
#endif
    return memops.at(max_idx)->as_Mem();
  }
  return NULL;
}

//------------------------------ref_is_alignable---------------------------
// Can the preloop align the reference to position zero in the vector?
bool SuperWord::ref_is_alignable(SWPointer& p) {
  if (!p.has_iv()) {
    return true;   // no induction variable
  }
  CountedLoopEndNode* pre_end = get_pre_loop_end(lp()->as_CountedLoop());
  assert(pre_end != NULL, "we must have a correct pre-loop");
  assert(pre_end->stride_is_con(), "pre loop stride is constant");
  int preloop_stride = pre_end->stride_con();

  // span = bytes the reference address moves per pre-loop iteration.
  int span = preloop_stride * p.scale_in_bytes();
  int mem_size = p.memory_size();
  int offset   = p.offset_in_bytes();
  // Stride one accesses are alignable if offset is aligned to memory operation size.
  // Offset can be unaligned when UseUnalignedAccesses is used.
  if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
    return true;
  }
  // If the initial offset from start of the object is computable,
  // check if the pre-loop can align the final offset accordingly.
  //
  // In other words: Can we find an i such that the offset
  // after i pre-loop iterations is aligned to vw?
  //   (init_offset + pre_loop) % vw == 0              (1)
  // where
  //   pre_loop = i * span
  // is the number of bytes added to the offset by i pre-loop iterations.
  //
  // For this to hold we need pre_loop to increase init_offset by
  //   pre_loop = vw - (init_offset % vw)
  //
  // This is only possible if pre_loop is divisible by span because each
  // pre-loop iteration increases the initial offset by 'span' bytes:
  //   (vw - (init_offset % vw)) % span == 0
  //
  int vw = vector_width_in_bytes(p.mem());
  assert(vw > 1, "sanity");
  Node* init_nd = pre_end->init_trip();
  // Only provable when the initial trip value is a constant and there is
  // no loop-invariant term in the address.
  if (init_nd->is_Con() && p.invar() == NULL) {
    int init = init_nd->bottom_type()->is_int()->get_con();
    int init_offset = init * p.scale_in_bytes() + offset;
    assert(init_offset >= 0, "positive offset from object start");
    if (vw % span == 0) {
      // If vw is a multiple of span, we use formula (1).
      if (span > 0) {
        return (vw - (init_offset % vw)) % span == 0;
      } else {
        assert(span < 0, "nonzero stride * scale");
        // Negative span: the offset decreases each iteration, so test the
        // mirrored condition with -span.
        return (init_offset % vw) % -span == 0;
      }
    } else if (span % vw == 0) {
      // If span is a multiple of vw, we can simplify formula (1) to:
      //   (init_offset + i * span) % vw == 0
      //     =>
      //   (init_offset % vw) + ((i * span) % vw) == 0
      //     =>
      //   init_offset % vw == 0
      //
      // Because we add a multiple of vw to the initial offset, the final
      // offset is a multiple of vw if and only if init_offset is a multiple.
      //
      return (init_offset % vw) == 0;
    }
  }
  return false;
}

//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
630 int SuperWord::get_iv_adjustment(MemNode* mem_ref) { 631 SWPointer align_to_ref_p(mem_ref, this); 632 int offset = align_to_ref_p.offset_in_bytes(); 633 int scale = align_to_ref_p.scale_in_bytes(); 634 int elt_size = align_to_ref_p.memory_size(); 635 int vw = vector_width_in_bytes(mem_ref); 636 assert(vw > 1, "sanity"); 637 int iv_adjustment; 638 if (scale != 0) { 639 int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 640 // At least one iteration is executed in pre-loop by default. As result 641 // several iterations are needed to align memory operations in main-loop even 642 // if offset is 0. 643 int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); 644 assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), 645 err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); 646 iv_adjustment = iv_adjustment_in_bytes/elt_size; 647 } else { 648 // This memory op is not dependent on iv (scale == 0) 649 iv_adjustment = 0; 650 } 651 652 #ifndef PRODUCT 653 if (TraceSuperWord) { 654 tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ", 655 mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw); 656 mem_ref->dump(); 657 } 658 #endif 659 return iv_adjustment; 660 } 661 662 //---------------------------dependence_graph--------------------------- 663 // Construct dependency graph. 
// Add dependence edges to load/store nodes for memory dependence
//    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
  // First, assign a dependence node to each memory node
  // (note: && binds tighter than ||, so Phis are included only when they
  // are memory Phis)
  for (int i = 0; i < _block.length(); i++ ) {
    Node *n = _block.at(i);
    if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
      _dg.make_node(n);
    }
  }

  // For each memory slice, create the dependences
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* n      = _mem_slice_head.at(i);
    Node* n_tail = _mem_slice_tail.at(i);

    // Get slice in predecessor order (last is first)
    mem_slice_preds(n_tail, n, _nlist);

#ifndef PRODUCT
    if(TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
      for (int j = _nlist.length() - 1; j >= 0 ; j--) {
        _nlist.at(j)->dump();
      }
    }
#endif
    // Make the slice dependent on the root
    DepMem* slice = _dg.dep(n);
    _dg.make_edge(_dg.root(), slice);

    // Create a sink for the slice
    DepMem* slice_sink = _dg.make_node(NULL);
    _dg.make_edge(slice_sink, _dg.tail());

    // Now visit each pair of memory ops, creating the edges
    for (int j = _nlist.length() - 1; j >= 0 ; j--) {
      Node* s1 = _nlist.at(j);

      // If no dependency yet, use slice
      if (_dg.dep(s1)->in_cnt() == 0) {
        _dg.make_edge(slice, s1);
      }
      SWPointer p1(s1->as_Mem(), this);
      bool sink_dependent = true;
      for (int k = j - 1; k >= 0; k--) {
        Node* s2 = _nlist.at(k);
        // Load-load pairs can never conflict; skip them.
        if (s1->is_Load() && s2->is_Load())
          continue;
        SWPointer p2(s2->as_Mem(), this);

        int cmp = p1.cmp(p2);
        if (SuperWordRTDepCheck &&
            p1.base() != p2.base() && p1.valid() && p2.valid()) {
          // Create a runtime check to disambiguate
          OrderedPair pp(p1.base(), p2.base());
          _disjoint_ptrs.append_if_missing(pp);
        } else if (!SWPointer::not_equal(cmp)) {
          // Possibly same address: record a dependence edge.
          _dg.make_edge(s1, s2);
          sink_dependent = false;
        }
      }
      // Ops with no recorded dependence hang off the slice sink so every
      // node stays connected in the dependence graph.
      if (sink_dependent) {
        _dg.make_edge(s1, slice_sink);
      }
    }
#ifndef PRODUCT
    if (TraceSuperWord) {
      tty->print_cr("\nDependence graph for slice: %d", n->_idx);
      for (int q = 0; q < _nlist.length(); q++) {
        _dg.print(_nlist.at(q));
      }
      tty->cr();
    }
#endif
    // _nlist is a scratch list; reset it for the next slice.
    _nlist.clear();
  }

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
    for (int r = 0; r < _disjoint_ptrs.length(); r++) {
      _disjoint_ptrs.at(r).print();
      tty->cr();
    }
    tty->cr();
  }
#endif
}

//---------------------------mem_slice_preds---------------------------
// Return a memory slice (node list) in predecessor order starting at "start"
void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
  assert(preds.length() == 0, "start empty");
  Node* n = start;
  Node* prev = NULL;
  // Walk the memory chain backwards (via MemNode::Memory inputs) from
  // "start" until "stop", collecting in-block loads hanging off each store.
  while (true) {
#ifndef PRODUCT
    if(is_trace_mem_slice()) {
      tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);
    }
#endif
    assert(in_bb(n), "must be in block");
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* out = n->fast_out(i);
      if (out->is_Load()) {
        if (in_bb(out)) {
          preds.push(out);
#ifndef PRODUCT
          if (TraceSuperWord && Verbose) {
            tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
          }
#endif
        }
      } else {
        // FIXME
        if (out->is_MergeMem() && !in_bb(out)) {
          // Either unrolling is causing a memory edge not to disappear,
          // or need to run igvn.optimize() again before SLP
        } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
          // Ditto.  Not sure what else to check further.
        } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
          // StoreCM has an input edge used as a precedence edge.
          // Maybe an issue when oop stores are vectorized.
        } else {
          // A store slice must be a straight chain: any other out-edge
          // would be a branch off the slice.
          assert(out == prev || prev == NULL, "no branches off of store slice");
        }
      }
    }
    if (n == stop) break;
    preds.push(n);
#ifndef PRODUCT
    if (TraceSuperWord && Verbose) tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
#endif
    prev = n;
    assert(n->is_Mem(), err_msg_res("unexpected node %s", n->Name()));
    n = n->in(MemNode::Memory);
  }
}

//------------------------------stmts_can_pack---------------------------
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and
// s1 aligned at "align"
bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {

  // Do not use superword for non-primitives
  BasicType bt1 = velt_basic_type(s1);
  BasicType bt2 = velt_basic_type(s2);
  if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
    return false;
  if (Matcher::max_vector_size(bt1) < 2) {
    return false; // No vectors for this type
  }

  // Candidates must be structurally identical, either independent or a
  // reduction chain, not already placed in another pack at the same
  // position, adjacent in memory (for mem ops), and alignment-compatible.
  if (isomorphic(s1, s2)) {
    if (independent(s1, s2) || reduction(s1, s2)) {
      if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
        if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
          int s1_align = alignment(s1);
          int s2_align = alignment(s2);
          if (s1_align == top_align || s1_align == align) {
            if (s2_align == top_align || s2_align == align + data_size(s1)) {
              return true;
            }
          }
        }
      }
    }
  }
  return false;
}

//------------------------------exists_at---------------------------
// Does s exist in a pack at position pos?
bool SuperWord::exists_at(Node* s, uint pos) {
  // Linear scan of the packset; pack sizes are small (pairs at this stage).
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* p = _packset.at(i);
    if (p->at(pos) == s) {
      return true;
    }
  }
  return false;
}

//------------------------------are_adjacent_refs---------------------------
// Is s1 immediately before s2 in memory?
bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
  if (!s1->is_Mem() || !s2->is_Mem()) return false;
  if (!in_bb(s1)    || !in_bb(s2))    return false;

  // Do not use superword for non-primitives
  if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
      !is_java_primitive(s2->as_Mem()->memory_type())) {
    return false;
  }

  // FIXME - co_locate_pack fails on Stores in different mem-slices, so
  // only pack memops that are in the same alias set until that's fixed.
  if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
      _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
    return false;
  SWPointer p1(s1->as_Mem(), this);
  SWPointer p2(s2->as_Mem(), this);
  if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
  // Adjacent means s2's byte offset follows s1's by exactly s1's data size.
  int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
  return diff == data_size(s1);
}

//------------------------------isomorphic---------------------------
// Are s1 and s2 similar?  Same opcode, same number of inputs, same
// control input, and same vector element type.
bool SuperWord::isomorphic(Node* s1, Node* s2) {
  if (s1->Opcode() != s2->Opcode()) return false;
  if (s1->req() != s2->req()) return false;
  if (s1->in(0) != s2->in(0)) return false;
  if (!same_velt_type(s1, s2)) return false;
  return true;
}

//------------------------------independent---------------------------
// Is there no data path from s1 to s2 or s2 to s1?
bool SuperWord::independent(Node* s1, Node* s2) {
  //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
  int d1 = depth(s1);
  int d2 = depth(s2);
  // Equal-depth nodes cannot depend on one another (a dependence implies
  // a depth difference), so distinct nodes are independent.
  if (d1 == d2) return s1 != s2;
  // Only the deeper node can depend on the shallower one; search from it.
  Node* deep    = d1 > d2 ? s1 : s2;
  Node* shallow = d1 > d2 ? s2 : s1;

  visited_clear();

  return independent_path(shallow, deep);
}

//------------------------------reduction---------------------------
// Is there a data path between s1 and s2 and are both nodes reductions?
bool SuperWord::reduction(Node* s1, Node* s2) {
  bool retValue = false;
  int d1 = depth(s1);
  int d2 = depth(s2);
  // A reduction pair must be consecutive in depth (s1 feeds s2 directly).
  if (d1 + 1 == d2) {
    if (s1->is_reduction() && s2->is_reduction()) {
      // This is an ordered set, so s1 should define s2
      for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
        Node* t1 = s1->fast_out(i);
        if (t1 == s2) {
          // both nodes are reductions and connected
          retValue = true;
        }
      }
    }
  }

  return retValue;
}

//------------------------------independent_path------------------------------
// Helper for independent: depth-first search backwards through the
// dependence graph from "deep", pruned by node depth; returns false iff
// "shallow" is reachable (i.e. the nodes are dependent).
bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
  if (dp >= 1000) return false; // stop deep recursion
  visited_set(deep);
  int shal_depth = depth(shallow);
  assert(shal_depth <= depth(deep), "must be");
  for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
    Node* pred = preds.current();
    if (in_bb(pred) && !visited_test(pred)) {
      if (shallow == pred) {
        return false;
      }
      // Only recurse while the predecessor is still deeper than "shallow";
      // anything at or above shallow's depth cannot reach it.
      if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
        return false;
      }
    }
  }
  return true;
}

//------------------------------set_alignment---------------------------
// Record alignment for the pair (s1, s2): s2 sits data_size(s1) bytes
// after s1, except for the sentinel values top/bottom_align which are
// propagated unchanged.
void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
  set_alignment(s1, align);
  if (align == top_align || align == bottom_align) {
    set_alignment(s2, align);
  } else {
    set_alignment(s2, align + data_size(s1));
  }
}

//------------------------------data_size---------------------------
// Size in bytes of this node's vector element type.
int SuperWord::data_size(Node* s) {
  int bsize = type2aelembytes(velt_basic_type(s));
  assert(bsize != 0, "valid size");
  return bsize;
}

//------------------------------extend_packlist---------------------------
// Extend packset by following use->def and def->use links from pack members,
// iterating to a fixed point.
void SuperWord::extend_packlist() {
  bool changed;
  do {
    packset_sort(_packset.length());
    changed = false;
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p = _packset.at(i);
      changed |= follow_use_defs(p);
      changed |= follow_def_uses(p);
    }
  } while (changed);

  // If any def had multiple uses, def->use ordering may matter; fix it up.
  if (_race_possible) {
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p = _packset.at(i);
      order_def_uses(p);
    }
  }

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter extend_packlist");
    print_packset();
  }
#endif
}

//------------------------------follow_use_defs---------------------------
// Extend the packset by visiting operand definitions of nodes in pack p
bool SuperWord::follow_use_defs(Node_List* p) {
  assert(p->size() == 2, "just checking");
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  // Loads have no in-block data operands worth following.
  if (s1->is_Load()) return false;

  int align = alignment(s1);
  bool changed = false;
  // For stores only the stored value is a candidate operand; for other
  // nodes consider every data input (skipping the control input 0).
  int start = s1->is_Store() ? MemNode::ValueIn   : 1;
  int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
  for (int j = start; j < end; j++) {
    Node* t1 = s1->in(j);
    Node* t2 = s2->in(j);
    if (!in_bb(t1) || !in_bb(t2))
      continue;
    if (stmts_can_pack(t1, t2, align)) {
      // Only pack the defs when the cost model estimates a net win.
      if (est_savings(t1, t2) >= 0) {
        Node_List* pair = new Node_List();
        pair->push(t1);
        pair->push(t2);
        _packset.append(pair);
        set_alignment(t1, t2, align);
        changed = true;
      }
    }
  }
  return changed;
}

//------------------------------follow_def_uses---------------------------
// Extend the packset by visiting uses of nodes in pack p
bool SuperWord::follow_def_uses(Node_List* p) {
  bool changed = false;
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(p->size() == 2, "just checking");
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  // Stores have no uses to follow.
  if (s1->is_Store()) return false;

  int align = alignment(s1);
  int savings = -1;
  int num_s1_uses = 0;
  Node* u1 = NULL;
  Node* u2 = NULL;
  // Among all in-block (use-of-s1, use-of-s2) combinations with matching
  // operand positions, keep the single packable pair with the best
  // estimated savings.
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);
    num_s1_uses++;
    if (!in_bb(t1)) continue;
    for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
      Node* t2 = s2->fast_out(j);
      if (!in_bb(t2)) continue;
      if (!opnd_positions_match(s1, t1, s2, t2))
        continue;
      if (stmts_can_pack(t1, t2, align)) {
        int my_savings = est_savings(t1, t2);
        if (my_savings > savings) {
          savings = my_savings;
          u1 = t1;
          u2 = t2;
        }
      }
    }
  }
  if (num_s1_uses > 1) {
    // Multiple uses of the same def: later def->use ordering pass needed.
    _race_possible = true;
  }
  if (savings >= 0) {
    Node_List* pair = new Node_List();
    pair->push(u1);
    pair->push(u2);
    _packset.append(pair);
    set_alignment(u1, u2, align);
    changed = true;
  }
  return changed;
}

//------------------------------order_def_uses--------------------------- 1072 // For extended packsets, ordinally arrange uses packset by major component 1073 void SuperWord::order_def_uses(Node_List* p) { 1074 Node* s1 = p->at(0); 1075 1076 if (s1->is_Store()) return; 1077 1078 // reductions are always managed beforehand 1079 if (s1->is_reduction()) return; 1080 1081 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { 1082 Node* t1 = s1->fast_out(i); 1083 1084 // Only allow operand swap on commuting operations 1085 if (!t1->is_Add() && !t1->is_Mul()) { 1086 break; 1087 } 1088 1089 // Now find t1's packset 1090 Node_List* p2 = NULL; 1091 for (int j = 0; j < _packset.length(); j++) { 1092 p2 = _packset.at(j); 1093 Node* first = p2->at(0); 1094 if (t1 == first) { 1095 break; 1096 } 1097 p2 = NULL; 1098 } 1099 // Arrange all sub components by the major component 1100 if (p2 != NULL) { 1101 for (uint j = 1; j < p->size(); j++) { 1102 Node* d1 = p->at(j); 1103 Node* u1 = p2->at(j); 1104 opnd_positions_match(s1, t1, d1, u1); 1105 } 1106 } 1107 } 1108 } 1109 1110 //---------------------------opnd_positions_match------------------------- 1111 // Is the use of d1 in u1 at the same operand position as d2 in u2? 
1112 bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) { 1113 // check reductions to see if they are marshalled to represent the reduction 1114 // operator in a specified opnd 1115 if (u1->is_reduction() && u2->is_reduction()) { 1116 // ensure reductions have phis and reduction definitions feeding the 1st operand 1117 Node* first = u1->in(2); 1118 if (first->is_Phi() || first->is_reduction()) { 1119 u1->swap_edges(1, 2); 1120 } 1121 // ensure reductions have phis and reduction definitions feeding the 1st operand 1122 first = u2->in(2); 1123 if (first->is_Phi() || first->is_reduction()) { 1124 u2->swap_edges(1, 2); 1125 } 1126 return true; 1127 } 1128 1129 uint ct = u1->req(); 1130 if (ct != u2->req()) return false; 1131 uint i1 = 0; 1132 uint i2 = 0; 1133 do { 1134 for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break; 1135 for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break; 1136 if (i1 != i2) { 1137 if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) { 1138 // Further analysis relies on operands position matching. 
1139 u2->swap_edges(i1, i2); 1140 } else { 1141 return false; 1142 } 1143 } 1144 } while (i1 < ct); 1145 return true; 1146 } 1147 1148 //------------------------------est_savings--------------------------- 1149 // Estimate the savings from executing s1 and s2 as a pack 1150 int SuperWord::est_savings(Node* s1, Node* s2) { 1151 int save_in = 2 - 1; // 2 operations per instruction in packed form 1152 1153 // inputs 1154 for (uint i = 1; i < s1->req(); i++) { 1155 Node* x1 = s1->in(i); 1156 Node* x2 = s2->in(i); 1157 if (x1 != x2) { 1158 if (are_adjacent_refs(x1, x2)) { 1159 save_in += adjacent_profit(x1, x2); 1160 } else if (!in_packset(x1, x2)) { 1161 save_in -= pack_cost(2); 1162 } else { 1163 save_in += unpack_cost(2); 1164 } 1165 } 1166 } 1167 1168 // uses of result 1169 uint ct = 0; 1170 int save_use = 0; 1171 for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { 1172 Node* s1_use = s1->fast_out(i); 1173 for (int j = 0; j < _packset.length(); j++) { 1174 Node_List* p = _packset.at(j); 1175 if (p->at(0) == s1_use) { 1176 for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) { 1177 Node* s2_use = s2->fast_out(k); 1178 if (p->at(p->size()-1) == s2_use) { 1179 ct++; 1180 if (are_adjacent_refs(s1_use, s2_use)) { 1181 save_use += adjacent_profit(s1_use, s2_use); 1182 } 1183 } 1184 } 1185 } 1186 } 1187 } 1188 1189 if (ct < s1->outcnt()) save_use += unpack_cost(1); 1190 if (ct < s2->outcnt()) save_use += unpack_cost(1); 1191 1192 return MAX2(save_in, save_use); 1193 } 1194 1195 //------------------------------costs--------------------------- 1196 int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; } 1197 int SuperWord::pack_cost(int ct) { return ct; } 1198 int SuperWord::unpack_cost(int ct) { return ct; } 1199 1200 //------------------------------combine_packs--------------------------- 1201 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last 1202 void SuperWord::combine_packs() { 1203 bool 
changed = true; 1204 // Combine packs regardless max vector size. 1205 while (changed) { 1206 changed = false; 1207 for (int i = 0; i < _packset.length(); i++) { 1208 Node_List* p1 = _packset.at(i); 1209 if (p1 == NULL) continue; 1210 // Because of sorting we can start at i + 1 1211 for (int j = i + 1; j < _packset.length(); j++) { 1212 Node_List* p2 = _packset.at(j); 1213 if (p2 == NULL) continue; 1214 if (i == j) continue; 1215 if (p1->at(p1->size()-1) == p2->at(0)) { 1216 for (uint k = 1; k < p2->size(); k++) { 1217 p1->push(p2->at(k)); 1218 } 1219 _packset.at_put(j, NULL); 1220 changed = true; 1221 } 1222 } 1223 } 1224 } 1225 1226 // Split packs which have size greater then max vector size. 1227 for (int i = 0; i < _packset.length(); i++) { 1228 Node_List* p1 = _packset.at(i); 1229 if (p1 != NULL) { 1230 BasicType bt = velt_basic_type(p1->at(0)); 1231 uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector 1232 assert(is_power_of_2(max_vlen), "sanity"); 1233 uint psize = p1->size(); 1234 if (!is_power_of_2(psize)) { 1235 // Skip pack which can't be vector. 1236 // case1: for(...) { a[i] = i; } elements values are different (i+x) 1237 // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store 1238 _packset.at_put(i, NULL); 1239 continue; 1240 } 1241 if (psize > max_vlen) { 1242 Node_List* pack = new Node_List(); 1243 for (uint j = 0; j < psize; j++) { 1244 pack->push(p1->at(j)); 1245 if (pack->size() >= max_vlen) { 1246 assert(is_power_of_2(pack->size()), "sanity"); 1247 _packset.append(pack); 1248 pack = new Node_List(); 1249 } 1250 } 1251 _packset.at_put(i, NULL); 1252 } 1253 } 1254 } 1255 1256 // Compress list. 
1257 for (int i = _packset.length() - 1; i >= 0; i--) { 1258 Node_List* p1 = _packset.at(i); 1259 if (p1 == NULL) { 1260 _packset.remove_at(i); 1261 } 1262 } 1263 1264 #ifndef PRODUCT 1265 if (TraceSuperWord) { 1266 tty->print_cr("\nAfter combine_packs"); 1267 print_packset(); 1268 } 1269 #endif 1270 } 1271 1272 //-----------------------------construct_my_pack_map-------------------------- 1273 // Construct the map from nodes to packs. Only valid after the 1274 // point where a node is only in one pack (after combine_packs). 1275 void SuperWord::construct_my_pack_map() { 1276 Node_List* rslt = NULL; 1277 for (int i = 0; i < _packset.length(); i++) { 1278 Node_List* p = _packset.at(i); 1279 for (uint j = 0; j < p->size(); j++) { 1280 Node* s = p->at(j); 1281 assert(my_pack(s) == NULL, "only in one pack"); 1282 set_my_pack(s, p); 1283 } 1284 } 1285 } 1286 1287 //------------------------------filter_packs--------------------------- 1288 // Remove packs that are not implemented or not profitable. 
void SuperWord::filter_packs() {
  // Remove packs that are not implemented
  for (int i = _packset.length() - 1; i >= 0; i--) {
    Node_List* pk = _packset.at(i);
    bool impl = implemented(pk);
    if (!impl) {
#ifndef PRODUCT
      if (TraceSuperWord && Verbose) {
        tty->print_cr("Unimplemented");
        pk->at(0)->dump();
      }
#endif
      remove_pack_at(i);
    }
    // NOTE(review): 'pk' is read here even when the pack was just removed
    // above (the arena-allocated list itself is still valid), so removed
    // unimplemented packs are still counted in _num_reductions /
    // _num_work_vecs, which profitable() later compares. Confirm that
    // counting removed packs is intended.
    Node *n = pk->at(0);
    if (n->is_reduction()) {
      _num_reductions++;
    } else {
      _num_work_vecs++;
    }
  }

  // Remove packs that are not profitable
  // (iterate to a fixed point: removing one pack can make another
  // unprofitable because its uses/defs are no longer vectorized)
  bool changed;
  do {
    changed = false;
    for (int i = _packset.length() - 1; i >= 0; i--) {
      Node_List* pk = _packset.at(i);
      bool prof = profitable(pk);
      if (!prof) {
#ifndef PRODUCT
        if (TraceSuperWord && Verbose) {
          tty->print_cr("Unprofitable");
          pk->at(0)->dump();
        }
#endif
        remove_pack_at(i);
        changed = true;
      }
    }
  } while (changed);

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter filter_packs");
    print_packset();
    tty->cr();
  }
#endif
}

//------------------------------implemented---------------------------
// Can code be generated for pack p?
// Defers to ReductionNode/VectorNode matchers; length-2 INT/LONG reductions
// are explicitly rejected as not worth vectorizing.
bool SuperWord::implemented(Node_List* p) {
  bool retValue = false;
  Node* p0 = p->at(0);
  if (p0 != NULL) {
    int opc = p0->Opcode();
    uint size = p->size();
    if (p0->is_reduction()) {
      const Type *arith_type = p0->bottom_type();
      // Length 2 reductions of INT/LONG do not offer performance benefits
      if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
        retValue = false;
      } else {
        retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
      }
    } else {
      retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
    }
  }
  return retValue;
}

//------------------------------same_inputs--------------------------
// For pack p, are all idx operands the same?
// (the scalar-promotion case: one value broadcast to every lane)
static bool same_inputs(Node_List* p, int idx) {
  Node* p0 = p->at(0);
  uint vlen = p->size();
  Node* p0_def = p0->in(idx);
  for (uint i = 1; i < vlen; i++) {
    Node* pi = p->at(i);
    Node* pi_def = pi->in(idx);
    if (p0_def != pi_def)
      return false;
  }
  return true;
}

//------------------------------profitable---------------------------
// For pack p, are all operands and all uses (with in the block) vector?
// Also validates reduction connectivity and (for shifts) that the shift
// count is a single scalar. May clear the reduction flag on p0 as a side
// effect when a reduction pack cannot pay for itself.
bool SuperWord::profitable(Node_List* p) {
  Node* p0 = p->at(0);
  uint start, end;
  VectorNode::vector_operands(p0, &start, &end);

  // Return false if some inputs are not vectors or vectors with different
  // size or alignment.
  // Also, for now, return false if not scalar promotion case when inputs are
  // the same. Later, implement PackNode and allow differing, non-vector inputs
  // (maybe just the ones from outside the block.)
  for (uint i = start; i < end; i++) {
    if (!is_vector_use(p0, i))
      return false;
  }
  // Check if reductions are connected
  if (p0->is_reduction()) {
    Node* second_in = p0->in(2);
    Node_List* second_pk = my_pack(second_in);
    if ((second_pk == NULL) || (_num_work_vecs == _num_reductions)) {
      // Remove reduction flag if no parent pack or if not enough work
      // to cover reduction expansion overhead
      p0->remove_flag(Node::Flag_is_reduction);
      return false;
    } else if (second_pk->size() != p->size()) {
      return false;
    }
  }
  if (VectorNode::is_shift(p0)) {
    // For now, return false if shift count is vector or not scalar promotion
    // case (different shift counts) because it is not supported yet.
    Node* cnt = p0->in(2);
    Node_List* cnt_pk = my_pack(cnt);
    if (cnt_pk != NULL)
      return false;
    if (!same_inputs(p, 2))
      return false;
  }
  if (!p0->is_Store()) {
    // For now, return false if not all uses are vector.
    // Later, implement ExtractNode and allow non-vector uses (maybe
    // just the ones outside the block.)
    for (uint i = 0; i < p->size(); i++) {
      Node* def = p->at(i);
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        for (uint k = 0; k < use->req(); k++) {
          Node* n = use->in(k);
          if (def == n) {
            // reductions can be loop carried dependences
            if (def->is_reduction() && use->is_Phi())
              continue;
            if (!is_vector_use(use, k)) {
              return false;
            }
          }
        }
      }
    }
  }
  return true;
}

//------------------------------schedule---------------------------
// Adjust the memory graph for the packed operations
void SuperWord::schedule() {

  // Co-locate in the memory graph the members of each memory pack
  for (int i = 0; i < _packset.length(); i++) {
    co_locate_pack(_packset.at(i));
  }
}

//-------------------------------remove_and_insert-------------------
// Remove "current" from its current position in the memory graph and insert
// it after the appropriate insertion point (lip or uip).
// Splices 'current' out of the memory chain (forwarding its memory users to
// either its own input memory, uip, or lip depending on scheduling direction)
// and reinserts it at uip (if scheduled before the pack) or lip (if after).
// sched_before holds the set of memory ops that must move above the pack.
void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                  Node *uip, Unique_Node_List &sched_before) {
  Node* my_mem = current->in(MemNode::Memory);
  bool sched_up = sched_before.member(current);

  // remove current_store from its current position in the memory graph
  for (DUIterator i = current->outs(); current->has_out(i); i++) {
    Node* use = current->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == current, "must be");
      if (use == prev) { // connect prev to my_mem
        _igvn.replace_input_of(use, MemNode::Memory, my_mem);
        --i; //deleted this edge; rescan position
      } else if (sched_before.member(use)) {
        if (!sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, uip);
          --i; //deleted this edge; rescan position
        }
      } else {
        if (sched_up) { // Will be moved together with current
          _igvn.replace_input_of(use, MemNode::Memory, lip);
          --i; //deleted this edge; rescan position
        }
      }
    }
  }

  Node *insert_pt = sched_up ? uip : lip;

  // all uses of insert_pt's memory state should use current's instead
  for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
    Node* use = insert_pt->out(i);
    if (use->is_Mem()) {
      assert(use->in(MemNode::Memory) == insert_pt, "must be");
      _igvn.replace_input_of(use, MemNode::Memory, current);
      --i; //deleted this edge; rescan position
    } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
      uint pos; //lip (lower insert point) must be the last one in the memory slice
      for (pos=1; pos < use->req(); pos++) {
        if (use->in(pos) == insert_pt) break;
      }
      _igvn.replace_input_of(use, pos, current);
      --i;
    }
  }

  //connect current to insert_pt
  _igvn.replace_input_of(current, MemNode::Memory, insert_pt);
}

//------------------------------co_locate_pack----------------------------------
// To schedule a store pack, we need to move any sandwiched memory ops either before
// or after the pack, based upon dependence information:
// (1) If any store in the pack depends on the sandwiched memory op, the
//     sandwiched memory op must be scheduled BEFORE the pack;
// (2) If a sandwiched memory op depends on any store in the pack, the
//     sandwiched memory op must be scheduled AFTER the pack;
// (3) If a sandwiched memory op (say, memA) depends on another sandwiched
//     memory op (say memB), memB must be scheduled before memA. So, if memA is
//     scheduled before the pack, memB must also be scheduled before the pack;
// (4) If there is no dependence restriction for a sandwiched memory op, we simply
//     schedule this store AFTER the pack
// (5) We know there is no dependence cycle, so there is no other case;
// (6) Finally, all memory ops in another single pack should be moved in the same direction.
//
// To schedule a load pack, we use the memory state of either the first or the last load in
// the pack, based on the dependence constraint.
void SuperWord::co_locate_pack(Node_List* pk) {
  if (pk->at(0)->is_Store()) {
    MemNode* first     = executed_first(pk)->as_Mem();
    MemNode* last      = executed_last(pk)->as_Mem();
    Unique_Node_List schedule_before_pack;
    Unique_Node_List memops;

    // Collect all memory ops sandwiched between 'first' and 'last' on the
    // memory slice, plus memory users hanging off each chain element.
    MemNode* current  = last->in(MemNode::Memory)->as_Mem();
    MemNode* previous = last;
    while (true) {
      assert(in_bb(current), "stay in block");
      memops.push(previous);
      for (DUIterator i = current->outs(); current->has_out(i); i++) {
        Node* use = current->out(i);
        if (use->is_Mem() && use != previous)
          memops.push(use);
      }
      if (current == first) break;
      previous = current;
      current  = current->in(MemNode::Memory)->as_Mem();
    }

    // determine which memory operations should be scheduled before the pack
    for (uint i = 1; i < memops.size(); i++) {
      Node *s1 = memops.at(i);
      if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
        for (uint j = 0; j< i; j++) {
          Node *s2 = memops.at(j);
          if (!independent(s1, s2)) {
            if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
              schedule_before_pack.push(s1); // s1 must be scheduled before
              // Rule (6): all members of s1's own pack move the same way.
              Node_List* mem_pk = my_pack(s1);
              if (mem_pk != NULL) {
                for (uint ii = 0; ii < mem_pk->size(); ii++) {
                  Node* s = mem_pk->at(ii); // follow partner
                  if (memops.member(s) && !schedule_before_pack.member(s))
                    schedule_before_pack.push(s);
                }
              }
              break;
            }
          }
        }
      }
    }

    Node* upper_insert_pt = first->in(MemNode::Memory);
    // Following code moves loads connected to upper_insert_pt below aliased stores.
    // Collect such loads here and reconnect them back to upper_insert_pt later.
    memops.clear();
    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
      Node* use = upper_insert_pt->out(i);
      if (use->is_Mem() && !use->is_Store()) {
        memops.push(use);
      }
    }

    MemNode* lower_insert_pt = last;
    previous = last; //previous store in pk
    current  = last->in(MemNode::Memory)->as_Mem();

    // start scheduling from "last" to "first"
    while (true) {
      assert(in_bb(current), "stay in block");
      assert(in_pack(previous, pk), "previous stays in pack");
      Node* my_mem = current->in(MemNode::Memory);

      if (in_pack(current, pk)) {
        // Forward users of my memory state (except "previous") to my input memory state
        for (DUIterator i = current->outs(); current->has_out(i); i++) {
          Node* use = current->out(i);
          if (use->is_Mem() && use != previous) {
            assert(use->in(MemNode::Memory) == current, "must be");
            if (schedule_before_pack.member(use)) {
              _igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
            } else {
              _igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
            }
            --i; // deleted this edge; rescan position
          }
        }
        previous = current;
      } else { // !in_pack(current, pk) ==> a sandwiched store
        remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
      }

      if (current == first) break;
      current = my_mem->as_Mem();
    } // end while

    // Reconnect loads back to upper_insert_pt.
    for (uint i = 0; i < memops.size(); i++) {
      Node *ld = memops.at(i);
      if (ld->in(MemNode::Memory) != upper_insert_pt) {
        _igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
      }
    }
  } else if (pk->at(0)->is_Load()) { //load
    // all loads in the pack should have the same memory state. By default,
    // we use the memory state of the last load. However, if any load could
    // not be moved down due to the dependence constraint, we use the memory
    // state of the first load.
    Node* last_mem  = executed_last(pk)->in(MemNode::Memory);
    Node* first_mem = executed_first(pk)->in(MemNode::Memory);
    bool schedule_last = true;
    for (uint i = 0; i < pk->size(); i++) {
      Node* ld = pk->at(i);
      for (Node* current = last_mem; current != ld->in(MemNode::Memory);
           current=current->in(MemNode::Memory)) {
        assert(current != first_mem, "corrupted memory graph");
        if(current->is_Mem() && !independent(current, ld)){
          schedule_last = false; // a later store depends on this load
          break;
        }
      }
    }

    Node* mem_input = schedule_last ? last_mem : first_mem;
    _igvn.hash_delete(mem_input);
    // Give each load the same memory state
    for (uint i = 0; i < pk->size(); i++) {
      LoadNode* ld = pk->at(i)->as_Load();
      _igvn.replace_input_of(ld, MemNode::Memory, mem_input);
    }
  }
}

//------------------------------output---------------------------
// Convert packs into vector node operations.
// For each pack (keyed on its executed-last member) builds the matching
// LoadVector/StoreVector/Reduction/Vector node, replaces every scalar pack
// member with it, and records the largest vector size produced.
void SuperWord::output() {
  if (_packset.length() == 0) return;

#ifndef PRODUCT
  if (TraceLoopOpts) {
    tty->print("SuperWord ");
    lpt()->dump_head();
  }
#endif

  // MUST ENSURE main loop's initial value is properly aligned:
  //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0

  align_initial_loop_index(align_to_ref());

  // Insert extract (unpack) operations for scalar uses
  for (int i = 0; i < _packset.length(); i++) {
    insert_extracts(_packset.at(i));
  }

  Compile* C = _phase->C;
  uint max_vlen_in_bytes = 0;
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    Node_List* p = my_pack(n);
    if (p && n == executed_last(p)) {
      uint vlen = p->size();
      uint vlen_in_bytes = 0;
      Node* vn = NULL;
      Node* low_adr = p->at(0);
      Node* first   = executed_first(p);
      int   opc = n->Opcode();
      if (n->is_Load()) {
        Node* ctl = n->in(MemNode::Control);
        Node* mem = first->in(MemNode::Memory);
        SWPointer p1(n->as_Mem(), this);
        // Identify the memory dependency for the new loadVector node by
        // walking up through memory chain.
        // This is done to give flexibility to the new loadVector node so that
        // it can move above independent storeVector nodes.
        while (mem->is_StoreVector()) {
          SWPointer p2(mem->as_Mem(), this);
          int cmp = p1.cmp(p2);
          if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
            mem = mem->in(MemNode::Memory);
          } else {
            break; // dependent memory
          }
        }
        Node* adr = low_adr->in(MemNode::Address);
        const TypePtr* atyp = n->adr_type();
        vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
        vlen_in_bytes = vn->as_LoadVector()->memory_size();
      } else if (n->is_Store()) {
        // Promote value to be stored to vector
        Node* val = vector_opd(p, MemNode::ValueIn);
        Node* ctl = n->in(MemNode::Control);
        Node* mem = first->in(MemNode::Memory);
        Node* adr = low_adr->in(MemNode::Address);
        const TypePtr* atyp = n->adr_type();
        vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
        vlen_in_bytes = vn->as_StoreVector()->memory_size();
      } else if (n->req() == 3) {
        // Promote operands to vector
        Node* in1 = NULL;
        bool node_isa_reduction = n->is_reduction();
        if (node_isa_reduction) {
          // the input to the first reduction operation is retained
          in1 = low_adr->in(1);
        } else {
          in1 = vector_opd(p, 1);
        }
        Node* in2 = vector_opd(p, 2);
        if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
          // Move invariant vector input into second position to avoid register spilling.
          Node* tmp = in1;
          in1 = in2;
          in2 = tmp;
        }
        if (node_isa_reduction) {
          const Type *arith_type = n->bottom_type();
          vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
          if (in2->is_Load()) {
            vlen_in_bytes = in2->as_LoadVector()->memory_size();
          } else {
            vlen_in_bytes = in2->as_Vector()->length_in_bytes();
          }
        } else {
          vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
          vlen_in_bytes = vn->as_Vector()->length_in_bytes();
        }
      } else {
        ShouldNotReachHere();
      }
      assert(vn != NULL, "sanity");
      _igvn.register_new_node_with_optimizer(vn);
      _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
      for (uint j = 0; j < p->size(); j++) {
        Node* pm = p->at(j);
        _igvn.replace_node(pm, vn);
      }
      _igvn._worklist.push(vn);

      if (vlen_in_bytes > max_vlen_in_bytes) {
        max_vlen_in_bytes = vlen_in_bytes;
      }
#ifdef ASSERT
      if (TraceNewVectors) {
        tty->print("new Vector node: ");
        vn->dump();
      }
#endif
    }
  }
  C->set_max_vector_size(max_vlen_in_bytes);
}

//------------------------------vector_opd---------------------------
// Create a vector operand for the nodes in pack p for operand: in(opd_idx).
// Returns the existing vector if inputs already match, a (masked) shift
// count for shift packs, a broadcast for scalar promotion, or a PackNode
// gathering the per-lane scalar inputs otherwise.
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
  Node* p0 = p->at(0);
  uint vlen = p->size();
  Node* opd = p0->in(opd_idx);

  if (same_inputs(p, opd_idx)) {
    if (opd->is_Vector() || opd->is_LoadVector()) {
      assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
      return opd; // input is matching vector
    }
    if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
      // NOTE(review): 'C' appears unused in this branch in the visible
      // code - confirm and consider removing.
      Compile* C = _phase->C;
      Node* cnt = opd;
      // Vector instructions do not mask shift count, do it here.
      juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
      const TypeInt* t = opd->find_int_type();
      if (t != NULL && t->is_con()) {
        juint shift = t->get_con();
        if (shift > mask) { // Unsigned cmp
          cnt = ConNode::make(TypeInt::make(shift & mask));
        }
      } else {
        if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
          cnt = ConNode::make(TypeInt::make(mask));
          _igvn.register_new_node_with_optimizer(cnt);
          cnt = new AndINode(opd, cnt);
          _igvn.register_new_node_with_optimizer(cnt);
          _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
        }
        assert(opd->bottom_type()->isa_int(), "int type only");
        // Move non constant shift count into vector register.
        cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
      }
      if (cnt != opd) {
        _igvn.register_new_node_with_optimizer(cnt);
        _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
      }
      return cnt;
    }
    assert(!opd->is_StoreVector(), "such vector is not expected here");
    // Convert scalar input to vector with the same number of elements as
    // p0's vector. Use p0's type because size of operand's container in
    // vector should match p0's size regardless operand's size.
    const Type* p0_t = velt_type(p0);
    VectorNode* vn = VectorNode::scalar2vector(opd, vlen, p0_t);

    _igvn.register_new_node_with_optimizer(vn);
    _phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
    if (TraceNewVectors) {
      tty->print("new Vector node: ");
      vn->dump();
    }
#endif
    return vn;
  }

  // Insert pack operation
  BasicType bt = velt_basic_type(p0);
  PackNode* pk = PackNode::make(opd, vlen, bt);
  DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )

  for (uint i = 1; i < vlen; i++) {
    Node* pi = p->at(i);
    Node* in = pi->in(opd_idx);
    assert(my_pack(in) == NULL, "Should already have been unpacked");
    assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
    pk->add_opd(in);
  }
  _igvn.register_new_node_with_optimizer(pk);
  _phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
  if (TraceNewVectors) {
    tty->print("new Vector node: ");
    pk->dump();
  }
#endif
  return pk;
}

//------------------------------insert_extracts---------------------------
// If a use of pack p is not a vector use, then replace the
// use with an extract operation.
void SuperWord::insert_extracts(Node_List* p) {
  if (p->at(0)->is_Store()) return;
  assert(_n_idx_list.is_empty(), "empty (node,index) list");

  // Inspect each use of each pack member.  For each use that is
  // not a vector use, replace the use with an extract operation.

  for (uint i = 0; i < p->size(); i++) {
    Node* def = p->at(i);
    for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
      Node* use = def->fast_out(j);
      for (uint k = 0; k < use->req(); k++) {
        Node* n = use->in(k);
        if (def == n) {
          if (!is_vector_use(use, k)) {
            _n_idx_list.push(use, k);
          }
        }
      }
    }
  }

  while (_n_idx_list.is_nonempty()) {
    Node* use = _n_idx_list.node();
    int   idx = _n_idx_list.index();
    _n_idx_list.pop();
    Node* def = use->in(idx);

    if (def->is_reduction()) continue;

    // Insert extract operation
    _igvn.hash_delete(def);
    // Lane index of def within its vector, derived from its byte alignment.
    int def_pos = alignment(def) / data_size(def);

    Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
    _igvn.register_new_node_with_optimizer(ex);
    _phase->set_ctrl(ex, _phase->get_ctrl(def));
    _igvn.replace_input_of(use, idx, ex);
    _igvn._worklist.push(def);

    bb_insert_after(ex, bb_idx(def));
    set_velt_type(ex, velt_type(def));
  }
}

//------------------------------is_vector_use---------------------------
// Is use->in(u_idx) a vector use?
bool SuperWord::is_vector_use(Node* use, int u_idx) {
  Node_List* u_pk = my_pack(use);
  if (u_pk == NULL) return false;
  if (use->is_reduction()) return true;
  Node* def = use->in(u_idx);
  Node_List* d_pk = my_pack(def);
  if (d_pk == NULL) {
    // check for scalar promotion: every member of the use-pack must read
    // the very same (unpacked) input so it can be splat into a vector.
    Node* n = u_pk->at(0)->in(u_idx);
    for (uint i = 1; i < u_pk->size(); i++) {
      if (u_pk->at(i)->in(u_idx) != n) return false;
    }
    return true;
  }
  if (u_pk->size() != d_pk->size())
    return false;
  // Lane i of the use-pack must read lane i of the def-pack, at matching
  // alignment, for the packs to line up as a vector def/use.
  for (uint i = 0; i < u_pk->size(); i++) {
    Node* ui = u_pk->at(i);
    Node* di = d_pk->at(i);
    if (ui->in(u_idx) != di || alignment(ui) != alignment(di))
      return false;
  }
  return true;
}

//------------------------------construct_bb---------------------------
// Construct reverse postorder list of block members
bool SuperWord::construct_bb() {
  Node* entry = bb();

  assert(_stk.length() == 0,            "stk is empty");
  assert(_block.length() == 0,          "block is empty");
  assert(_data_entry.length() == 0,     "data_entry is empty");
  assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
  assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");

  // Find non-control nodes with no inputs from within block,
  // create a temporary map from node _idx to bb_idx for use
  // by the visited and post_visited sets,
  // and count number of nodes in block.
  int bb_ct = 0;
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node *n = lpt()->_body.at(i);
    set_bb_idx(n, i); // Create a temporary map
    if (in_bb(n)) {
      if (n->is_LoadStore() || n->is_MergeMem() ||
          (n->is_Proj() && !n->as_Proj()->is_CFG())) {
        // Bailout if the loop has LoadStore, MergeMem or data Proj
        // nodes. Superword optimization does not work with them.
        return false;
      }
      bb_ct++;
      if (!n->is_CFG()) {
        // A node with no in-block inputs is a "data entry" — a DFS root.
        bool found = false;
        for (uint j = 0; j < n->req(); j++) {
          Node* def = n->in(j);
          if (def && in_bb(def)) {
            found = true;
            break;
          }
        }
        if (!found) {
          assert(n != entry, "can't be entry");
          _data_entry.push(n);
        }
      }
    }
  }

  // Find memory slices (head and tail)
  // A slice is a memory Phi on the loop whose backedge input differs from
  // its entry input; the backedge input is the last store of the slice.
  for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
    Node *n = lp()->fast_out(i);
    if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          assert(n_tail->is_Mem(), err_msg_res("unexpected node for memory slice: %s", n_tail->Name()));
          return false; // Bailout
        }
        _mem_slice_head.push(n);
        _mem_slice_tail.push(n_tail);
      }
    }
  }

  // Create an RPO list of nodes in block

  visited_clear();
  post_visited_clear();

  // Push all non-control nodes with no inputs from within block, then control entry
  for (int j = 0; j < _data_entry.length(); j++) {
    Node* n = _data_entry.at(j);
    visited_set(n);
    _stk.push(n);
  }
  visited_set(entry);
  _stk.push(entry);

  // Do a depth first walk over out edges
  // Nodes are post-visited in reverse: rpo_idx counts down from the end of
  // the block so _block ends up in reverse postorder.
  int rpo_idx = bb_ct - 1;
  int size;
  int reduction_uses = 0;
  while ((size = _stk.length()) > 0) {
    Node* n = _stk.top(); // Leave node on stack
    if (!visited_test_set(n)) {
      // forward arc in graph
    } else if (!post_visited_test(n)) {
      // cross or back arc
      for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
        Node *use = n->fast_out(i);
        if (in_bb(use) && !visited_test(use) &&
            // Don't go around backedge
            (!use->is_Phi() || n == entry)) {
          if (use->is_reduction()) {
            // First see if we can map the reduction on the given system we are on, then
            // make a data entry operation for each reduction we see.
            BasicType bt = use->bottom_type()->basic_type();
            if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
              reduction_uses++;
            }
          }
          _stk.push(use);
        }
      }
      if (_stk.length() == size) {
        // There were no additional uses, post visit node now
        _stk.pop(); // Remove node from stack
        assert(rpo_idx >= 0, "");
        _block.at_put_grow(rpo_idx, n);
        rpo_idx--;
        post_visited_set(n);
        assert(rpo_idx >= 0 || _stk.is_empty(), "");
      }
    } else {
      _stk.pop(); // Remove post-visited node from stack
    }
  }//while

  int ii_current = -1;
  unsigned int load_idx = -1;  // sentinel: intentionally wraps to UINT_MAX
  _ii_order.clear();
  // Create real map of block indices for nodes
  // While doing so, record the generation order of cloned loads (_ii_order)
  // used by the _do_vector_loop transformation.
  for (int j = 0; j < _block.length(); j++) {
    Node* n = _block.at(j);
    set_bb_idx(n, j);
    if (_do_vector_loop && n->is_Load()) {
      if (ii_current == -1) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
        load_idx = _clone_map.idx(n->_idx);
      } else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
        ii_current = _clone_map.gen(n->_idx);
        _ii_order.push(ii_current);
      }
    }
  }//for

  // Ensure extra info is allocated.
  initialize_bb();

#ifndef PRODUCT
  if (_vector_loop_debug && _ii_order.length() > 0) {
    tty->print("SuperWord::construct_bb: List of generations: ");
    for (int jj = 0; jj < _ii_order.length(); ++jj) {
      tty->print(" %d:%d", jj, _ii_order.at(jj));
    }
    tty->print_cr(" ");
  }
  if (TraceSuperWord) {
    print_bb();
    tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _data_entry.length(); m++) {
      tty->print("%3d ", m);
      _data_entry.at(m)->dump();
    }
    tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > 0 ? "" : "NONE");
    for (int m = 0; m < _mem_slice_head.length(); m++) {
      tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
      tty->print("    ");    _mem_slice_tail.at(m)->dump();
    }
  }
#endif
  assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
  // Worth vectorizing only if there is at least one memory slice,
  // reduction use, or data entry node.
  return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
}

//------------------------------initialize_bb---------------------------
// Initialize per node info
void SuperWord::initialize_bb() {
  // Growing to the last node's index sizes _node_info for the whole block.
  Node* last = _block.at(_block.length() - 1);
  grow_node_info(bb_idx(last));
}

//------------------------------bb_insert_after---------------------------
// Insert n into block after pos
void SuperWord::bb_insert_after(Node* n, int pos) {
  int n_pos = pos + 1;
  // Make room
  for (int i = _block.length() - 1; i >= n_pos; i--) {
    _block.at_put_grow(i+1, _block.at(i));
  }
  for (int j = _node_info.length() - 1; j >= n_pos; j--) {
    _node_info.at_put_grow(j+1, _node_info.at(j));
  }
  // Set value
  _block.at_put_grow(n_pos, n);
  _node_info.at_put_grow(n_pos, SWNodeInfo::initial);
  // Adjust map from node->_idx to _block index
  for (int i = n_pos; i < _block.length(); i++) {
    set_bb_idx(_block.at(i), i);
  }
}

//------------------------------compute_max_depth---------------------------
// Compute max depth for expressions from beginning of block
// Use to prune search paths during test for independence.
2117 void SuperWord::compute_max_depth() { 2118 int ct = 0; 2119 bool again; 2120 do { 2121 again = false; 2122 for (int i = 0; i < _block.length(); i++) { 2123 Node* n = _block.at(i); 2124 if (!n->is_Phi()) { 2125 int d_orig = depth(n); 2126 int d_in = 0; 2127 for (DepPreds preds(n, _dg); !preds.done(); preds.next()) { 2128 Node* pred = preds.current(); 2129 if (in_bb(pred)) { 2130 d_in = MAX2(d_in, depth(pred)); 2131 } 2132 } 2133 if (d_in + 1 != d_orig) { 2134 set_depth(n, d_in + 1); 2135 again = true; 2136 } 2137 } 2138 } 2139 ct++; 2140 } while (again); 2141 #ifndef PRODUCT 2142 if (TraceSuperWord && Verbose) 2143 tty->print_cr("compute_max_depth iterated: %d times", ct); 2144 #endif 2145 } 2146 2147 //-------------------------compute_vector_element_type----------------------- 2148 // Compute necessary vector element type for expressions 2149 // This propagates backwards a narrower integer type when the 2150 // upper bits of the value are not needed. 2151 // Example: char a,b,c; a = b + c; 2152 // Normally the type of the add is integer, but for packed character 2153 // operations the type of the add needs to be char. 
void SuperWord::compute_vector_element_type() {
#ifndef PRODUCT
  if (TraceSuperWord && Verbose)
    tty->print_cr("\ncompute_velt_type:");
#endif

  // Initial type
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    set_velt_type(n, container_type(n));
  }

  // Propagate integer narrowed type backwards through operations
  // that don't depend on higher order bits
  // Walk the block in reverse so a use's type is final before its defs.
  for (int i = _block.length() - 1; i >= 0; i--) {
    Node* n = _block.at(i);
    // Only integer types need be examined
    const Type* vtn = velt_type(n);
    if (vtn->basic_type() == T_INT) {
      uint start, end;
      VectorNode::vector_operands(n, &start, &end);

      for (uint j = start; j < end; j++) {
        Node* in  = n->in(j);
        // Don't propagate through a memory
        // Only narrow when the input is wider than this use.
        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
            data_size(n) < data_size(in)) {
          // Narrowing is safe only if *every* in-block use agrees on the
          // narrower element type.
          bool same_type = true;
          for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
            Node *use = in->fast_out(k);
            if (!in_bb(use) || !same_velt_type(use, n)) {
              same_type = false;
              break;
            }
          }
          if (same_type) {
            // For right shifts of small integer types (bool, byte, char, short)
            // we need precise information about sign-ness. Only Load nodes have
            // this information because Store nodes are the same for signed and
            // unsigned values. And any arithmetic operation after a load may
            // expand a value to signed Int so such right shifts can't be used
            // because vector elements do not have upper bits of Int.
            const Type* vt = vtn;
            if (VectorNode::is_shift(in)) {
              Node* load = in->in(1);
              if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
                vt = velt_type(load);
              } else if (in->Opcode() != Op_LShiftI) {
                // Widen type to Int to avoid creation of right shift vector
                // (align + data_size(s1) check in stmts_can_pack() will fail).
                // Note, left shifts work regardless type.
                vt = TypeInt::INT;
              }
            }
            set_velt_type(in, vt);
          }
        }
      }
    }
  }
#ifndef PRODUCT
  if (TraceSuperWord && Verbose) {
    for (int i = 0; i < _block.length(); i++) {
      Node* n = _block.at(i);
      velt_type(n)->dump();
      tty->print("\t");
      n->dump();
    }
  }
#endif
}

//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
// Returns the element offset (mod vector width) of s, or bottom_align if
// the address cannot be analyzed or the type is not vectorizable.
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
#ifndef PRODUCT
  if(TraceSuperWord && Verbose) {
    tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
  }
#endif
  SWPointer p(s, this);
  if (!p.valid()) {
#ifndef PRODUCT
    if(is_trace_alignment()) {
      tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align"); p.print();
    }
#endif
    return bottom_align;
  }
  int vw = vector_width_in_bytes(s);
  if (vw < 2) {
#ifndef PRODUCT
    if(is_trace_alignment()) {
      tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");
    }
#endif
    return bottom_align; // No vectors for this type
  }
  int offset  = p.offset_in_bytes();
  offset     += iv_adjust*p.memory_size();
  int off_rem = offset % vw;
  // C++ '%' may be negative for negative offsets; normalize to [0, vw).
  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
#ifndef PRODUCT
  if(TraceSuperWord && Verbose) {
    tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
  }
#endif
  return off_mod;
}

//---------------------------container_type---------------------------
// Smallest type containing range of values
const Type* SuperWord::container_type(Node* n) {
  if (n->is_Mem()) {
    BasicType bt = n->as_Mem()->memory_type();
    if (n->is_Store() && (bt == T_CHAR)) {
      // Use T_SHORT type instead of T_CHAR for stored values because any
      // preceding arithmetic operation extends values to signed Int.
      bt = T_SHORT;
    }
    if (n->Opcode() == Op_LoadUB) {
      // Adjust type for unsigned byte loads, it is important for right shifts.
      // T_BOOLEAN is used because there is no basic type representing type
      // TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
      // size (one byte) and sign is important.
      bt = T_BOOLEAN;
    }
    return Type::get_const_basic_type(bt);
  }
  const Type* t = _igvn.type(n);
  if (t->basic_type() == T_INT) {
    // A narrow type of arithmetic operations will be determined by
    // propagating the type of memory operations.
    return TypeInt::INT;
  }
  return t;
}

// Two nodes have the same vector element type if the types match, or, for
// integer types, if their element sizes match (narrowness is what matters).
bool SuperWord::same_velt_type(Node* n1, Node* n2) {
  const Type* vt1 = velt_type(n1);
  const Type* vt2 = velt_type(n2);
  if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
    // Compare vectors element sizes for integer types.
    return data_size(n1) == data_size(n2);
  }
  return vt1 == vt2;
}

//------------------------------in_packset---------------------------
// Are s1 and s2 in a pack pair and ordered as s1,s2?
2304 bool SuperWord::in_packset(Node* s1, Node* s2) { 2305 for (int i = 0; i < _packset.length(); i++) { 2306 Node_List* p = _packset.at(i); 2307 assert(p->size() == 2, "must be"); 2308 if (p->at(0) == s1 && p->at(p->size()-1) == s2) { 2309 return true; 2310 } 2311 } 2312 return false; 2313 } 2314 2315 //------------------------------in_pack--------------------------- 2316 // Is s in pack p? 2317 Node_List* SuperWord::in_pack(Node* s, Node_List* p) { 2318 for (uint i = 0; i < p->size(); i++) { 2319 if (p->at(i) == s) { 2320 return p; 2321 } 2322 } 2323 return NULL; 2324 } 2325 2326 //------------------------------remove_pack_at--------------------------- 2327 // Remove the pack at position pos in the packset 2328 void SuperWord::remove_pack_at(int pos) { 2329 Node_List* p = _packset.at(pos); 2330 for (uint i = 0; i < p->size(); i++) { 2331 Node* s = p->at(i); 2332 set_my_pack(s, NULL); 2333 } 2334 _packset.remove_at(pos); 2335 } 2336 2337 void SuperWord::packset_sort(int n) { 2338 // simple bubble sort so that we capitalize with O(n) when its already sorted 2339 while (n != 0) { 2340 bool swapped = false; 2341 for (int i = 1; i < n; i++) { 2342 Node_List* q_low = _packset.at(i-1); 2343 Node_List* q_i = _packset.at(i); 2344 2345 // only swap when we find something to swap 2346 if (alignment(q_low->at(0)) > alignment(q_i->at(0))) { 2347 Node_List* t = q_i; 2348 *(_packset.adr_at(i)) = q_low; 2349 *(_packset.adr_at(i-1)) = q_i; 2350 swapped = true; 2351 } 2352 } 2353 if (swapped == false) break; 2354 n--; 2355 } 2356 } 2357 2358 //------------------------------executed_first--------------------------- 2359 // Return the node executed first in pack p. Uses the RPO block list 2360 // to determine order. 
2361 Node* SuperWord::executed_first(Node_List* p) { 2362 Node* n = p->at(0); 2363 int n_rpo = bb_idx(n); 2364 for (uint i = 1; i < p->size(); i++) { 2365 Node* s = p->at(i); 2366 int s_rpo = bb_idx(s); 2367 if (s_rpo < n_rpo) { 2368 n = s; 2369 n_rpo = s_rpo; 2370 } 2371 } 2372 return n; 2373 } 2374 2375 //------------------------------executed_last--------------------------- 2376 // Return the node executed last in pack p. 2377 Node* SuperWord::executed_last(Node_List* p) { 2378 Node* n = p->at(0); 2379 int n_rpo = bb_idx(n); 2380 for (uint i = 1; i < p->size(); i++) { 2381 Node* s = p->at(i); 2382 int s_rpo = bb_idx(s); 2383 if (s_rpo > n_rpo) { 2384 n = s; 2385 n_rpo = s_rpo; 2386 } 2387 } 2388 return n; 2389 } 2390 2391 LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { 2392 LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest; 2393 for (uint i = 0; i < p->size(); i++) { 2394 Node* n = p->at(i); 2395 assert(n->is_Load(), "only meaningful for loads"); 2396 if (!n->depends_only_on_test()) { 2397 dep = LoadNode::Pinned; 2398 } 2399 } 2400 return dep; 2401 } 2402 2403 2404 //----------------------------align_initial_loop_index--------------------------- 2405 // Adjust pre-loop limit so that in main loop, a load/store reference 2406 // to align_to_ref will be a position zero in the vector. 
//   (iv + k) mod vector_align == 0
void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
  CountedLoopNode *main_head = lp()->as_CountedLoop();
  assert(main_head->is_main_loop(), "");
  CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
  assert(pre_end != NULL, "we must have a correct pre-loop");
  Node *pre_opaq1 = pre_end->limit();
  assert(pre_opaq1->Opcode() == Op_Opaque1, "");
  Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
  Node *lim0 = pre_opaq->in(1);

  // Where we put new limit calculations
  Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);

  // Ensure the original loop limit is available from the
  // pre-loop Opaque1 node.
  Node *orig_limit = pre_opaq->original_loop_limit();
  assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");

  SWPointer align_to_ref_p(align_to_ref, this);
  assert(align_to_ref_p.valid(), "sanity");

  // Given:
  //     lim0 == original pre loop limit
  //     V == v_align (power of 2)
  //     invar == extra invariant piece of the address expression
  //     e == offset [ +/- invar ]
  //
  //  When reassociating expressions involving '%' the basic rules are:
  //     (a - b) % k == 0   =>  a % k == b % k
  // and:
  //     (a + b) % k == 0   =>  a % k == (k - b) % k
  //
  // For stride > 0 && scale > 0,
  //   Derive the new pre-loop limit "lim" such that the two constraints:
  //     (1) lim = lim0 + N           (where N is some positive integer < V)
  //     (2) (e + lim) % V == 0
  //   are true.
  //
  //   Substituting (1) into (2),
  //     (e + lim0 + N) % V == 0
  //   solve for N:
  //     N = (V - (e + lim0)) % V
  //   substitute back into (1), so that new limit
  //     lim = lim0 + (V - (e + lim0)) % V
  //
  // For stride > 0 && scale < 0
  //   Constraints:
  //     lim = lim0 + N
  //     (e - lim) % V == 0
  //   Solving for lim:
  //     (e - lim0 - N) % V == 0
  //     N = (e - lim0) % V
  //     lim = lim0 + (e - lim0) % V
  //
  // For stride < 0 && scale > 0
  //   Constraints:
  //     lim = lim0 - N
  //     (e + lim) % V == 0
  //   Solving for lim:
  //     (e + lim0 - N) % V == 0
  //     N = (e + lim0) % V
  //     lim = lim0 - (e + lim0) % V
  //
  // For stride < 0 && scale < 0
  //   Constraints:
  //     lim = lim0 - N
  //     (e - lim) % V == 0
  //   Solving for lim:
  //     (e - lim0 + N) % V == 0
  //     N = (V - (e - lim0)) % V
  //     lim = lim0 - (V - (e - lim0)) % V

  int vw = vector_width_in_bytes(align_to_ref);
  int stride   = iv_stride();
  int scale    = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
  int v_align  = vw / elt_size;       // vector width in elements
  assert(v_align > 1, "sanity");
  int offset   = align_to_ref_p.offset_in_bytes() / elt_size;
  Node *offsn  = _igvn.intcon(offset);

  // Build "e", the address expression in element units.
  Node *e = offsn;
  if (align_to_ref_p.invar() != NULL) {
    // incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
    Node* aref     = new URShiftINode(align_to_ref_p.invar(), log2_elt);
    _igvn.register_new_node_with_optimizer(aref);
    _phase->set_ctrl(aref, pre_ctrl);
    if (align_to_ref_p.negate_invar()) {
      e = new SubINode(e, aref);
    } else {
      e = new AddINode(e, aref);
    }
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }
  if (vw > ObjectAlignmentInBytes) {
    // incorporate base e +/- base && Mask >>> log2(elt)
    // The base itself is only guaranteed object alignment, so its low bits
    // must participate when the vector is wider than that.
    Node* xbase = new CastP2XNode(NULL, align_to_ref_p.base());
    _igvn.register_new_node_with_optimizer(xbase);
#ifdef _LP64
    xbase  = new ConvL2INode(xbase);
    _igvn.register_new_node_with_optimizer(xbase);
#endif
    Node* mask = _igvn.intcon(vw-1);
    Node* masked_xbase  = new AndINode(xbase, mask);
    _igvn.register_new_node_with_optimizer(masked_xbase);
    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
    Node* bref     = new URShiftINode(masked_xbase, log2_elt);
    _igvn.register_new_node_with_optimizer(bref);
    _phase->set_ctrl(bref, pre_ctrl);
    e = new AddINode(e, bref);
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }

  // compute e +/- lim0
  if (scale < 0) {
    e = new SubINode(e, lim0);
  } else {
    e = new AddINode(e, lim0);
  }
  _igvn.register_new_node_with_optimizer(e);
  _phase->set_ctrl(e, pre_ctrl);

  if (stride * scale > 0) {
    // compute V - (e +/- lim0)
    Node* va  = _igvn.intcon(v_align);
    e = new SubINode(va, e);
    _igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
  }
  // compute N = (exp) % V
  // V is a power of 2 so '% V' is an AND with V-1.
  Node* va_msk = _igvn.intcon(v_align - 1);
  Node* N = new AndINode(e, va_msk);
  _igvn.register_new_node_with_optimizer(N);
  _phase->set_ctrl(N, pre_ctrl);

  //   substitute back into (1), so that new limit
  //     lim = lim0 + N
  Node* lim;
  if (stride < 0) {
    lim = new SubINode(lim0, N);
  } else {
    lim = new AddINode(lim0, N);
  }
  _igvn.register_new_node_with_optimizer(lim);
  _phase->set_ctrl(lim, pre_ctrl);
  // Clamp the new limit so the pre-loop never runs past the original limit.
  Node* constrained =
    (stride > 0) ? (Node*) new MinINode(lim, orig_limit)
                 : (Node*) new MaxINode(lim, orig_limit);
  _igvn.register_new_node_with_optimizer(constrained);
  _phase->set_ctrl(constrained, pre_ctrl);
  _igvn.hash_delete(pre_opaq);
  pre_opaq->set_req(1, constrained);
}

//----------------------------get_pre_loop_end---------------------------
// Find pre loop end from main loop.  Returns null if none.
// Walks: main head -> zero-trip guard -> pre-loop exit projection.
CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode *cl) {
  Node *ctrl = cl->in(LoopNode::EntryControl);
  if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return NULL;
  Node *iffm = ctrl->in(0);
  if (!iffm->is_If()) return NULL;
  Node *p_f = iffm->in(0);
  if (!p_f->is_IfFalse()) return NULL;
  if (!p_f->in(0)->is_CountedLoopEnd()) return NULL;
  CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
  CountedLoopNode* loop_node = pre_end->loopnode();
  if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
  return pre_end;
}


//------------------------------init---------------------------
// Reset all per-loop state before analyzing a new loop.
void SuperWord::init() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _iteration_first.clear();
  _iteration_last.clear();
  _node_info.clear();
  _align_to_ref = NULL;
  _lpt = NULL;
  _lp = NULL;
  _bb = NULL;
  _iv = NULL;
  _race_possible = 0;
  _num_work_vecs = 0;
  _num_reductions = 0;
}

//------------------------------restart---------------------------
// Partial reset used to retry the same loop; keeps loop identity
// (_lpt/_lp/_bb/_iv) unlike init().
void SuperWord::restart() {
  _dg.init();
  _packset.clear();
  _disjoint_ptrs.clear();
  _block.clear();
  _data_entry.clear();
  _mem_slice_head.clear();
  _mem_slice_tail.clear();
  _node_info.clear();
}

//------------------------------print_packset---------------------------
void SuperWord::print_packset() {
#ifndef PRODUCT
  tty->print_cr("packset");
  for (int i = 0; i < _packset.length(); i++) {
    tty->print_cr("Pack: %d", i);
    Node_List* p = _packset.at(i);
    print_pack(p);
  }
#endif
}

//------------------------------print_pack---------------------------
void SuperWord::print_pack(Node_List* p) {
  for (uint i = 0; i < p->size(); i++) {
    print_stmt(p->at(i));
  }
}

//------------------------------print_bb---------------------------
void SuperWord::print_bb() {
#ifndef PRODUCT
  tty->print_cr("\nBlock");
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    tty->print("%d ", i);
    if (n) {
      n->dump();
    }
  }
#endif
}

//------------------------------print_stmt---------------------------
void SuperWord::print_stmt(Node* s) {
#ifndef PRODUCT
  tty->print(" align: %d \t", alignment(s));
  s->dump();
#endif
}

//------------------------------blank---------------------------
// Return a string of 'depth' spaces for indented tracing.
// NOTE: returns a pointer to a shared static buffer; not reentrant.
char* SuperWord::blank(uint depth) {
  static char blanks[101];
  assert(depth < 101, "too deep");
  for (uint i = 0; i < depth; i++) blanks[i] = ' ';
  blanks[depth] = '\0';
  return blanks;
}


//==============================SWPointer===========================
#ifndef PRODUCT
int SWPointer::_depth = 0;  // shared trace-indentation depth (debug only)
#endif
//----------------------------SWPointer------------------------
// Decompose mem's address into base + k*iv [+ invar] + constant offset.
// On failure the object is left with _base/_adr NULL, i.e. !valid().
SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
  _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {

#ifndef PRODUCT
  if(_slp->is_trace_alignment()) {
    print_depth(); tty->print(" %d SWPointer::SWPointer: ctor: ", mem->_idx); mem->dump();
  }
#endif

  Node* adr = mem->in(MemNode::Address);
  if (!adr->is_AddP()) {
    assert(!valid(), "too complex");
    return;
  }
  // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
  Node* base = adr->in(AddPNode::Base);
  // The base address should be loop invariant
  if (!invariant(base)) {
    assert(!valid(), "base address is loop variant");
    return;
  }
  //unsafe reference could not be aligned appropriately without runtime checking
  if (base == NULL || base->bottom_type() == Type::TOP) {
    assert(!valid(), "unsafe access");
    return;
  }

#ifndef PRODUCT
  int idepth = depth();
  if(_slp->is_trace_alignment()) {
    inc_depth();
    print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
    print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
  }
#endif

  // Peel at most 3 levels of AddP, folding each offset input into
  // _scale/_offset/_invar via scaled_iv_plus_offset().
  int i;
  for (i = 0; i < 3; i++) {
#ifndef PRODUCT
    if(_slp->is_trace_alignment()) {
      Node* offset = adr->in(AddPNode::Offset);
      print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
    }
#endif
    if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
      assert(!valid(), "too complex");
      return;
    }
    adr = adr->in(AddPNode::Address);
#ifndef PRODUCT
    if(_slp->is_trace_alignment()) {
      inc_depth();
      print_depth();
      tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i);
      adr->dump();
    }
#endif
    if (base == adr || !adr->is_AddP()) {
      break; // stop looking at addp's
    }
  }
#ifndef PRODUCT
  if(_slp->is_trace_alignment()) {
    set_depth(idepth);
    print_depth();
    tty->print(" %d (adr) SWPointer::SWPointer: stop address analysis: ", adr->_idx);
    adr->dump();
  }
#endif
  _base = base;
  _adr  = adr;
  assert(valid(), "Usable");
}

// Following is used to create a temporary object during
// the pattern match of an address
expression. 2749 SWPointer::SWPointer(SWPointer* p) : 2750 _mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL), 2751 _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {} 2752 2753 2754 bool SWPointer::invariant(Node* n) { 2755 NOT_PRODUCT(Depth dd;) 2756 Node *n_c = phase()->get_ctrl(n); 2757 #ifndef PRODUCT 2758 if (_slp->do_vector_loop() && _slp->is_debug() && 2759 !lpt()->is_member(phase()->get_loop(n_c)) != !_slp->in_bb(n)) { 2760 print_depth(); tty->print(" %d SWPointer::invariant conditions differ: n_c %d", n->_idx, n_c->_idx); 2761 n->dump(); 2762 n_c->dump(); 2763 } 2764 #endif 2765 return !lpt()->is_member(phase()->get_loop(n_c)); 2766 } 2767 //------------------------scaled_iv_plus_offset-------------------- 2768 // Match: k*iv + offset 2769 // where: k is a constant that maybe zero, and 2770 // offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional 2771 bool SWPointer::scaled_iv_plus_offset(Node* n) { 2772 #ifndef PRODUCT 2773 Depth ddd; 2774 if(_slp->is_trace_alignment()) { 2775 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx); 2776 n->dump(); 2777 } 2778 #endif 2779 2780 if (scaled_iv(n)) { 2781 #ifndef PRODUCT 2782 if(_slp->is_trace_alignment()) { 2783 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: is scaled_iv_plus_offset", n->_idx); 2784 } 2785 #endif 2786 return true; 2787 } 2788 2789 if (offset_plus_k(n)) { 2790 #ifndef PRODUCT 2791 if(_slp->is_trace_alignment()) { 2792 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: is scaled_iv_plus_offset", n->_idx); 2793 } 2794 #endif 2795 return true; 2796 } 2797 2798 int opc = n->Opcode(); 2799 if (opc == Op_AddI) { 2800 if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2))) { 2801 #ifndef PRODUCT 2802 if(_slp->is_trace_alignment()) { 2803 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump(); 2804 print_depth(); tty->print(" %d 
SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump(); 2805 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI is scaled_iv_plus_offset", n->_idx); 2806 } 2807 #endif 2808 return true; 2809 } 2810 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) { 2811 #ifndef PRODUCT 2812 if(_slp->is_trace_alignment()) { 2813 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump(); 2814 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump(); 2815 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI is scaled_iv_plus_offset", n->_idx); 2816 } 2817 #endif 2818 return true; 2819 } 2820 } else if (opc == Op_SubI) { 2821 if (scaled_iv(n->in(1)) && offset_plus_k(n->in(2), true)) { 2822 #ifndef PRODUCT 2823 if(_slp->is_trace_alignment()) { 2824 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump(); 2825 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump(); 2826 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI is scaled_iv_plus_offset", n->_idx); 2827 } 2828 #endif 2829 return true; 2830 } 2831 if (scaled_iv(n->in(2)) && offset_plus_k(n->in(1))) { 2832 _scale *= -1; 2833 #ifndef PRODUCT 2834 if(_slp->is_trace_alignment()) { 2835 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump(); 2836 print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump(); 2837 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI is scaled_iv_plus_offset", n->_idx); 2838 } 2839 #endif 2840 return true; 2841 } 2842 } 2843 2844 #ifndef PRODUCT 2845 
if(_slp->is_trace_alignment()) { 2846 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: is NOT scaled_iv_plus_offset", n->_idx); 2847 } 2848 #endif 2849 return false; 2850 } 2851 2852 //----------------------------scaled_iv------------------------ 2853 // Match: k*iv where k is a constant that's not zero 2854 bool SWPointer::scaled_iv(Node* n) { 2855 #ifndef PRODUCT 2856 Depth ddd; 2857 if(_slp->is_trace_alignment()) { 2858 print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump(); 2859 } 2860 #endif 2861 2862 if (_scale != 0) { 2863 #ifndef PRODUCT 2864 if(_slp->is_trace_alignment()) { 2865 print_depth(); tty->print_cr("SWPointer::scaled_iv: _scale (%d) != 0", _scale); 2866 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is NOT scaled_iv", n->_idx); 2867 } 2868 #endif 2869 return _slp->do_vector_loop()? true: false; // already found a scale 2870 } 2871 2872 if (n == iv()) { 2873 _scale = 1; 2874 #ifndef PRODUCT 2875 if(_slp->is_trace_alignment()) { 2876 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, _scale); 2877 } 2878 #endif 2879 return true; 2880 } 2881 2882 int opc = n->Opcode(); 2883 if (opc == Op_MulI) { 2884 if (n->in(1) == iv() && n->in(2)->is_Con()) { 2885 _scale = n->in(2)->get_int(); 2886 #ifndef PRODUCT 2887 if(_slp->is_trace_alignment()) { 2888 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump(); 2889 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); 2890 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI is scaled_iv, setting _scale = %d", n->_idx, _scale); 2891 } 2892 #endif 2893 return true; 2894 } else if (n->in(2) == iv() && n->in(1)->is_Con()) { 2895 _scale = n->in(1)->get_int(); 2896 #ifndef PRODUCT 2897 if(_slp->is_trace_alignment()) { 2898 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(2) is iv: ", 
n->in(2)->_idx); n->in(2)->dump(); 2899 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); 2900 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI is scaled_iv, setting _scale = %d", n->_idx, _scale); 2901 } 2902 #endif 2903 return true; 2904 } 2905 } else if (opc == Op_LShiftI) { 2906 if (n->in(1) == iv() && n->in(2)->is_Con()) { 2907 _scale = 1 << n->in(2)->get_int(); 2908 #ifndef PRODUCT 2909 if(_slp->is_trace_alignment()) { 2910 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump(); 2911 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); 2912 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI is scaled_iv, setting _scale = %d", n->_idx, _scale); 2913 } 2914 #endif 2915 return true; 2916 } 2917 } else if (opc == Op_ConvI2L) { 2918 if (scaled_iv_plus_offset(n->in(1))) { 2919 #ifndef PRODUCT 2920 if(_slp->is_trace_alignment()) { 2921 print_depth(); tty->print(" %d SWPointer::scaled_iv: in(1) is scaled_iv_plus_offset: ", n->in(1)->_idx); n->in(1)->dump(); 2922 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L is scaled_iv", n->_idx); 2923 } 2924 #endif 2925 return true; 2926 } 2927 } else if (opc == Op_LShiftL) { 2928 if (!has_iv() && _invar == NULL) { 2929 // Need to preserve the current _offset value, so 2930 // create a temporary object for this expression subtree. 2931 // Hacky, so should re-engineer the address pattern match. 
2932 NOT_PRODUCT(Depth dddd;) 2933 SWPointer tmp(this); 2934 #ifndef PRODUCT 2935 if(_slp->is_trace_alignment()) { 2936 print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp.print(); 2937 } 2938 #endif 2939 if (tmp.scaled_iv_plus_offset(n->in(1))) { 2940 if (tmp._invar == NULL || _slp->do_vector_loop()) { //I do not know, why tmp._invar == NULL was here at first hand 2941 int mult = 1 << n->in(2)->get_int(); 2942 _scale = tmp._scale * mult; 2943 _offset += tmp._offset * mult; 2944 #ifndef PRODUCT 2945 if(_slp->is_trace_alignment()) { 2946 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL is scaled_iv, setting _scale = %d, _offset = %d", n->_idx, _scale, _offset); 2947 } 2948 #endif 2949 return true; 2950 } 2951 } 2952 } 2953 } 2954 #ifndef PRODUCT 2955 if(_slp->is_trace_alignment()) { 2956 print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is NOT scaled_iv", n->_idx); 2957 } 2958 #endif 2959 return false; 2960 } 2961 2962 //----------------------------offset_plus_k------------------------ 2963 // Match: offset is (k [+/- invariant]) 2964 // where k maybe zero and invariant is optional, but not both. 2965 bool SWPointer::offset_plus_k(Node* n, bool negate) { 2966 #ifndef PRODUCT 2967 Depth ddd; 2968 if(_slp->is_trace_alignment()) { 2969 print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump(); 2970 } 2971 #endif 2972 int opc = n->Opcode(); 2973 if (opc == Op_ConI) { 2974 _offset += negate ? 
-(n->get_int()) : n->get_int(); 2975 #ifndef PRODUCT 2976 if(_slp->is_trace_alignment()) { 2977 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI is offset_plus_k, setting _offset = %d", n->_idx, _offset); 2978 } 2979 #endif 2980 return true; 2981 } else if (opc == Op_ConL) { 2982 // Okay if value fits into an int 2983 const TypeLong* t = n->find_long_type(); 2984 if (t->higher_equal(TypeLong::INT)) { 2985 jlong loff = n->get_long(); 2986 jint off = (jint)loff; 2987 _offset += negate ? -off : loff; 2988 #ifndef PRODUCT 2989 if(_slp->is_trace_alignment()) { 2990 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL is offset_plus_k, setting _offset = %d", n->_idx, _offset); 2991 } 2992 #endif 2993 return true; 2994 } 2995 #ifndef PRODUCT 2996 if(_slp->is_trace_alignment()) { 2997 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL is NOT offset_plus_k, k is too big", n->_idx); 2998 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: is NOT offset_plus_k", n->_idx); 2999 } 3000 #endif 3001 return false; 3002 } 3003 if (_invar != NULL) { 3004 #ifndef PRODUCT 3005 if(_slp->is_trace_alignment()) { 3006 print_depth(); tty->print(" %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump(); 3007 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: is NOT offset_plus_k", n->_idx); 3008 } 3009 #endif 3010 return _slp->do_vector_loop()? true: false; // already have an invariant 3011 } 3012 if (opc == Op_AddI) { 3013 if (n->in(2)->is_Con() && invariant(n->in(1))) { 3014 _negate_invar = negate; 3015 _invar = n->in(1); 3016 _offset += negate ? 
-(n->in(2)->get_int()) : n->in(2)->get_int(); 3017 #ifndef PRODUCT 3018 if(_slp->is_trace_alignment()) { 3019 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); 3020 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(1) is invariant: ", n->in(1)->_idx); n->in(1)->dump(); 3021 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI is offset_plus_k, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); 3022 } 3023 #endif 3024 return true; 3025 } else if (n->in(1)->is_Con() && invariant(n->in(2))) { 3026 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int(); 3027 _negate_invar = negate; 3028 _invar = n->in(2); 3029 #ifndef PRODUCT 3030 if(_slp->is_trace_alignment()) { 3031 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); 3032 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(2) is invariant: ", n->in(2)->_idx); n->in(2)->dump(); 3033 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI is offset_plus_k, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); 3034 } 3035 #endif 3036 return true; 3037 } 3038 } 3039 if (opc == Op_SubI) { 3040 if (n->in(2)->is_Con() && invariant(n->in(1))) { 3041 _negate_invar = negate; 3042 _invar = n->in(1); 3043 _offset += !negate ? 
-(n->in(2)->get_int()) : n->in(2)->get_int(); 3044 #ifndef PRODUCT 3045 if(_slp->is_trace_alignment()) { 3046 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); 3047 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(1) is invariant: ", n->in(1)->_idx); n->in(1)->dump(); 3048 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is offset_plus_k, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); 3049 } 3050 #endif 3051 return true; 3052 } else if (n->in(1)->is_Con() && invariant(n->in(2))) { 3053 _offset += negate ? -(n->in(1)->get_int()) : n->in(1)->get_int(); 3054 _negate_invar = !negate; 3055 _invar = n->in(2); 3056 #ifndef PRODUCT 3057 if(_slp->is_trace_alignment()) { 3058 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); 3059 print_depth(); tty->print(" %d SWPointer::offset_plus_k: in(2) is invariant: ", n->in(2)->_idx); n->in(2)->dump(); 3060 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is offset_plus_k, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); 3061 } 3062 #endif 3063 return true; 3064 } 3065 } 3066 if (invariant(n)) { 3067 _negate_invar = negate; 3068 _invar = n; 3069 #ifndef PRODUCT 3070 if(_slp->is_trace_alignment()) { 3071 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: n is invariant", n->_idx); 3072 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: n is offset_plus_k, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); 3073 } 3074 #endif 3075 return true; 3076 } 3077 3078 #ifndef PRODUCT 3079 if(_slp->is_trace_alignment()) { 3080 print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: is NOT offset_plus_k", n->_idx); 3081 } 3082 #endif 3083 return false; 3084 } 3085 3086 
//----------------------------print------------------------ 3087 void SWPointer::print() { 3088 #ifndef PRODUCT 3089 tty->print("base: %d adr: %d scale: %d offset: %d invar: %c%d\n", 3090 _base != NULL ? _base->_idx : 0, 3091 _adr != NULL ? _adr->_idx : 0, 3092 _scale, _offset, 3093 _negate_invar?'-':'+', 3094 _invar != NULL ? _invar->_idx : 0); 3095 #endif 3096 } 3097 3098 #ifndef PRODUCT 3099 void SWPointer::print_depth() { 3100 for (int ii = 0; ii<_depth; ++ii) tty->print(" "); 3101 } 3102 #endif 3103 3104 // ========================= OrderedPair ===================== 3105 3106 const OrderedPair OrderedPair::initial; 3107 3108 // ========================= SWNodeInfo ===================== 3109 3110 const SWNodeInfo SWNodeInfo::initial; 3111 3112 3113 // ============================ DepGraph =========================== 3114 3115 //------------------------------make_node--------------------------- 3116 // Make a new dependence graph node for an ideal node. 3117 DepMem* DepGraph::make_node(Node* node) { 3118 DepMem* m = new (_arena) DepMem(node); 3119 if (node != NULL) { 3120 assert(_map.at_grow(node->_idx) == NULL, "one init only"); 3121 _map.at_put_grow(node->_idx, m); 3122 } 3123 return m; 3124 } 3125 3126 //------------------------------make_edge--------------------------- 3127 // Make a new dependence graph edge from dpred -> dsucc 3128 DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) { 3129 DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head()); 3130 dpred->set_out_head(e); 3131 dsucc->set_in_head(e); 3132 return e; 3133 } 3134 3135 // ========================== DepMem ======================== 3136 3137 //------------------------------in_cnt--------------------------- 3138 int DepMem::in_cnt() { 3139 int ct = 0; 3140 for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++; 3141 return ct; 3142 } 3143 3144 //------------------------------out_cnt--------------------------- 3145 int DepMem::out_cnt() { 3146 int ct = 
0; 3147 for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++; 3148 return ct; 3149 } 3150 3151 //------------------------------print----------------------------- 3152 void DepMem::print() { 3153 #ifndef PRODUCT 3154 tty->print(" DepNode %d (", _node->_idx); 3155 for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) { 3156 Node* pred = p->pred()->node(); 3157 tty->print(" %d", pred != NULL ? pred->_idx : 0); 3158 } 3159 tty->print(") ["); 3160 for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) { 3161 Node* succ = s->succ()->node(); 3162 tty->print(" %d", succ != NULL ? succ->_idx : 0); 3163 } 3164 tty->print_cr(" ]"); 3165 #endif 3166 } 3167 3168 // =========================== DepEdge ========================= 3169 3170 //------------------------------DepPreds--------------------------- 3171 void DepEdge::print() { 3172 #ifndef PRODUCT 3173 tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx); 3174 #endif 3175 } 3176 3177 // =========================== DepPreds ========================= 3178 // Iterator over predecessor edges in the dependence graph. 
3179 3180 //------------------------------DepPreds--------------------------- 3181 DepPreds::DepPreds(Node* n, DepGraph& dg) { 3182 _n = n; 3183 _done = false; 3184 if (_n->is_Store() || _n->is_Load()) { 3185 _next_idx = MemNode::Address; 3186 _end_idx = n->req(); 3187 _dep_next = dg.dep(_n)->in_head(); 3188 } else if (_n->is_Mem()) { 3189 _next_idx = 0; 3190 _end_idx = 0; 3191 _dep_next = dg.dep(_n)->in_head(); 3192 } else { 3193 _next_idx = 1; 3194 _end_idx = _n->req(); 3195 _dep_next = NULL; 3196 } 3197 next(); 3198 } 3199 3200 //------------------------------next--------------------------- 3201 void DepPreds::next() { 3202 if (_dep_next != NULL) { 3203 _current = _dep_next->pred()->node(); 3204 _dep_next = _dep_next->next_in(); 3205 } else if (_next_idx < _end_idx) { 3206 _current = _n->in(_next_idx++); 3207 } else { 3208 _done = true; 3209 } 3210 } 3211 3212 // =========================== DepSuccs ========================= 3213 // Iterator over successor edges in the dependence graph. 
3214 3215 //------------------------------DepSuccs--------------------------- 3216 DepSuccs::DepSuccs(Node* n, DepGraph& dg) { 3217 _n = n; 3218 _done = false; 3219 if (_n->is_Load()) { 3220 _next_idx = 0; 3221 _end_idx = _n->outcnt(); 3222 _dep_next = dg.dep(_n)->out_head(); 3223 } else if (_n->is_Mem() || _n->is_Phi() && _n->bottom_type() == Type::MEMORY) { 3224 _next_idx = 0; 3225 _end_idx = 0; 3226 _dep_next = dg.dep(_n)->out_head(); 3227 } else { 3228 _next_idx = 0; 3229 _end_idx = _n->outcnt(); 3230 _dep_next = NULL; 3231 } 3232 next(); 3233 } 3234 3235 //-------------------------------next--------------------------- 3236 void DepSuccs::next() { 3237 if (_dep_next != NULL) { 3238 _current = _dep_next->succ()->node(); 3239 _dep_next = _dep_next->next_out(); 3240 } else if (_next_idx < _end_idx) { 3241 _current = _n->raw_out(_next_idx++); 3242 } else { 3243 _done = true; 3244 } 3245 } 3246 3247 // 3248 // --------------------------------- vectorization/simd ----------------------------------- 3249 // 3250 bool SuperWord::same_origin_idx(Node* a, Node* b) const { 3251 return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx); 3252 } 3253 bool SuperWord::same_generation(Node* a, Node* b) const { 3254 return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx); 3255 } 3256 3257 Node* SuperWord::find_phi_for_mem_dep(LoadNode* ld) { 3258 assert(in_bb(ld), "must be in block"); 3259 if (_clone_map.gen(ld->_idx) == _ii_first) { 3260 #ifndef PRODUCT 3261 if (_vector_loop_debug) { 3262 tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d", 3263 _clone_map.gen(ld->_idx)); 3264 } 3265 #endif 3266 return NULL; //we think that any ld in the first gen being vectorizable 3267 } 3268 3269 Node* mem = ld->in(MemNode::Memory); 3270 if (mem->outcnt() <= 1) { 3271 // we don't want to remove the only edge from mem node to load 3272 #ifndef PRODUCT 3273 if (_vector_loop_debug) { 3274 tty->print_cr("SuperWord::find_phi_for_mem_dep input 
node %d to load %d has no other outputs and edge mem->load cannot be removed", 3275 mem->_idx, ld->_idx); 3276 ld->dump(); 3277 mem->dump(); 3278 } 3279 #endif 3280 return NULL; 3281 } 3282 if (!in_bb(mem) || same_generation(mem, ld)) { 3283 #ifndef PRODUCT 3284 if (_vector_loop_debug) { 3285 tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d", 3286 _clone_map.gen(mem->_idx)); 3287 } 3288 #endif 3289 return NULL; // does not depend on loop volatile node or depends on the same generation 3290 } 3291 3292 //otherwise first node should depend on mem-phi 3293 Node* first = first_node(ld); 3294 assert(first->is_Load(), "must be Load"); 3295 Node* phi = first->as_Load()->in(MemNode::Memory); 3296 if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) { 3297 #ifndef PRODUCT 3298 if (_vector_loop_debug) { 3299 tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi"); 3300 ld->dump(); 3301 first->dump(); 3302 } 3303 #endif 3304 return NULL; 3305 } 3306 3307 Node* tail = 0; 3308 for (int m = 0; m < _mem_slice_head.length(); m++) { 3309 if (_mem_slice_head.at(m) == phi) { 3310 tail = _mem_slice_tail.at(m); 3311 } 3312 } 3313 if (tail == 0) { //test that found phi is in the list _mem_slice_head 3314 #ifndef PRODUCT 3315 if (_vector_loop_debug) { 3316 tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head", 3317 ld->_idx, phi->_idx); 3318 ld->dump(); 3319 phi->dump(); 3320 } 3321 #endif 3322 return NULL; 3323 } 3324 3325 // now all conditions are met 3326 return phi; 3327 } 3328 3329 Node* SuperWord::first_node(Node* nd) { 3330 for (int ii = 0; ii < _iteration_first.length(); ii++) { 3331 Node* nnn = _iteration_first.at(ii); 3332 if (same_origin_idx(nnn, nd)) { 3333 #ifndef PRODUCT 3334 if (_vector_loop_debug) { 3335 tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) 
= %d)", 3336 nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx)); 3337 } 3338 #endif 3339 return nnn; 3340 } 3341 } 3342 3343 #ifndef PRODUCT 3344 if (_vector_loop_debug) { 3345 tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)", 3346 nd->_idx, _clone_map.idx(nd->_idx)); 3347 } 3348 #endif 3349 return 0; 3350 } 3351 3352 Node* SuperWord::last_node(Node* nd) { 3353 for (int ii = 0; ii < _iteration_last.length(); ii++) { 3354 Node* nnn = _iteration_last.at(ii); 3355 if (same_origin_idx(nnn, nd)) { 3356 #ifndef PRODUCT 3357 if (_vector_loop_debug) { 3358 tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d", 3359 _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx)); 3360 } 3361 #endif 3362 return nnn; 3363 } 3364 } 3365 return 0; 3366 } 3367 3368 int SuperWord::mark_generations() { 3369 Node *ii_err = 0, *tail_err; 3370 for (int i = 0; i < _mem_slice_head.length(); i++) { 3371 Node* phi = _mem_slice_head.at(i); 3372 assert(phi->is_Phi(), "must be phi"); 3373 3374 Node* tail = _mem_slice_tail.at(i); 3375 if (_ii_last == -1) { 3376 tail_err = tail; 3377 _ii_last = _clone_map.gen(tail->_idx); 3378 } 3379 else if (_ii_last != _clone_map.gen(tail->_idx)) { 3380 #ifndef PRODUCT 3381 if (TraceSuperWord && Verbose) { 3382 tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes "); 3383 tail->dump(); 3384 tail_err->dump(); 3385 } 3386 #endif 3387 return -1; 3388 } 3389 3390 // find first iteration in the loop 3391 for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) { 3392 Node* ii = phi->fast_out(i); 3393 if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi 3394 if (_ii_first == -1) { 3395 ii_err = ii; 3396 _ii_first = _clone_map.gen(ii->_idx); 3397 } else if (_ii_first != _clone_map.gen(ii->_idx)) { 3398 #ifndef PRODUCT 3399 if (TraceSuperWord 
&& Verbose) { 3400 tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first); 3401 ii->dump(); 3402 if (ii_err!= 0) { 3403 ii_err->dump(); 3404 } 3405 } 3406 #endif 3407 return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized 3408 } 3409 } 3410 }//for (DUIterator_Fast imax, 3411 }//for (int i... 3412 3413 if (_ii_first == -1 || _ii_last == -1) { 3414 #ifndef PRODUCT 3415 if (TraceSuperWord && Verbose) { 3416 tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong"); 3417 } 3418 #endif 3419 return -1; // something vent wrong 3420 } 3421 // collect nodes in the first and last generations 3422 assert(_iteration_first.length() == 0, "_iteration_first must be empty"); 3423 assert(_iteration_last.length() == 0, "_iteration_last must be empty"); 3424 for (int j = 0; j < _block.length(); j++) { 3425 Node* n = _block.at(j); 3426 node_idx_t gen = _clone_map.gen(n->_idx); 3427 if ((signed)gen == _ii_first) { 3428 _iteration_first.push(n); 3429 } else if ((signed)gen == _ii_last) { 3430 _iteration_last.push(n); 3431 } 3432 } 3433 3434 // building order of iterations 3435 if (_ii_order.length() == 0 && ii_err != 0) { 3436 assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb"); 3437 Node* nd = ii_err; 3438 while(_clone_map.gen(nd->_idx) != _ii_last) { 3439 _ii_order.push(_clone_map.gen(nd->_idx)); 3440 bool found = false; 3441 for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) { 3442 Node* use = nd->fast_out(i); 3443 if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) { 3444 found = true; 3445 nd = use; 3446 break; 3447 } 3448 }//for 3449 3450 if (found == false) { 3451 #ifndef PRODUCT 3452 if (TraceSuperWord && Verbose) { 3453 tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx); 3454 } 3455 #endif 3456 _ii_order.clear(); 3457 
return -1; 3458 } 3459 } //while 3460 _ii_order.push(_clone_map.gen(nd->_idx)); 3461 } 3462 3463 #ifndef PRODUCT 3464 if (_vector_loop_debug) { 3465 tty->print_cr("SuperWord::mark_generations"); 3466 tty->print_cr("First generation (%d) nodes:", _ii_first); 3467 for (int ii = 0; ii < _iteration_first.length(); ii++) _iteration_first.at(ii)->dump(); 3468 tty->print_cr("Last generation (%d) nodes:", _ii_last); 3469 for (int ii = 0; ii < _iteration_last.length(); ii++) _iteration_last.at(ii)->dump(); 3470 tty->print_cr(" "); 3471 3472 tty->print("SuperWord::List of generations: "); 3473 for (int jj = 0; jj < _ii_order.length(); ++jj) { 3474 tty->print("%d:%d ", jj, _ii_order.at(jj)); 3475 } 3476 tty->print_cr(" "); 3477 } 3478 #endif 3479 3480 return _ii_first; 3481 } 3482 3483 bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) { 3484 assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes"); 3485 assert(same_origin_idx(gold, fix), "should be clones of the same node"); 3486 Node* gin1 = gold->in(1); 3487 Node* gin2 = gold->in(2); 3488 Node* fin1 = fix->in(1); 3489 Node* fin2 = fix->in(2); 3490 bool swapped = false; 3491 3492 if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) { 3493 if (same_origin_idx(gin1, fin1) && 3494 same_origin_idx(gin2, fin2)) { 3495 return true; // nothing to fix 3496 } 3497 if (same_origin_idx(gin1, fin2) && 3498 same_origin_idx(gin2, fin1)) { 3499 fix->swap_edges(1, 2); 3500 swapped = true; 3501 } 3502 } 3503 // at least one input comes from outside of bb 3504 if (gin1->_idx == fin1->_idx) { 3505 return true; // nothing to fix 3506 } 3507 if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx)) { //swapping is expensive, check condition first 3508 fix->swap_edges(1, 2); 3509 swapped = true; 3510 } 3511 3512 if (swapped) { 3513 #ifndef PRODUCT 3514 if (_vector_loop_debug) { 3515 tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx); 
3516 } 3517 #endif 3518 return true; 3519 } 3520 3521 #ifndef PRODUCT 3522 if (TraceSuperWord && Verbose) { 3523 tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx); 3524 } 3525 #endif 3526 return false; 3527 } 3528 3529 bool SuperWord::pack_parallel() { 3530 #ifndef PRODUCT 3531 if (_vector_loop_debug) { 3532 tty->print_cr("SuperWord::pack_parallel: START"); 3533 } 3534 #endif 3535 3536 _packset.clear(); 3537 3538 for (int ii = 0; ii < _iteration_first.length(); ii++) { 3539 Node* nd = _iteration_first.at(ii); 3540 if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) { 3541 Node_List* pk = new Node_List(); 3542 pk->push(nd); 3543 for (int gen = 1; gen < _ii_order.length(); ++gen) { 3544 for (int kk = 0; kk < _block.length(); kk++) { 3545 Node* clone = _block.at(kk); 3546 if (same_origin_idx(clone, nd) && 3547 _clone_map.gen(clone->_idx) == _ii_order.at(gen)) { 3548 if (nd->is_Add() || nd->is_Mul()) { 3549 fix_commutative_inputs(nd, clone); 3550 } 3551 pk->push(clone); 3552 if (pk->size() == 4) { 3553 _packset.append(pk); 3554 #ifndef PRODUCT 3555 if (_vector_loop_debug) { 3556 tty->print_cr("SuperWord::pack_parallel: added pack "); 3557 pk->dump(); 3558 } 3559 #endif 3560 if (_clone_map.gen(clone->_idx) != _ii_last) { 3561 pk = new Node_List(); 3562 } 3563 } 3564 break; 3565 } 3566 } 3567 }//for 3568 }//if 3569 }//for 3570 3571 #ifndef PRODUCT 3572 if (_vector_loop_debug) { 3573 tty->print_cr("SuperWord::pack_parallel: END"); 3574 } 3575 #endif 3576 3577 return true; 3578 } 3579 3580 bool SuperWord::hoist_loads_in_graph() { 3581 GrowableArray<Node*> loads; 3582 3583 #ifndef PRODUCT 3584 if (_vector_loop_debug) { 3585 tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length()); 3586 } 3587 #endif 3588 3589 for (int i = 0; i < _mem_slice_head.length(); i++) { 3590 Node* n = _mem_slice_head.at(i); 3591 if ( !in_bb(n) || !n->is_Phi() || 
n->bottom_type() != Type::MEMORY) { 3592 #ifndef PRODUCT 3593 if (TraceSuperWord && Verbose) { 3594 tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx); 3595 } 3596 #endif 3597 continue; 3598 } 3599 3600 #ifndef PRODUCT 3601 if (_vector_loop_debug) { 3602 tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d = _mem_slice_head.at(%d);", n->_idx, i); 3603 } 3604 #endif 3605 3606 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { 3607 Node* ld = n->fast_out(i); 3608 if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) { 3609 for (int i = 0; i < _block.length(); i++) { 3610 Node* ld2 = _block.at(i); 3611 if (ld2->is_Load() && same_origin_idx(ld, ld2) && 3612 !same_generation(ld, ld2)) { // <= do not collect the first generation ld 3613 #ifndef PRODUCT 3614 if (_vector_loop_debug) { 3615 tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)", 3616 ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx); 3617 } 3618 #endif 3619 // could not do on-the-fly, since iterator is immutable 3620 loads.push(ld2); 3621 } 3622 }// for 3623 }//if 3624 }//for (DUIterator_Fast imax, 3625 }//for (int i = 0; i 3626 3627 for (int i = 0; i < loads.length(); i++) { 3628 LoadNode* ld = loads.at(i)->as_Load(); 3629 Node* phi = find_phi_for_mem_dep(ld); 3630 if (phi != NULL) { 3631 #ifndef PRODUCT 3632 if (_vector_loop_debug) { 3633 tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d", 3634 MemNode::Memory, ld->_idx, phi->_idx); 3635 } 3636 #endif 3637 _igvn.replace_input_of(ld, MemNode::Memory, phi); 3638 } 3639 }//for 3640 3641 restart(); // invalidate all basic structures, since we rebuilt the graph 3642 3643 #ifndef PRODUCT 3644 if (TraceSuperWord && Verbose) { 3645 tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild"); 3646 } 3647 
#endif 3648 return true; 3649 } 3650