< prev index next >

src/share/vm/opto/superword.cpp

Print this page

        

*** 49,58 **** --- 49,59 ---- _igvn(phase->_igvn), _arena(phase->C->comp_arena()), _packset(arena(), 8, 0, NULL), // packs for the current block _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb _block(arena(), 8, 0, NULL), // nodes in current block + _post_block(arena(), 8, 0, NULL), // nodes common to current block which are marked as post loop vectorizable _data_entry(arena(), 8, 0, NULL), // nodes with all inputs from outside _mem_slice_head(arena(), 8, 0, NULL), // memory slice heads _mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails _node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node _clone_map(phase->C->clone_map()), // map of nodes created in cloning
*** 97,110 **** assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop ! if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops // Check for no control flow in body (other than exit) Node *cl_exit = cl->loopexit(); ! if (cl_exit->in(0) != lpt->_head) { #ifndef PRODUCT if (TraceSuperWord) { tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head"); tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump(); tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump(); --- 98,131 ---- assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop ! bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()); ! if (post_loop_allowed) { ! if (cl->is_reduction_loop()) return; // no predication mapping ! Node *limit = cl->limit(); ! if (limit->is_Con()) return; // non constant limits only ! // Now check the limit for expressions we do not handle ! if (limit->is_Add()) { ! Node *in2 = limit->in(2); ! if (in2->is_Con()) { ! int val = in2->get_int(); ! // should not try to program these cases ! if (val < 0) return; ! } ! } ! } ! ! // skip any loop that has not been assigned max unroll by analysis ! if (do_optimization) { ! if (cl->slp_max_unroll() == 0) return; ! } ! // Check for no control flow in body (other than exit) Node *cl_exit = cl->loopexit(); ! if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) { #ifndef PRODUCT if (TraceSuperWord) { tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head"); tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump(); tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
*** 118,136 **** // Make sure the are no extra control users of the loop backedge if (cl->back_control()->outcnt() != 1) { return; } ! // We only re-enter slp when we vector mapped a queried loop and we want to ! // continue unrolling, in this case, slp is not subsequently done. ! if (cl->do_unroll_only()) return; // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit)))) CountedLoopEndNode* pre_end = get_pre_loop_end(cl); if (pre_end == NULL) return; Node *pre_opaq1 = pre_end->limit(); if (pre_opaq1->Opcode() != Op_Opaque1) return; init(); // initialize data structures set_lpt(lpt); set_lp(cl); --- 139,158 ---- // Make sure the are no extra control users of the loop backedge if (cl->back_control()->outcnt() != 1) { return; } ! // Skip any loops already optimized by slp ! if (cl->is_vectorized_loop()) return; + if (cl->is_main_loop()) { // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit)))) CountedLoopEndNode* pre_end = get_pre_loop_end(cl); if (pre_end == NULL) return; Node *pre_opaq1 = pre_end->limit(); if (pre_opaq1->Opcode() != Op_Opaque1) return; + } init(); // initialize data structures set_lpt(lpt); set_lp(cl);
*** 139,148 **** --- 161,183 ---- set_bb(cl); if (do_optimization) { assert(_packset.length() == 0, "packset must be empty"); SLP_extract(); + if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) { + if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) { + IdealLoopTree *lpt_next = lpt->_next; + CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop(); + _phase->has_range_checks(lpt_next); + if (cl_next->is_post_loop() && !cl_next->range_checks_present()) { + if (!cl_next->is_vectorized_loop()) { + int slp_max_unroll_factor = cl->slp_max_unroll(); + cl_next->set_slp_max_unroll(slp_max_unroll_factor); + } + } + } + } } } //------------------------------early unrolling analysis------------------------------ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
*** 151,167 **** --- 186,206 ---- size_t ignored_size = lpt()->_body.size(); int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size); Node_Stack nstack((int)ignored_size); CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); Node *cl_exit = cl->loopexit(); + int rpo_idx = _post_block.length(); + + assert(rpo_idx == 0, "post loop block is empty"); // First clear the entries for (uint i = 0; i < lpt()->_body.size(); i++) { ignored_loop_nodes[i] = -1; } int max_vector = Matcher::max_vector_size(T_INT); + bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()); // Process the loop, some/all of the stack entries will not be in order, ergo // need to preprocess the ignored initial state before we process the loop for (uint i = 0; i < lpt()->_body.size(); i++) { Node* n = lpt()->_body.at(i);
*** 256,275 **** --- 295,333 ---- } if (is_slp) { // Now we try to find the maximum supported consistent vector which the machine // description can use + bool small_basic_type = false; for (uint i = 0; i < lpt()->_body.size(); i++) { if (ignored_loop_nodes[i] != -1) continue; BasicType bt; Node* n = lpt()->_body.at(i); if (n->is_Mem()) { bt = n->as_Mem()->memory_type(); } else { bt = n->bottom_type()->basic_type(); } + + if (post_loop_allowed) { + if (!small_basic_type) { + switch (bt) { + case T_CHAR: + case T_BYTE: + case T_SHORT: + small_basic_type = true; + break; + case T_LONG: + // TODO: Remove when support completed for mask context with LONG. + // The ad files need to be augmented. + small_basic_type = true; + break; + } + } + } + if (is_java_primitive(bt) == false) continue; int cur_max_vector = Matcher::max_vector_size(bt); // If a max vector exists which is not larger than _local_loop_unroll_factor
*** 285,303 **** --- 343,374 ---- // Map the maximal common vector if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) { if (cur_max_vector < max_vector) { max_vector = cur_max_vector; } + + // We only process post loops on predicated targets where we want to + // mask map the loop to a single iteration + if (post_loop_allowed) { + _post_block.at_put_grow(rpo_idx++, n); + } } } if (is_slp) { local_loop_unroll_factor = max_vector; cl->mark_passed_slp(); } cl->mark_was_slp(); + if (cl->is_main_loop()) { + cl->set_slp_max_unroll(local_loop_unroll_factor); + } else if (post_loop_allowed) { + if (!small_basic_type) { + // avoid replication context for small basic types in programmable masked loops cl->set_slp_max_unroll(local_loop_unroll_factor); } + } + } } //------------------------------SLP_extract--------------------------- // Extract the superword level parallelism //
*** 347,362 **** --- 418,437 ---- #endif // Ready the block if (!construct_bb()) { return; // Exit if no interesting nodes or complex graph. } + // build _dg, _disjoint_ptrs dependence_graph(); // compute function depth(Node*) compute_max_depth(); + CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()); + if (cl->is_main_loop()) { if (_do_vector_loop) { if (mark_generations() != -1) { hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly if (!construct_bb()) {
*** 407,416 **** --- 482,524 ---- } filter_packs(); schedule(); + } else if (post_loop_allowed) { + int saved_mapped_unroll_factor = cl->slp_max_unroll(); + if (saved_mapped_unroll_factor) { + int vector_mapped_unroll_factor = saved_mapped_unroll_factor; + + // now reset the slp_unroll_factor so that we can check the analysis mapped + // what the vector loop was mapped to + cl->set_slp_max_unroll(0); + + // do the analysis on the post loop + unrolling_analysis(vector_mapped_unroll_factor); + + // if our analyzed loop is a canonical fit, start processing it + if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) { + // now add the vector nodes to packsets + for (int i = 0; i < _post_block.length(); i++) { + Node* n = _post_block.at(i); + Node_List* singleton = new Node_List(); + singleton->push(n); + _packset.append(singleton); + set_my_pack(n, singleton); + } + + // map base types for vector usage + compute_vector_element_type(); + } else { + return; + } + } else { + // for some reason we could not map the slp analysis state of the vectorized loop + return; + } + } output(); } //------------------------------find_adjacent_refs---------------------------
*** 808,817 **** --- 916,926 ---- //---------------------------dependence_graph--------------------------- // Construct dependency graph. // Add dependence edges to load/store nodes for memory dependence // A.out()->DependNode.in(1) and DependNode.out()->B.prec(x) void SuperWord::dependence_graph() { + CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); // First, assign a dependence node to each memory node for (int i = 0; i < _block.length(); i++ ) { Node *n = _block.at(i); if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) { _dg.make_node(n);
*** 822,832 **** --- 931,943 ---- for (int i = 0; i < _mem_slice_head.length(); i++) { Node* n = _mem_slice_head.at(i); Node* n_tail = _mem_slice_tail.at(i); // Get slice in predecessor order (last is first) + if (cl->is_main_loop()) { mem_slice_preds(n_tail, n, _nlist); + } #ifndef PRODUCT if(TraceSuperWord && Verbose) { tty->print_cr("SuperWord::dependence_graph: built a new mem slice"); for (int j = _nlist.length() - 1; j >= 0 ; j--) {
*** 2026,2049 **** tty->print("SuperWord::output "); lpt()->dump_head(); } #endif // MUST ENSURE main loop's initial value is properly aligned: // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 align_initial_loop_index(align_to_ref()); // Insert extract (unpack) operations for scalar uses for (int i = 0; i < _packset.length(); i++) { insert_extracts(_packset.at(i)); } Compile* C = _phase->C; - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); uint max_vlen_in_bytes = 0; uint max_vlen = 0; NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);}) CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy()); --- 2137,2163 ---- tty->print("SuperWord::output "); lpt()->dump_head(); } #endif + CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + if (cl->is_main_loop()) { // MUST ENSURE main loop's initial value is properly aligned: // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 align_initial_loop_index(align_to_ref()); // Insert extract (unpack) operations for scalar uses for (int i = 0; i < _packset.length(); i++) { insert_extracts(_packset.at(i)); } + } Compile* C = _phase->C; uint max_vlen_in_bytes = 0; uint max_vlen = 0; + bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()); NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);}) CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
*** 2061,2070 **** --- 2175,2188 ---- uint vlen = p->size(); uint vlen_in_bytes = 0; Node* vn = NULL; Node* low_adr = p->at(0); Node* first = executed_first(p); + if (can_process_post_loop) { + // override vlen with the main loop's vector length + vlen = cl->slp_max_unroll(); + } NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);}) int opc = n->Opcode(); if (n->is_Load()) { Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory);
*** 2150,2159 **** --- 2268,2281 ---- // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions) Node* in = vector_opd(p, 1); vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (is_cmov_pack(p)) { + if (can_process_post_loop) { + // do not refactor control flow in post loop context + return; + } if (!n->is_CMove()) { continue; } // place here CMoveVDNode NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
*** 2214,2231 **** --- 2336,2362 ---- return; //and reverse to backup IG } ShouldNotReachHere(); } + _block.at_put(i, vn); _igvn.register_new_node_with_optimizer(vn); _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0))); for (uint j = 0; j < p->size(); j++) { Node* pm = p->at(j); _igvn.replace_node(pm, vn); } _igvn._worklist.push(vn); + if (can_process_post_loop) { + // first check if the vector size is the maximum vector which we can use on the machine, + // other vector sizes have reduced values for predicated data mapping. + if (vlen_in_bytes != (uint)MaxVectorSize) { + return; + } + } + if (vlen_in_bytes > max_vlen_in_bytes) { max_vlen = vlen; max_vlen_in_bytes = vlen_in_bytes; } #ifdef ASSERT
*** 2244,2262 **** uint slp_max_unroll_factor = cl->slp_max_unroll(); if (slp_max_unroll_factor == max_vlen) { if (TraceSuperWordLoopUnrollAnalysis) { tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte); } ! // For atomic unrolled loops which are vector mapped, instigate more unrolling. cl->set_notpassed_slp(); ! // if vector resources are limited, do not allow additional unrolling if (FLOATPRESSURE > 8) { C->set_major_progress(); - } cl->mark_do_unroll_only(); if (do_reserve_copy()) { cl->mark_loop_vectorized(); } } } } --- 2375,2415 ---- uint slp_max_unroll_factor = cl->slp_max_unroll(); if (slp_max_unroll_factor == max_vlen) { if (TraceSuperWordLoopUnrollAnalysis) { tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte); } ! ! // For atomic unrolled loops which are vector mapped, instigate more unrolling cl->set_notpassed_slp(); ! if (cl->is_main_loop()) { ! // if vector resources are limited, do not allow additional unrolling, also ! // do not unroll more on pure vector loops which were not reduced so that we can ! // program the post loop to single iteration execution. if (FLOATPRESSURE > 8) { C->set_major_progress(); cl->mark_do_unroll_only(); + } + } + if (do_reserve_copy()) { cl->mark_loop_vectorized(); + if (can_process_post_loop) { + // Now create the difference of trip and limit and use it as our mask index. + // Note: We limited the unroll of the vectorized loop so that + // only vlen-1 size iterations can remain to be mask programmed. 
+ Node *incr = cl->incr(); + SubINode *index = new SubINode(cl->limit(), cl->init_trip()); + _igvn.register_new_node_with_optimizer(index); + MaskCreateINode *mask = new MaskCreateINode(_phase->get_ctrl(cl->init_trip()), index); + _igvn.register_new_node_with_optimizer(mask); + // make this a single iteration loop + AddINode *new_incr = new AddINode(incr->in(1), mask); + _igvn.register_new_node_with_optimizer(new_incr); + _phase->set_ctrl(new_incr, _phase->get_ctrl(incr)); + _igvn.replace_node(incr, new_incr); + cl->mark_is_multiversioned(); + } } } } }
*** 2271,2280 **** --- 2424,2439 ---- // Create a vector operand for the nodes in pack p for operand: in(opd_idx) Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { Node* p0 = p->at(0); uint vlen = p->size(); Node* opd = p0->in(opd_idx); + CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + + if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) { + // override vlen with the main loop's vector length + vlen = cl->slp_max_unroll(); + } if (same_inputs(p, opd_idx)) { if (opd->is_Vector() || opd->is_LoadVector()) { assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector"); if (opd_idx == 2 && VectorNode::is_shift(p0)) {
*** 3087,3103 **** CountedLoopNode* loop_node = pre_end->loopnode(); if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL; return pre_end; } - //------------------------------init--------------------------- void SuperWord::init() { _dg.init(); _packset.clear(); _disjoint_ptrs.clear(); _block.clear(); _data_entry.clear(); _mem_slice_head.clear(); _mem_slice_tail.clear(); _iteration_first.clear(); _iteration_last.clear(); --- 3246,3262 ---- CountedLoopNode* loop_node = pre_end->loopnode(); if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL; return pre_end; } //------------------------------init--------------------------- void SuperWord::init() { _dg.init(); _packset.clear(); _disjoint_ptrs.clear(); _block.clear(); + _post_block.clear(); _data_entry.clear(); _mem_slice_head.clear(); _mem_slice_tail.clear(); _iteration_first.clear(); _iteration_last.clear();
*** 3117,3126 **** --- 3276,3286 ---- void SuperWord::restart() { _dg.init(); _packset.clear(); _disjoint_ptrs.clear(); _block.clear(); + _post_block.clear(); _data_entry.clear(); _mem_slice_head.clear(); _mem_slice_tail.clear(); _node_info.clear(); }
< prev index next >