< prev index next >
src/share/vm/opto/superword.cpp
Print this page
*** 49,58 ****
--- 49,59 ----
_igvn(phase->_igvn),
_arena(phase->C->comp_arena()),
_packset(arena(), 8, 0, NULL), // packs for the current block
_bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
_block(arena(), 8, 0, NULL), // nodes in current block
+ _post_block(arena(), 8, 0, NULL), // nodes common to current block which are marked as post loop vectorizable
_data_entry(arena(), 8, 0, NULL), // nodes with all inputs from outside
_mem_slice_head(arena(), 8, 0, NULL), // memory slice heads
_mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails
_node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node
_clone_map(phase->C->clone_map()), // map of nodes created in cloning
*** 97,110 ****
assert(lpt->_head->is_CountedLoop(), "must be");
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
! if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
// Check for no control flow in body (other than exit)
Node *cl_exit = cl->loopexit();
! if (cl_exit->in(0) != lpt->_head) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
--- 98,131 ----
assert(lpt->_head->is_CountedLoop(), "must be");
CountedLoopNode *cl = lpt->_head->as_CountedLoop();
if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
! bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
! if (post_loop_allowed) {
! if (cl->is_reduction_loop()) return; // no predication mapping
! Node *limit = cl->limit();
! if (limit->is_Con()) return; // non constant limits only
! // Now check the limit for expressions we do not handle
! if (limit->is_Add()) {
! Node *in2 = limit->in(2);
! if (in2->is_Con()) {
! int val = in2->get_int();
! // should not try to program these cases
! if (val < 0) return;
! }
! }
! }
!
! // skip any loop that has not been assigned max unroll by analysis
! if (do_optimization) {
! if (cl->slp_max_unroll() == 0) return;
! }
!
// Check for no control flow in body (other than exit)
Node *cl_exit = cl->loopexit();
! if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
*** 118,136 ****
// Make sure the are no extra control users of the loop backedge
if (cl->back_control()->outcnt() != 1) {
return;
}
! // We only re-enter slp when we vector mapped a queried loop and we want to
! // continue unrolling, in this case, slp is not subsequently done.
! if (cl->do_unroll_only()) return;
// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
if (pre_end == NULL) return;
Node *pre_opaq1 = pre_end->limit();
if (pre_opaq1->Opcode() != Op_Opaque1) return;
init(); // initialize data structures
set_lpt(lpt);
set_lp(cl);
--- 139,158 ----
// Make sure the are no extra control users of the loop backedge
if (cl->back_control()->outcnt() != 1) {
return;
}
! // Skip any loops already optimized by slp
! if (cl->is_vectorized_loop()) return;
+ if (cl->is_main_loop()) {
// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
if (pre_end == NULL) return;
Node *pre_opaq1 = pre_end->limit();
if (pre_opaq1->Opcode() != Op_Opaque1) return;
+ }
init(); // initialize data structures
set_lpt(lpt);
set_lp(cl);
*** 139,148 ****
--- 161,183 ----
set_bb(cl);
if (do_optimization) {
assert(_packset.length() == 0, "packset must be empty");
SLP_extract();
+ if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
+ if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
+ IdealLoopTree *lpt_next = lpt->_next;
+ CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
+ _phase->has_range_checks(lpt_next);
+ if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
+ if (!cl_next->is_vectorized_loop()) {
+ int slp_max_unroll_factor = cl->slp_max_unroll();
+ cl_next->set_slp_max_unroll(slp_max_unroll_factor);
+ }
+ }
+ }
+ }
}
}
//------------------------------early unrolling analysis------------------------------
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
*** 151,167 ****
--- 186,206 ----
size_t ignored_size = lpt()->_body.size();
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
Node_Stack nstack((int)ignored_size);
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
Node *cl_exit = cl->loopexit();
+ int rpo_idx = _post_block.length();
+
+ assert(rpo_idx == 0, "post loop block is empty");
// First clear the entries
for (uint i = 0; i < lpt()->_body.size(); i++) {
ignored_loop_nodes[i] = -1;
}
int max_vector = Matcher::max_vector_size(T_INT);
+ bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
// Process the loop, some/all of the stack entries will not be in order, ergo
// need to preprocess the ignored initial state before we process the loop
for (uint i = 0; i < lpt()->_body.size(); i++) {
Node* n = lpt()->_body.at(i);
*** 256,275 ****
--- 295,333 ----
}
if (is_slp) {
// Now we try to find the maximum supported consistent vector which the machine
// description can use
+ bool small_basic_type = false;
for (uint i = 0; i < lpt()->_body.size(); i++) {
if (ignored_loop_nodes[i] != -1) continue;
BasicType bt;
Node* n = lpt()->_body.at(i);
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
bt = n->bottom_type()->basic_type();
}
+
+ if (post_loop_allowed) {
+ if (!small_basic_type) {
+ switch (bt) {
+ case T_CHAR:
+ case T_BYTE:
+ case T_SHORT:
+ small_basic_type = true;
+ break;
+ case T_LONG:
+ // TODO: Remove when support completed for mask context with LONG.
+ // The ad files need to be augmented.
+ small_basic_type = true;
+ break;
+ }
+ }
+ }
+
if (is_java_primitive(bt) == false) continue;
int cur_max_vector = Matcher::max_vector_size(bt);
// If a max vector exists which is not larger than _local_loop_unroll_factor
*** 285,303 ****
--- 343,374 ----
// Map the maximal common vector
if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
if (cur_max_vector < max_vector) {
max_vector = cur_max_vector;
}
+
+ // We only process post loops on predicated targets where we want to
+ // mask map the loop to a single iteration
+ if (post_loop_allowed) {
+ _post_block.at_put_grow(rpo_idx++, n);
+ }
}
}
if (is_slp) {
local_loop_unroll_factor = max_vector;
cl->mark_passed_slp();
}
cl->mark_was_slp();
+ if (cl->is_main_loop()) {
+ cl->set_slp_max_unroll(local_loop_unroll_factor);
+ } else if (post_loop_allowed) {
+ if (!small_basic_type) {
+ // avoid replication context for small basic types in programmable masked loops
cl->set_slp_max_unroll(local_loop_unroll_factor);
}
+ }
+ }
}
//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
*** 347,362 ****
--- 418,437 ----
#endif
// Ready the block
if (!construct_bb()) {
return; // Exit if no interesting nodes or complex graph.
}
+
// build _dg, _disjoint_ptrs
dependence_graph();
// compute function depth(Node*)
compute_max_depth();
+ CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
+ if (cl->is_main_loop()) {
if (_do_vector_loop) {
if (mark_generations() != -1) {
hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
if (!construct_bb()) {
*** 407,416 ****
--- 482,524 ----
}
filter_packs();
schedule();
+ } else if (post_loop_allowed) {
+ int saved_mapped_unroll_factor = cl->slp_max_unroll();
+ if (saved_mapped_unroll_factor) {
+ int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
+
+ // now reset the slp_unroll_factor so that we can check that the analysis maps
+ // the post loop to what the vector loop was mapped to
+ cl->set_slp_max_unroll(0);
+
+ // do the analysis on the post loop
+ unrolling_analysis(vector_mapped_unroll_factor);
+
+ // if our analyzed loop is a canonical fit, start processing it
+ if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
+ // now add the vector nodes to packsets
+ for (int i = 0; i < _post_block.length(); i++) {
+ Node* n = _post_block.at(i);
+ Node_List* singleton = new Node_List();
+ singleton->push(n);
+ _packset.append(singleton);
+ set_my_pack(n, singleton);
+ }
+
+ // map base types for vector usage
+ compute_vector_element_type();
+ } else {
+ return;
+ }
+ } else {
+ // for some reason we could not map the slp analysis state of the vectorized loop
+ return;
+ }
+ }
output();
}
//------------------------------find_adjacent_refs---------------------------
*** 808,817 ****
--- 916,926 ----
//---------------------------dependence_graph---------------------------
// Construct dependency graph.
// Add dependence edges to load/store nodes for memory dependence
// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
+ CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
// First, assign a dependence node to each memory node
for (int i = 0; i < _block.length(); i++ ) {
Node *n = _block.at(i);
if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
_dg.make_node(n);
*** 822,832 ****
--- 931,943 ----
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* n = _mem_slice_head.at(i);
Node* n_tail = _mem_slice_tail.at(i);
// Get slice in predecessor order (last is first)
+ if (cl->is_main_loop()) {
mem_slice_preds(n_tail, n, _nlist);
+ }
#ifndef PRODUCT
if(TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
for (int j = _nlist.length() - 1; j >= 0 ; j--) {
*** 2026,2049 ****
tty->print("SuperWord::output ");
lpt()->dump_head();
}
#endif
// MUST ENSURE main loop's initial value is properly aligned:
// (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
align_initial_loop_index(align_to_ref());
// Insert extract (unpack) operations for scalar uses
for (int i = 0; i < _packset.length(); i++) {
insert_extracts(_packset.at(i));
}
Compile* C = _phase->C;
- CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
uint max_vlen_in_bytes = 0;
uint max_vlen = 0;
NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
--- 2137,2163 ----
tty->print("SuperWord::output ");
lpt()->dump_head();
}
#endif
+ CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+ if (cl->is_main_loop()) {
// MUST ENSURE main loop's initial value is properly aligned:
// (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
align_initial_loop_index(align_to_ref());
// Insert extract (unpack) operations for scalar uses
for (int i = 0; i < _packset.length(); i++) {
insert_extracts(_packset.at(i));
}
+ }
Compile* C = _phase->C;
uint max_vlen_in_bytes = 0;
uint max_vlen = 0;
+ bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
*** 2061,2070 ****
--- 2175,2188 ----
uint vlen = p->size();
uint vlen_in_bytes = 0;
Node* vn = NULL;
Node* low_adr = p->at(0);
Node* first = executed_first(p);
+ if (can_process_post_loop) {
+ // override vlen with the main loop's vector length
+ vlen = cl->slp_max_unroll();
+ }
NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
int opc = n->Opcode();
if (n->is_Load()) {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
*** 2150,2159 ****
--- 2268,2281 ----
// Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (is_cmov_pack(p)) {
+ if (can_process_post_loop) {
+ // do not refactor control flow in post loop context
+ return;
+ }
if (!n->is_CMove()) {
continue;
}
// place here CMoveVDNode
NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
*** 2214,2231 ****
--- 2336,2362 ----
return; //and reverse to backup IG
}
ShouldNotReachHere();
}
+ _block.at_put(i, vn);
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
for (uint j = 0; j < p->size(); j++) {
Node* pm = p->at(j);
_igvn.replace_node(pm, vn);
}
_igvn._worklist.push(vn);
+ if (can_process_post_loop) {
+ // first check if the vector size is the maximum vector which we can use on the machine,
+ // other vector sizes have reduced values for predicated data mapping.
+ if (vlen_in_bytes != (uint)MaxVectorSize) {
+ return;
+ }
+ }
+
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen = vlen;
max_vlen_in_bytes = vlen_in_bytes;
}
#ifdef ASSERT
*** 2244,2262 ****
uint slp_max_unroll_factor = cl->slp_max_unroll();
if (slp_max_unroll_factor == max_vlen) {
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
}
! // For atomic unrolled loops which are vector mapped, instigate more unrolling.
cl->set_notpassed_slp();
! // if vector resources are limited, do not allow additional unrolling
if (FLOATPRESSURE > 8) {
C->set_major_progress();
- }
cl->mark_do_unroll_only();
if (do_reserve_copy()) {
cl->mark_loop_vectorized();
}
}
}
}
--- 2375,2415 ----
uint slp_max_unroll_factor = cl->slp_max_unroll();
if (slp_max_unroll_factor == max_vlen) {
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
}
!
! // For atomic unrolled loops which are vector mapped, instigate more unrolling
cl->set_notpassed_slp();
! if (cl->is_main_loop()) {
! // if vector resources are limited, do not allow additional unrolling, also
! // do not unroll more on pure vector loops which were not reduced so that we can
! // program the post loop to single iteration execution.
if (FLOATPRESSURE > 8) {
C->set_major_progress();
cl->mark_do_unroll_only();
+ }
+ }
+
if (do_reserve_copy()) {
cl->mark_loop_vectorized();
+ if (can_process_post_loop) {
+ // Now create the difference of trip and limit and use it as our mask index.
+ // Note: We limited the unroll of the vectorized loop so that
+ // only vlen-1 size iterations can remain to be mask programmed.
+ Node *incr = cl->incr();
+ SubINode *index = new SubINode(cl->limit(), cl->init_trip());
+ _igvn.register_new_node_with_optimizer(index);
+ MaskCreateINode *mask = new MaskCreateINode(_phase->get_ctrl(cl->init_trip()), index);
+ _igvn.register_new_node_with_optimizer(mask);
+ // make this a single iteration loop
+ AddINode *new_incr = new AddINode(incr->in(1), mask);
+ _igvn.register_new_node_with_optimizer(new_incr);
+ _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
+ _igvn.replace_node(incr, new_incr);
+ cl->mark_is_multiversioned();
+ }
}
}
}
}
*** 2271,2280 ****
--- 2424,2439 ----
// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
Node* p0 = p->at(0);
uint vlen = p->size();
Node* opd = p0->in(opd_idx);
+ CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+
+ if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
+ // override vlen with the main loop's vector length
+ vlen = cl->slp_max_unroll();
+ }
if (same_inputs(p, opd_idx)) {
if (opd->is_Vector() || opd->is_LoadVector()) {
assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
if (opd_idx == 2 && VectorNode::is_shift(p0)) {
*** 3087,3103 ****
CountedLoopNode* loop_node = pre_end->loopnode();
if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
return pre_end;
}
-
//------------------------------init---------------------------
void SuperWord::init() {
_dg.init();
_packset.clear();
_disjoint_ptrs.clear();
_block.clear();
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
_iteration_first.clear();
_iteration_last.clear();
--- 3246,3262 ----
CountedLoopNode* loop_node = pre_end->loopnode();
if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
return pre_end;
}
//------------------------------init---------------------------
void SuperWord::init() {
_dg.init();
_packset.clear();
_disjoint_ptrs.clear();
_block.clear();
+ _post_block.clear();
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
_iteration_first.clear();
_iteration_last.clear();
*** 3117,3126 ****
--- 3276,3286 ----
// Resets SuperWord's working state: re-initializes the dependence graph (_dg)
// and empties the node/pack containers listed below.
// NOTE(review): unlike the visible part of init(), this does not clear
// _iteration_first/_iteration_last -- presumably intentional; confirm.
void SuperWord::restart() {
_dg.init();
_packset.clear(); // packs for the current block
_disjoint_ptrs.clear();
_block.clear(); // nodes in current block
+ _post_block.clear(); // post loop vectorizable nodes collected by unrolling_analysis()
_data_entry.clear(); // nodes with all inputs from outside
_mem_slice_head.clear(); // memory slice heads
_mem_slice_tail.clear(); // memory slice tails
_node_info.clear(); // info needed per node
}
< prev index next >