--- old/src/share/vm/opto/loopTransform.cpp 2016-03-14 13:22:00.815086200 -0700 +++ new/src/share/vm/opto/loopTransform.cpp 2016-03-14 13:22:00.577086200 -0700 @@ -983,7 +983,7 @@ // Insert pre and post loops. If peel_only is set, the pre-loop can not have // more iterations added. It acts as a 'peel' only, no lower-bound RCE, no // alignment. Useful to unroll loops that do no array accesses. -void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only ) { +void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_new, bool peel_only) { #ifndef PRODUCT if (TraceLoopOpts) { @@ -998,10 +998,10 @@ // Find common pieces of the loop being guarded with pre & post loops CountedLoopNode *main_head = loop->_head->as_CountedLoop(); - assert( main_head->is_normal_loop(), "" ); + assert(main_head->is_normal_loop(), ""); CountedLoopEndNode *main_end = main_head->loopexit(); guarantee(main_end != NULL, "no loop exit node"); - assert( main_end->outcnt() == 2, "1 true, 1 false path only" ); + assert(main_end->outcnt() == 2, "1 true, 1 false path only"); uint dd_main_head = dom_depth(main_head); uint max = main_head->outcnt(); @@ -1015,93 +1015,24 @@ // Need only 1 user of 'bol' because I will be hacking the loop bounds. Node *bol = main_end->in(CountedLoopEndNode::TestValue); - if( bol->outcnt() != 1 ) { + if (bol->outcnt() != 1) { bol = bol->clone(); register_new_node(bol,main_end->in(CountedLoopEndNode::TestControl)); _igvn.replace_input_of(main_end, CountedLoopEndNode::TestValue, bol); } // Need only 1 user of 'cmp' because I will be hacking the loop bounds. - if( cmp->outcnt() != 1 ) { + if (cmp->outcnt() != 1) { cmp = cmp->clone(); register_new_node(cmp,main_end->in(CountedLoopEndNode::TestControl)); _igvn.replace_input_of(bol, 1, cmp); } - //------------------------------ - // Step A: Create Post-Loop. - Node* main_exit = main_end->proj_out(false); - assert( main_exit->Opcode() == Op_IfFalse, "" ); - int dd_main_exit = dom_depth(main_exit); - - // Step A1: Clone the loop body. The clone becomes the post-loop. The main - // loop pre-header illegally has 2 control users (old & new loops). - clone_loop( loop, old_new, dd_main_exit ); - assert( old_new[main_end ->_idx]->Opcode() == Op_CountedLoopEnd, "" ); - CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop(); - post_head->set_post_loop(main_head); - - // Reduce the post-loop trip count. - CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd(); - post_end->_prob = PROB_FAIR; - - // Build the main-loop normal exit. - IfFalseNode *new_main_exit = new IfFalseNode(main_end); - _igvn.register_new_node_with_optimizer( new_main_exit ); - set_idom(new_main_exit, main_end, dd_main_exit ); - set_loop(new_main_exit, loop->_parent); - - // Step A2: Build a zero-trip guard for the post-loop. After leaving the - // main-loop, the post-loop may not execute at all. We 'opaque' the incr - // (the main-loop trip-counter exit value) because we will be changing - // the exit value (via unrolling) so we cannot constant-fold away the zero - // trip guard until all unrolling is done. - Node *zer_opaq = new Opaque1Node(C, incr); - Node *zer_cmp = new CmpINode( zer_opaq, limit ); - Node *zer_bol = new BoolNode( zer_cmp, b_test ); - register_new_node( zer_opaq, new_main_exit ); - register_new_node( zer_cmp , new_main_exit ); - register_new_node( zer_bol , new_main_exit ); - - // Build the IfNode - IfNode *zer_iff = new IfNode( new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN ); - _igvn.register_new_node_with_optimizer( zer_iff ); - set_idom(zer_iff, new_main_exit, dd_main_exit); - set_loop(zer_iff, loop->_parent); - - // Plug in the false-path, taken if we need to skip post-loop - _igvn.replace_input_of(main_exit, 0, zer_iff); - set_idom(main_exit, zer_iff, dd_main_exit); - set_idom(main_exit->unique_out(), zer_iff, dd_main_exit); - // Make the true-path, must enter the post loop - Node *zer_taken = new IfTrueNode( zer_iff ); - _igvn.register_new_node_with_optimizer( zer_taken ); - set_idom(zer_taken, zer_iff, dd_main_exit); - set_loop(zer_taken, loop->_parent); - // Plug in the true path - _igvn.hash_delete( post_head ); - post_head->set_req(LoopNode::EntryControl, zer_taken); - set_idom(post_head, zer_taken, dd_main_exit); - - Arena *a = Thread::current()->resource_area(); - VectorSet visited(a); - Node_Stack clones(a, main_head->back_control()->outcnt()); - // Step A3: Make the fall-in values to the post-loop come from the - // fall-out values of the main-loop. - for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) { - Node* main_phi = main_head->fast_out(i); - if( main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0 ) { - Node *post_phi = old_new[main_phi->_idx]; - Node *fallmain = clone_up_backedge_goo(main_head->back_control(), - post_head->init_control(), - main_phi->in(LoopNode::LoopBackControl), - visited, clones); - _igvn.hash_delete(post_phi); - post_phi->set_req( LoopNode::EntryControl, fallmain ); - } - } + // Add the post loop + PostLoopInfo post_loop_info; + insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_loop_info); // Update local caches for next stanza - main_exit = new_main_exit; + Node *main_exit = post_loop_info.new_main_exit; //------------------------------ @@ -1109,7 +1040,7 @@ // Step B1: Clone the loop body. The clone becomes the pre-loop. The main // loop pre-header illegally has 2 control users (old & new loops). - clone_loop( loop, old_new, dd_main_head ); + clone_loop(loop, old_new, dd_main_head); CountedLoopNode* pre_head = old_new[main_head->_idx]->as_CountedLoop(); CountedLoopEndNode* pre_end = old_new[main_end ->_idx]->as_CountedLoopEnd(); pre_head->set_pre_loop(main_head); @@ -1120,9 +1051,9 @@ // Find the pre-loop normal exit. Node* pre_exit = pre_end->proj_out(false); - assert( pre_exit->Opcode() == Op_IfFalse, "" ); + assert(pre_exit->Opcode() == Op_IfFalse, ""); IfFalseNode *new_pre_exit = new IfFalseNode(pre_end); - _igvn.register_new_node_with_optimizer( new_pre_exit ); + _igvn.register_new_node_with_optimizer(new_pre_exit); set_idom(new_pre_exit, pre_end, dd_main_head); set_loop(new_pre_exit, loop->_parent); @@ -1131,47 +1062,48 @@ // zero-trip guard will become the minimum-trip guard when we unroll // the main-loop. Node *min_opaq = new Opaque1Node(C, limit); - Node *min_cmp = new CmpINode( pre_incr, min_opaq ); - Node *min_bol = new BoolNode( min_cmp, b_test ); - register_new_node( min_opaq, new_pre_exit ); - register_new_node( min_cmp , new_pre_exit ); - register_new_node( min_bol , new_pre_exit ); + Node *min_cmp = new CmpINode(pre_incr, min_opaq); + Node *min_bol = new BoolNode(min_cmp, b_test); + register_new_node(min_opaq, new_pre_exit); + register_new_node(min_cmp , new_pre_exit); + register_new_node(min_bol , new_pre_exit); // Build the IfNode (assume the main-loop is executed always). - IfNode *min_iff = new IfNode( new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN ); - _igvn.register_new_node_with_optimizer( min_iff ); + IfNode *min_iff = new IfNode(new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN); + _igvn.register_new_node_with_optimizer(min_iff); set_idom(min_iff, new_pre_exit, dd_main_head); set_loop(min_iff, loop->_parent); // Plug in the false-path, taken if we need to skip main-loop - _igvn.hash_delete( pre_exit ); + _igvn.hash_delete(pre_exit); pre_exit->set_req(0, min_iff); set_idom(pre_exit, min_iff, dd_main_head); set_idom(pre_exit->unique_out(), min_iff, dd_main_head); // Make the true-path, must enter the main loop - Node *min_taken = new IfTrueNode( min_iff ); - _igvn.register_new_node_with_optimizer( min_taken ); + Node *min_taken = new IfTrueNode(min_iff); + _igvn.register_new_node_with_optimizer(min_taken); set_idom(min_taken, min_iff, dd_main_head); set_loop(min_taken, loop->_parent); // Plug in the true path - _igvn.hash_delete( main_head ); + _igvn.hash_delete(main_head); main_head->set_req(LoopNode::EntryControl, min_taken); set_idom(main_head, min_taken, dd_main_head); - visited.Clear(); - clones.clear(); + Arena *a = Thread::current()->resource_area(); + VectorSet visited(a); + Node_Stack clones(a, main_head->back_control()->outcnt()); // Step B3: Make the fall-in values to the main-loop come from the // fall-out values of the pre-loop. for (DUIterator_Fast i2max, i2 = main_head->fast_outs(i2max); i2 < i2max; i2++) { Node* main_phi = main_head->fast_out(i2); - if( main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0 ) { + if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0) { Node *pre_phi = old_new[main_phi->_idx]; Node *fallpre = clone_up_backedge_goo(pre_head->back_control(), main_head->init_control(), pre_phi->in(LoopNode::LoopBackControl), visited, clones); _igvn.hash_delete(main_phi); - main_phi->set_req( LoopNode::EntryControl, fallpre ); + main_phi->set_req(LoopNode::EntryControl, fallpre); } } @@ -1185,29 +1117,25 @@ // variable value and the induction variable Phi to preserve correct // dependencies. - // CastII for the post loop: - bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head); - assert(inserted, "no castII inserted"); - // CastII for the main loop: - inserted = cast_incr_before_loop(pre_incr, min_taken, main_head); + bool inserted = cast_incr_before_loop(pre_incr, min_taken, main_head); assert(inserted, "no castII inserted"); // Step B4: Shorten the pre-loop to run only 1 iteration (for now). // RCE and alignment may change this later. Node *cmp_end = pre_end->cmp_node(); - assert( cmp_end->in(2) == limit, "" ); - Node *pre_limit = new AddINode( init, stride ); + assert(cmp_end->in(2) == limit, ""); + Node *pre_limit = new AddINode(init, stride); // Save the original loop limit in this Opaque1 node for // use by range check elimination. Node *pre_opaq = new Opaque1Node(C, pre_limit, limit); - register_new_node( pre_limit, pre_head->in(0) ); - register_new_node( pre_opaq , pre_head->in(0) ); + register_new_node(pre_limit, pre_head->in(0)); + register_new_node(pre_opaq , pre_head->in(0)); // Since no other users of pre-loop compare, I can hack limit directly - assert( cmp_end->outcnt() == 1, "no other users" ); + assert(cmp_end->outcnt() == 1, "no other users"); _igvn.hash_delete(cmp_end); cmp_end->set_req(2, peel_only ? pre_limit : pre_opaq); @@ -1227,24 +1155,24 @@ // Modify pre loop end condition Node* pre_bol = pre_end->in(CountedLoopEndNode::TestValue)->as_Bool(); BoolNode* new_bol0 = new BoolNode(pre_bol->in(1), new_test); - register_new_node( new_bol0, pre_head->in(0) ); + register_new_node(new_bol0, pre_head->in(0)); _igvn.replace_input_of(pre_end, CountedLoopEndNode::TestValue, new_bol0); // Modify main loop guard condition assert(min_iff->in(CountedLoopEndNode::TestValue) == min_bol, "guard okay"); BoolNode* new_bol1 = new BoolNode(min_bol->in(1), new_test); - register_new_node( new_bol1, new_pre_exit ); + register_new_node(new_bol1, new_pre_exit); _igvn.hash_delete(min_iff); min_iff->set_req(CountedLoopEndNode::TestValue, new_bol1); // Modify main loop end condition BoolNode* main_bol = main_end->in(CountedLoopEndNode::TestValue)->as_Bool(); BoolNode* new_bol2 = new BoolNode(main_bol->in(1), new_test); - register_new_node( new_bol2, main_end->in(CountedLoopEndNode::TestControl) ); + register_new_node(new_bol2, main_end->in(CountedLoopEndNode::TestControl)); _igvn.replace_input_of(main_end, CountedLoopEndNode::TestValue, new_bol2); } // Flag main loop main_head->set_main_loop(); - if( peel_only ) main_head->set_main_no_pre_loop(); + if (peel_only) main_head->set_main_no_pre_loop(); // Subtract a trip count for the pre-loop. main_head->set_trip_count(main_head->trip_count() - 1); @@ -1252,7 +1180,7 @@ // It's difficult to be precise about the trip-counts // for the pre/post loops. They are usually very short, // so guess that 4 trips is a reasonable value. - post_head->set_profile_trip_cnt(4.0); + post_loop_info.post_head->set_profile_trip_cnt(4.0); pre_head->set_profile_trip_cnt(4.0); // Now force out all loop-invariant dominating tests. The optimizer @@ -1298,19 +1226,81 @@ guarantee(main_end != NULL, "no loop exit node"); // diagnostic to show loop end is not properly formed assert(main_end->outcnt() == 2, "1 true, 1 false path only"); - uint dd_main_head = dom_depth(main_head); - uint max = main_head->outcnt(); // mark this loop as processed main_head->mark_has_atomic_post_loop(); - Node *pre_header = main_head->in(LoopNode::EntryControl); - Node *init = main_head->init_trip(); Node *incr = main_end->incr(); Node *limit = main_end->limit(); - Node *stride = main_end->stride(); - Node *cmp = main_end->cmp_node(); - BoolTest::mask b_test = main_end->test_trip(); + + // In this case we throw away the result as we are not using it to connect anything else. + PostLoopInfo post_loop_info; + insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_loop_info); + + // It's difficult to be precise about the trip-counts + // for post loops. They are usually very short, + // so guess that unit vector trips is a reasonable value. + post_loop_info.post_head->set_profile_trip_cnt(cur_unroll); + + // Now force out all loop-invariant dominating tests. The optimizer + // finds some, but we _know_ they are all useless. + peeled_dom_test_elim(loop, old_new); + loop->record_for_igvn(); +} + + +//-------------------------insert_scalar_rced_post_loop------------------------ +// Insert a copy of the rce'd main loop as a post loop, +// We have not unrolled the main loop, so this is the right time to inject this. +// Later we will examine the partner of this post loop pair which still has range checks +// to see inject code which tests at runtime if the range checks are applicable. +void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) { + if (!loop->_head->is_CountedLoop()) return; + + CountedLoopNode *cl = loop->_head->as_CountedLoop(); + + // only process RCE'd main loops + if (cl->loop_has_range_checks() || !cl->is_main_loop()) return; + +#ifndef PRODUCT + if (TraceLoopOpts) { + tty->print("PostScalarRce "); + loop->dump_head(); + } +#endif + C->set_major_progress(); + + // Find common pieces of the loop being guarded with pre & post loops + CountedLoopNode *main_head = loop->_head->as_CountedLoop(); + CountedLoopEndNode *main_end = main_head->loopexit(); + guarantee(main_end != NULL, "no loop exit node"); + // diagnostic to show loop end is not properly formed + assert(main_end->outcnt() == 2, "1 true, 1 false path only"); + + Node *incr = main_end->incr(); + Node *limit = main_end->limit(); + + // In this case we throw away the result as we are not using it to connect anything else. + PostLoopInfo post_loop_info; + insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_loop_info); + + // It's difficult to be precise about the trip-counts + // for post loops. They are usually very short, + // so guess that unit vector trips is a reasonable value. + post_loop_info.post_head->set_profile_trip_cnt(4.0); + + // Now force out all loop-invariant dominating tests. The optimizer + // finds some, but we _know_ they are all useless. + peeled_dom_test_elim(loop, old_new); + loop->record_for_igvn(); +} + + +//------------------------------insert_post_loop------------------------------- +// Insert post loops. Add a post loop to the given loop passed. +void PhaseIdealLoop::insert_post_loop(IdealLoopTree *loop, Node_List &old_new, + CountedLoopNode *main_head, CountedLoopEndNode *main_end, + Node *incr, Node *limit, PostLoopInfo &post_loop_info) { //------------------------------ // Step A: Create a new post-Loop. @@ -1325,6 +1315,7 @@ CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop(); post_head->set_normal_loop(); post_head->set_post_loop(main_head); + post_loop_info.post_head = post_head; // Reduce the post-loop trip count. CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd(); @@ -1336,14 +1327,14 @@ set_idom(new_main_exit, main_end, dd_main_exit); set_loop(new_main_exit, loop->_parent); - // Step A2: Build a zero-trip guard for the vector post-loop. After leaving the - // main-loop, the vector post-loop may not execute at all. We 'opaque' the incr - // (the vectorized main-loop trip-counter exit value) because we will be changing + // Step A2: Build a zero-trip guard for the post-loop. After leaving the + // main-loop, the post-loop may not execute at all. We 'opaque' the incr + // (the previous loop trip-counter exit value) because we will be changing // the exit value (via additional unrolling) so we cannot constant-fold away the zero // trip guard until all unrolling is done. Node *zer_opaq = new Opaque1Node(C, incr); Node *zer_cmp = new CmpINode(zer_opaq, limit); - Node *zer_bol = new BoolNode(zer_cmp, b_test); + Node *zer_bol = new BoolNode(zer_cmp, main_end->test_trip()); register_new_node(zer_opaq, new_main_exit); register_new_node(zer_cmp, new_main_exit); register_new_node(zer_bol, new_main_exit); @@ -1354,11 +1345,11 @@ set_idom(zer_iff, new_main_exit, dd_main_exit); set_loop(zer_iff, loop->_parent); - // Plug in the false-path, taken if we need to skip vector post-loop + // Plug in the false-path, taken if we need to skip this post-loop _igvn.replace_input_of(main_exit, 0, zer_iff); set_idom(main_exit, zer_iff, dd_main_exit); set_idom(main_exit->unique_out(), zer_iff, dd_main_exit); - // Make the true-path, must enter the vector post loop + // Make the true-path, must enter this post loop Node *zer_taken = new IfTrueNode(zer_iff); _igvn.register_new_node_with_optimizer(zer_taken); set_idom(zer_taken, zer_iff, dd_main_exit); @@ -1371,16 +1362,16 @@ Arena *a = Thread::current()->resource_area(); VectorSet visited(a); Node_Stack clones(a, main_head->back_control()->outcnt()); - // Step A3: Make the fall-in values to the vector post-loop come from the + // Step A3: Make the fall-in values to the post-loop come from the // fall-out values of the main-loop. for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) { Node* main_phi = main_head->fast_out(i); if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0) { Node *cur_phi = old_new[main_phi->_idx]; Node *fallnew = clone_up_backedge_goo(main_head->back_control(), - post_head->init_control(), - main_phi->in(LoopNode::LoopBackControl), - visited, clones); + post_head->init_control(), + main_phi->in(LoopNode::LoopBackControl), + visited, clones); _igvn.hash_delete(cur_phi); cur_phi->set_req(LoopNode::EntryControl, fallnew); } @@ -1390,17 +1381,10 @@ bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head); assert(inserted, "no castII inserted"); - // It's difficult to be precise about the trip-counts - // for post loops. They are usually very short, - // so guess that unit vector trips is a reasonable value. - post_head->set_profile_trip_cnt((float)slp_max_unroll_factor); - - // Now force out all loop-invariant dominating tests. The optimizer - // finds some, but we _know_ they are all useless. - peeled_dom_test_elim(loop, old_new); - loop->record_for_igvn(); + post_loop_info.new_main_exit = new_main_exit; } + //------------------------------is_invariant----------------------------- // Return true if n is invariant bool IdealLoopTree::is_invariant(Node* n) const { @@ -2097,7 +2081,7 @@ //------------------------------do_range_check--------------------------------- // Eliminate range-checks and other trip-counter vs loop-invariant tests. -void PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) { +void PhaseIdealLoop::do_range_check(IdealLoopTree *loop, Node_List &range_check_list) { #ifndef PRODUCT if (PrintOpto && VerifyLoopOptimizations) { tty->print("Range Check Elimination "); @@ -2141,9 +2125,9 @@ // not ever trip low tests. Node *p_f = iffm->in(0); // pre loop may have been optimized out - if (p_f->Opcode() != Op_IfFalse) { + if (p_f->Opcode() != Op_IfFalse) return; - } + CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd(); assert(pre_end->loopnode()->is_pre_loop(), ""); Node *pre_opaq1 = pre_end->limit(); @@ -2185,23 +2169,24 @@ // Check loop body for tests of trip-counter plus loop-invariant vs // loop-invariant. - for( uint i = 0; i < loop->_body.size(); i++ ) { + for (uint i = 0; i < loop->_body.size(); i++) { Node *iff = loop->_body[i]; if (iff->Opcode() == Op_If || iff->Opcode() == Op_RangeCheck) { // Test? // Test is an IfNode, has 2 projections. If BOTH are in the loop // we need loop unswitching instead of iteration splitting. + range_check_list.push(iff); Node *exit = loop->is_loop_exit(iff); - if( !exit ) continue; + if (!exit) continue; int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0; // Get boolean condition to test Node *i1 = iff->in(1); - if( !i1->is_Bool() ) continue; + if (!i1->is_Bool()) continue; BoolNode *bol = i1->as_Bool(); BoolTest b_test = bol->_test; // Flip sense of test if exit condition is flipped - if( flip ) + if (flip) b_test = b_test.negate(); // Get compare @@ -2213,13 +2198,13 @@ jint scale_con= 1; // Assume trip counter not scaled Node *limit_c = get_ctrl(limit); - if( loop->is_member(get_loop(limit_c) ) ) { + if (loop->is_member(get_loop(limit_c))) { // Compare might have operands swapped; commute them b_test = b_test.commute(); rc_exp = cmp->in(2); limit = cmp->in(1); limit_c = get_ctrl(limit); - if( loop->is_member(get_loop(limit_c) ) ) + if (loop->is_member(get_loop(limit_c))) continue; // Both inputs are loop varying; cannot RCE } // Here we know 'limit' is loop invariant @@ -2227,27 +2212,25 @@ // 'limit' maybe pinned below the zero trip test (probably from a // previous round of rce), in which case, it can't be used in the // zero trip test expression which must occur before the zero test's if. - if( limit_c == ctrl ) { + if (limit_c == ctrl) continue; // Don't rce this check but continue looking for other candidates. - } // Check for scaled induction variable plus an offset Node *offset = NULL; - if (!is_scaled_iv_plus_offset(rc_exp, trip_counter, &scale_con, &offset)) { + if (!is_scaled_iv_plus_offset(rc_exp, trip_counter, &scale_con, &offset)) continue; - } Node *offset_c = get_ctrl(offset); - if( loop->is_member( get_loop(offset_c) ) ) + if (loop->is_member( get_loop(offset_c))) continue; // Offset is not really loop invariant // Here we know 'offset' is loop invariant. // As above for the 'limit', the 'offset' maybe pinned below the // zero trip test. - if( offset_c == ctrl ) { + if (offset_c == ctrl) continue; // Don't rce this check but continue looking for other candidates. - } + #ifdef ASSERT if (TraceRangeLimitCheck) { tty->print_cr("RC bool node%s", flip ? " flipped:" : ":"); @@ -2262,8 +2245,8 @@ // sense of the test. // Adjust pre and main loop limits to guard the correct iteration set - if( cmp->Opcode() == Op_CmpU ) {// Unsigned compare is really 2 tests - if( b_test._test == BoolTest::lt ) { // Range checks always use lt + if (cmp->Opcode() == Op_CmpU) {// Unsigned compare is really 2 tests + if (b_test._test == BoolTest::lt) { // Range checks always use lt // The underflow and overflow limits: 0 <= scale*I+offset < limit add_constraint( stride_con, scale_con, offset, zero, limit, pre_ctrl, &pre_limit, &main_limit ); if (!conditional_rc) { @@ -2283,23 +2266,23 @@ case BoolTest::ge: // Convert (I*scale+offset) >= Limit to (I*(-scale)+(-offset)) <= -Limit scale_con = -scale_con; - offset = new SubINode( zero, offset ); - register_new_node( offset, pre_ctrl ); - limit = new SubINode( zero, limit ); - register_new_node( limit, pre_ctrl ); + offset = new SubINode(zero, offset); + register_new_node(offset, pre_ctrl); + limit = new SubINode(zero, limit); + register_new_node(limit, pre_ctrl); // Fall into LE case case BoolTest::le: if (b_test._test != BoolTest::gt) { // Convert X <= Y to X < Y+1 - limit = new AddINode( limit, one ); - register_new_node( limit, pre_ctrl ); + limit = new AddINode(limit, one); + register_new_node(limit, pre_ctrl); } // Fall into LT case case BoolTest::lt: // The underflow and overflow limits: MIN_INT <= scale*I+offset < limit // Note: (MIN_INT+1 == -MAX_INT) is used instead of MIN_INT here // to avoid problem with scale == -1: MIN_INT/(-1) == MIN_INT. - add_constraint( stride_con, scale_con, offset, mini, limit, pre_ctrl, &pre_limit, &main_limit ); + add_constraint(stride_con, scale_con, offset, mini, limit, pre_ctrl, &pre_limit, &main_limit); if (!conditional_rc) { // ((MIN_INT+1)-offset)/scale could be outside of loop iterations range. // Note: negative offset is replaced with 0 but (MIN_INT+1)/scale could @@ -2317,7 +2300,7 @@ // Kill the eliminated test C->set_major_progress(); - Node *kill_con = _igvn.intcon( 1-flip ); + Node *kill_con = _igvn.intcon(1-flip); set_ctrl(kill_con, C->root()); _igvn.replace_input_of(iff, 1, kill_con); // Find surviving projection @@ -2334,6 +2317,7 @@ --imax; } } + range_check_list.pop(); } // End of is IF @@ -2373,15 +2357,15 @@ Node *main_cle = cl->loopexit(); Node *main_bol = main_cle->in(1); // Hacking loop bounds; need private copies of exit test - if( main_bol->outcnt() > 1 ) {// BoolNode shared? + if (main_bol->outcnt() > 1) {// BoolNode shared? main_bol = main_bol->clone();// Clone a private BoolNode register_new_node( main_bol, main_cle->in(0) ); _igvn.replace_input_of(main_cle, 1, main_bol); } Node *main_cmp = main_bol->in(1); - if( main_cmp->outcnt() > 1 ) { // CmpNode shared? + if (main_cmp->outcnt() > 1) { // CmpNode shared? main_cmp = main_cmp->clone();// Clone a private CmpNode - register_new_node( main_cmp, main_cle->in(0) ); + register_new_node(main_cmp, main_cle->in(0)); _igvn.replace_input_of(main_bol, 1, main_cmp); } // Hack the now-private loop bounds @@ -2391,6 +2375,166 @@ _igvn.replace_input_of(opqzm, 1, main_limit); } +//------------------------------has_range_checks------------------------------- +// Check to see if RCE cleaned the current loop of range-checks. +void PhaseIdealLoop::has_range_checks(IdealLoopTree *loop) { + assert(RangeCheckElimination, ""); + CountedLoopNode *cl = loop->_head->as_CountedLoop(); + + // skip this loop if it is already marked + if (cl->loop_has_range_checks()) return; + + if (cl->is_main_loop() || cl->is_post_loop()) { + // Now check for existance of range checks + for (uint i = 0; i < loop->_body.size(); i++) { + Node *iff = loop->_body[i]; + if (iff->Opcode() == Op_If || + iff->Opcode() == Op_RangeCheck) { + cl->mark_has_range_checks(); + break; + } + } + } +} + +//-------------------------multi_version_post_loops---------------------------- +// Check the range checks that remain, if simple, use the bounds to guard +// which version to a post loop we execute, one with range checks or one without +bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) { + bool multi_version_succeeded = false; + assert(RangeCheckElimination, ""); + CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop(); + assert(legacy_cl->is_post_loop(), ""); + + // Check for existance of range checks using the unique instance to make a guard with + Unique_Node_List worklist; + for (uint i = 0; i < legacy_loop->_body.size(); i++) { + Node *iff = legacy_loop->_body[i]; + if (iff->Opcode() == Op_If || iff->Opcode() == Op_RangeCheck) { + worklist.push(iff); + } + } + + // Find RCE'd post loop so that we can stage its guard. + Node* ctrl = legacy_cl->in(LoopNode::EntryControl); + if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded; + Node* iffm = ctrl->in(0); + if (!iffm->is_If()) return multi_version_succeeded; + Node* cur_bool = iffm->in(1); + if (!cur_bool->is_Bool()) return multi_version_succeeded; + Node* cur_cmp = cur_bool->in(1); + if (!cur_cmp->is_Cmp()) return multi_version_succeeded; + Node* cur_opq = cur_cmp->in(1); + // Can not optimize a loop if zero-trip Opaque1 node is optimized away. + if (cur_opq->Opcode() != Op_Opaque1) return multi_version_succeeded; + + // Now we test that both the post loops are connected + Node* post_loop_region = iffm->in(0); + if (post_loop_region == NULL) return multi_version_succeeded; + if (!post_loop_region->is_Region()) return multi_version_succeeded; + Node* covering_region = post_loop_region->in(RegionNode::Control+1); + if (covering_region == NULL) return multi_version_succeeded; + if (!covering_region->is_Region()) return multi_version_succeeded; + Node* p_f = covering_region->in(RegionNode::Control); + if (p_f == NULL) return multi_version_succeeded; + if (!p_f->is_IfFalse()) return multi_version_succeeded; + if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded; + CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd(); + if (rce_loop_end == NULL) return multi_version_succeeded; + CountedLoopNode* rce_cl = rce_loop_end->loopnode(); + if (rce_cl == NULL || !rce_cl->is_post_loop()) return multi_version_succeeded; + CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop(); + if (rce_cl != known_rce_cl) return multi_version_succeeded; + + // Then we fetch the cover entry test + ctrl = rce_cl->in(LoopNode::EntryControl); + if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded; + +#ifndef PRODUCT + if (TraceLoopOpts) { + tty->print("PostMultiVersion\n"); + rce_loop->dump_head(); + legacy_loop->dump_head(); + } +#endif + + // Now fetch the limit we want to compare against + Node *limit = rce_cl->limit(); + bool first_time = true; + + // If we got this far, we identified the post loop which has been RCE'd and + // we have a work list. Now we will try to transform the if guard to cause + // the loop pair to be multi version executed with the determination left to runtime + // or the optimizer if full information is known about the given arrays at compile time. + Node *last_min = NULL; + while (worklist.size()) { + Node* rc_iffm = worklist.pop(); + if (rc_iffm->is_If()) { + Node *rc_bolzm = rc_iffm->in(1); + if (rc_bolzm->is_Bool()) { + Node *rc_cmpzm = rc_bolzm->in(1); + if (rc_cmpzm->is_Cmp()) { + Node *rc_left = rc_cmpzm->in(2); + if (first_time) { + last_min = rc_left; + first_time = false; + } else { + Node *cur_min = new MinINode(last_min, rc_left); + last_min = cur_min; + _igvn.register_new_node_with_optimizer(last_min); + } + } + } + } + } + + // All we have to do is update the limit of the rce loop + // with the min of our expression and the current limit. + // We will use this expression to replace the current limit. + if (last_min) { + Node *cur_min = new MinINode(last_min, limit); + _igvn.register_new_node_with_optimizer(cur_min); + Node *cmp_node = rce_loop_end->cmp_node(); + _igvn.replace_input_of(cmp_node, 2, cur_min); + set_idom(cmp_node, cur_min, dom_depth(ctrl)); + set_ctrl(cur_min, ctrl); + set_loop(cur_min, rce_loop->_parent); + + legacy_cl->mark_is_multiversioned(); + rce_cl->mark_is_multiversioned(); + multi_version_succeeded = true; + + C->set_major_progress(); + } + + return multi_version_succeeded; +} + +//-------------------------poison_rce_post_loop-------------------------------- +// Causes the rce'd post loop to be optimized away if multiverioning fails +void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) { + CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop(); + Node* ctrl = rce_cl->in(LoopNode::EntryControl); + if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) { + Node* iffm = ctrl->in(0); + if (iffm->is_If()) { + Node* cur_bool = iffm->in(1); + if (cur_bool->is_Bool()) { + Node* cur_cmp = cur_bool->in(1); + if (cur_cmp->is_Cmp()) { + BoolTest::mask new_test = BoolTest::gt; + BoolNode *new_bool = new BoolNode(cur_cmp, new_test); + _igvn.replace_node(cur_bool, new_bool); + _igvn._worklist.push(new_bool); + Node* left_op = cur_cmp->in(1); + _igvn.replace_input_of(cur_cmp, 2, left_op); + C->set_major_progress(); + } + } + } + } +} + //------------------------------DCE_loop_body---------------------------------- // Remove simplistic dead code from loop body void IdealLoopTree::DCE_loop_body() { @@ -2748,8 +2892,19 @@ // Adjust the pre- and main-loop limits to let the pre and post loops run // with full checks, but the main-loop with no checks. Remove said // checks from the main body. - if (should_rce) - phase->do_range_check(this,old_new); + if (should_rce) { + Node_List range_check_list; + phase->do_range_check(this, range_check_list); + if (range_check_list.size() > 0) { + cl->mark_has_range_checks(); + } + } + + if (should_rce && should_unroll && !should_peel && PostLoopMultiversioning) { + // Try to setup multiversioning on main loops before they are unrolled + if (cl->is_main_loop() && (cl->unrolled_count() == 1)) + phase->insert_scalar_rced_post_loop(this, old_new); + } // Double loop body for unrolling. Adjust the minimum-trip test (will do // twice as many iterations as before) and the main body limit (only do