49 #endif
50 #ifdef TARGET_ARCH_MODEL_arm
51 # include "adfiles/ad_arm.hpp"
52 #endif
53 #ifdef TARGET_ARCH_MODEL_ppc
54 # include "adfiles/ad_ppc.hpp"
55 #endif
56
57 // Portions of code courtesy of Clifford Click
58
59 // Optimization - Graph Style
60
61 // To avoid float value underflow
62 #define MIN_BLOCK_FREQUENCY 1.e-35f
63
64 //----------------------------schedule_node_into_block-------------------------
65 // Insert node n into block b. Look for projections of n and make sure they
66 // are in b also.
67 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
68 // Set basic block of n, Add n to b,
69 _bbs.map(n->_idx, b);
70 b->add_inst(n);
71
72 // After Matching, nearly any old Node may have projections trailing it.
73 // These are usually machine-dependent flags. In any case, they might
74 // float to another block below this one. Move them up.
75 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
76 Node* use = n->fast_out(i);
77 if (use->is_Proj()) {
78 Block* buse = _bbs[use->_idx];
79 if (buse != b) { // In wrong block?
80 if (buse != NULL)
81 buse->find_remove(use); // Remove from wrong block
82 _bbs.map(use->_idx, b); // Re-insert in this block
83 b->add_inst(use);
84 }
85 }
86 }
87 }
88
89 //----------------------------replace_block_proj_ctrl-------------------------
90 // Nodes that have is_block_proj() nodes as their control need to use
91 // the appropriate Region for their actual block as their control since
92 // the projection will be in a predecessor block.
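// For example, a node whose control input is the true projection of an If
// (the projection sits at the end of the If's block) is rewired to use the
// head of the successor block selected by that projection.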
93 void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
94 const Node *in0 = n->in(0);
95 assert(in0 != NULL, "Only control-dependent");
96 const Node *p = in0->is_block_proj();
97 if (p != NULL && p != n) { // Control from a block projection?
98 assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
99 // Find trailing Region
100 Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block
101 uint j = 0;
102 if (pb->_num_succs != 1) { // More than 1 successor?
103 // Search for successor
104 uint max = pb->_nodes.size();
105 assert( max > 1, "" );
106 uint start = max - pb->_num_succs;
107 // Find which output path belongs to projection
108 for (j = start; j < max; j++) {
109 if( pb->_nodes[j] == in0 )
110 break;
111 }
112 assert( j < max, "must find" );
113 // Change control to match head of successor basic block
114 j -= start;
115 }
116 n->set_req(0, pb->_succs[j]->head());
117 }
118 }
119
120
121 //------------------------------schedule_pinned_nodes--------------------------
122 // Set the basic block for Nodes pinned into blocks
123 void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
124 // Allocate node stack of size C->unique()+8 to avoid frequent realloc
125 GrowableArray <Node *> spstack(C->unique()+8);
126 spstack.push(_root);
127 while ( spstack.is_nonempty() ) {
128 Node *n = spstack.pop();
129 if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited
130 if( n->pinned() && !_bbs.lookup(n->_idx) ) { // Pinned? Nail it down!
131 assert( n->in(0), "pinned Node must have Control" );
132 // Before setting block replace block_proj control edge
133 replace_block_proj_ctrl(n);
134 Node *input = n->in(0);
135 while( !input->is_block_start() )
136 input = input->in(0);
137 Block *b = _bbs[input->_idx]; // Basic block of controlling input
138 schedule_node_into_block(n, b);
139 }
140 for( int i = n->req() - 1; i >= 0; --i ) { // For all inputs
141 if( n->in(i) != NULL )
142 spstack.push(n->in(i));
143 }
144 }
145 }
146 }
147
148 #ifdef ASSERT
149 // Assert that new input b2 is dominated by all previous inputs.
150 // Check this by seeing that it is dominated by b1, the deepest
151 // input observed before b2.
152 static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) {
153 if (b1 == NULL) return;
154 assert(b1->_dom_depth < b2->_dom_depth, "sanity");
155 Block* tmp = b2;
156 while (tmp != b1 && tmp != NULL) {
157 tmp = tmp->_idom;
158 }
159 if (tmp != b1) {
160 // Detected an unschedulable graph. Print some nice stuff and die.
161 tty->print_cr("!!! Unschedulable graph !!!");
162 for (uint j=0; j<n->len(); j++) { // For all inputs
163 Node* inn = n->in(j); // Get input
164 if (inn == NULL) continue; // Ignore NULL, missing inputs
165 Block* inb = bbs[inn->_idx];
166 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
167 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
168 inn->dump();
169 }
170 tty->print("Failing node: ");
171 n->dump();
172 assert(false, "unschedulable graph");
173 }
174 }
175 #endif
176
177 static Block* find_deepest_input(Node* n, Block_Array &bbs) {
178 // Find the last input dominated by all other inputs.
179 Block* deepb = NULL; // Deepest block so far
180 int deepb_dom_depth = 0;
181 for (uint k = 0; k < n->len(); k++) { // For all inputs
182 Node* inn = n->in(k); // Get input
183 if (inn == NULL) continue; // Ignore NULL, missing inputs
184 Block* inb = bbs[inn->_idx];
185 assert(inb != NULL, "must already have scheduled this input");
186 if (deepb_dom_depth < (int) inb->_dom_depth) {
187 // The new inb must be dominated by the previous deepb.
188 // The various inputs must be linearly ordered in the dom
189 // tree, or else there will not be a unique deepest block.
190 DEBUG_ONLY(assert_dom(deepb, inb, n, bbs));
191 deepb = inb; // Save deepest block
192 deepb_dom_depth = deepb->_dom_depth;
193 }
194 }
195 assert(deepb != NULL, "must be at least one input to n");
196 return deepb;
197 }
198
199
200 //------------------------------schedule_early---------------------------------
201 // Find the earliest Block any instruction can be placed in. Some instructions
202 // are pinned into Blocks. Unpinned instructions can appear in the last
203 // block in which all their inputs occur.
204 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
205 // Allocate stack with enough space to avoid frequent realloc
206 Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats
207 // roots.push(_root); _root will be processed among C->top() inputs
208 roots.push(C->top());
209 visited.set(C->top()->_idx);
210
226 const Node *in0 = n->in(0);
227 if (in0 != NULL) { // Control-dependent?
228 replace_block_proj_ctrl(n);
229 } else { // n->in(0) == NULL
230 if (n->req() == 1) { // This guy is a constant with NO inputs?
231 n->set_req(0, _root);
232 }
233 }
234 }
235
236 // First, visit all inputs and force them to get a block. If an
237 // input is already in a block we quit following inputs (to avoid
238 // cycles). Instead we put that Node on a worklist to be handled
239 // later (since ITS inputs may not have a block yet).
240 bool done = true; // Assume all n's inputs will be processed
241 while (i < n->len()) { // For all inputs
242 Node *in = n->in(i); // Get input
243 ++i;
244 if (in == NULL) continue; // Ignore NULL, missing inputs
245 int is_visited = visited.test_set(in->_idx);
246 if (!_bbs.lookup(in->_idx)) { // Missing block selection?
247 if (is_visited) {
248 // assert( !visited.test(in->_idx), "did not schedule early" );
249 return false;
250 }
251 nstack.push(n, i); // Save parent node and next input's index.
252 nstack_top_n = in; // Process current input now.
253 nstack_top_i = 0;
254 done = false; // Not all n's inputs processed.
255 break; // continue while_nstack_nonempty;
256 } else if (!is_visited) { // Input not yet visited?
257 roots.push(in); // Visit this guy later, using worklist
258 }
259 }
260 if (done) {
261 // All of n's inputs have been processed, complete post-processing.
262
263 // Some instructions are pinned into a block. These include Region,
264 // Phi, Start, Return, and other control-dependent instructions and
265 // any projections which depend on them.
266 if (!n->pinned()) {
267 // Set earliest legal block.
268 _bbs.map(n->_idx, find_deepest_input(n, _bbs));
269 } else {
270 assert(_bbs[n->_idx] == _bbs[n->in(0)->_idx], "Pinned Node should be at the same block as its control edge");
271 }
272
273 if (nstack.is_empty()) {
274 // Finished all nodes on stack.
275 // Process next node on the worklist 'roots'.
276 break;
277 }
278 // Get saved parent node and next input's index.
279 nstack_top_n = nstack.node();
280 nstack_top_i = nstack.index();
281 nstack.pop();
282 } // if (done)
283 } // while (true)
284 } // while (roots.size() != 0)
285 return true;
286 }
287
288 //------------------------------dom_lca----------------------------------------
289 // Find least common ancestor in dominator tree
290 // LCA is a current notion of LCA, to be raised above 'this'.
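// (Classic two-pointer walk: first bring both blocks up to the same
// dominator-tree depth, then step both up together until they meet.)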
296 Block* anc = this;
297 while (anc->_dom_depth > LCA->_dom_depth)
298 anc = anc->_idom; // Walk up till anc is as high as LCA
299
300 while (LCA->_dom_depth > anc->_dom_depth)
301 LCA = LCA->_idom; // Walk up till LCA is as high as anc
302
303 while (LCA != anc) { // Walk both up till they are the same
304 LCA = LCA->_idom;
305 anc = anc->_idom;
306 }
307
308 return LCA;
309 }
310
311 //--------------------------raise_LCA_above_use--------------------------------
312 // We are placing a definition, and have been given a def->use edge.
313 // The definition must dominate the use, so move the LCA upward in the
314 // dominator tree to dominate the use. If the use is a phi, adjust
315 // the LCA only with the phi input paths which actually use this def.
316 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) {
317 Block* buse = bbs[use->_idx];
318 if (buse == NULL) return LCA; // Unused killing Projs have no use block
319 if (!use->is_Phi()) return buse->dom_lca(LCA);
320 uint pmax = use->req(); // Number of Phi inputs
321 // Why doesn't this loop just break after finding the matching input to
322 // the Phi? Well...it's like this. I do not have true def-use/use-def
323 // chains. Means I cannot distinguish, from the def-use direction, which
324 // of many use-defs lead from the same use to the same def. That is, this
325 // Phi might have several uses of the same def. Each use appears in a
326 // different predecessor block. But when I enter here, I cannot distinguish
327 // which use-def edge I should find the predecessor block for. So I find
328 // them all. Means I do a little extra work if a Phi uses the same value
329 // more than once.
330 for (uint j=1; j<pmax; j++) { // For all inputs
331 if (use->in(j) == def) { // Found matching input?
332 Block* pred = bbs[buse->pred(j)->_idx];
333 LCA = pred->dom_lca(LCA);
334 }
335 }
336 return LCA;
337 }
338
339 //----------------------------raise_LCA_above_marks----------------------------
340 // Return a new LCA that dominates LCA and any of its marked predecessors.
341 // Search all my parents up to 'early' (exclusive), looking for predecessors
342 // which are marked with the given index. Return the LCA (in the dom tree)
343 // of all marked blocks. If there are none marked, return the original
344 // LCA.
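// The search walks backwards over predecessors with a worklist; each block
// is stamped with 'mark' via set_raise_LCA_visited() so it is examined at
// most once per mark.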
345 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark,
346 Block* early, Block_Array &bbs) {
347 Block_List worklist;
348 worklist.push(LCA);
349 while (worklist.size() > 0) {
350 Block* mid = worklist.pop();
351 if (mid == early) continue; // stop searching here
352
353 // Test and set the visited bit.
354 if (mid->raise_LCA_visited() == mark) continue; // already visited
355
356 // Don't process the current LCA, otherwise the search may terminate early
357 if (mid != LCA && mid->raise_LCA_mark() == mark) {
358 // Raise the LCA.
359 LCA = mid->dom_lca(LCA);
360 if (LCA == early) break; // stop searching everywhere
361 assert(early->dominates(LCA), "early is high enough");
362 // Resume searching at that point, skipping intermediate levels.
363 worklist.push(LCA);
364 if (LCA == mid)
365 continue; // Don't mark as visited to avoid early termination.
366 } else {
367 // Keep searching through this block's predecessors.
368 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
369 Block* mid_parent = bbs[ mid->pred(j)->_idx ];
370 worklist.push(mid_parent);
371 }
372 }
373 mid->set_raise_LCA_visited(mark);
374 }
375 return LCA;
376 }
377
378 //--------------------------memory_early_block--------------------------------
379 // This is a variation of find_deepest_input, the heart of schedule_early.
380 // Find the "early" block for a load, if we considered only memory and
381 // address inputs, that is, if other data inputs were ignored.
382 //
383 // Because a subset of edges are considered, the resulting block will
384 // be earlier (at a shallower dom_depth) than the true schedule_early
385 // point of the node. We compute this earlier block as a more permissive
386 // site for anti-dependency insertion, but only if subsume_loads is enabled.
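// (Only the base, index, memory and control inputs are examined below; any
// other data inputs the load may carry are deliberately ignored.)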
387 static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) {
388 Node* base;
389 Node* index;
390 Node* store = load->in(MemNode::Memory);
391 load->as_Mach()->memory_inputs(base, index);
392
393 assert(base != NodeSentinel && index != NodeSentinel,
394 "unexpected base/index inputs");
395
396 Node* mem_inputs[4];
397 int mem_inputs_length = 0;
398 if (base != NULL) mem_inputs[mem_inputs_length++] = base;
399 if (index != NULL) mem_inputs[mem_inputs_length++] = index;
400 if (store != NULL) mem_inputs[mem_inputs_length++] = store;
401
402 // In the comparison below, add one to account for the control input,
403 // which may be null, but always takes up a spot in the in array.
404 if (mem_inputs_length + 1 < (int) load->req()) {
405 // This "load" has more inputs than just the memory, base and index inputs.
406 // For purposes of checking anti-dependences, we need to start
407 // from the early block of only the address portion of the instruction,
408 // and ignore other blocks that may have factored into the wider
409 // schedule_early calculation.
410 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
411
412 Block* deepb = NULL; // Deepest block so far
413 int deepb_dom_depth = 0;
414 for (int i = 0; i < mem_inputs_length; i++) {
415 Block* inb = bbs[mem_inputs[i]->_idx];
416 if (deepb_dom_depth < (int) inb->_dom_depth) {
417 // The new inb must be dominated by the previous deepb.
418 // The various inputs must be linearly ordered in the dom
419 // tree, or else there will not be a unique deepest block.
420 DEBUG_ONLY(assert_dom(deepb, inb, load, bbs));
421 deepb = inb; // Save deepest block
422 deepb_dom_depth = deepb->_dom_depth;
423 }
424 }
425 early = deepb;
426 }
427
428 return early;
429 }
430
431 //--------------------------insert_anti_dependences---------------------------
432 // A load may need to witness memory that nearby stores can overwrite.
433 // For each nearby store, either insert an "anti-dependence" edge
434 // from the load to the store, or else move LCA upward to force the
435 // load to (eventually) be scheduled in a block above the store.
436 //
437 // Do not add edges to stores on distinct control-flow paths;
438 // only add edges to stores which might interfere.
439 //
440 // Return the (updated) LCA. There will not be any possibly interfering
471 "String equals is a 'load' that does not conflict with any stores");
472 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
473 "String indexOf is a 'load' that does not conflict with any stores");
474 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
475 "Arrays equals is a 'load' that does not conflict with any stores");
476
477 if (!C->alias_type(load_alias_idx)->is_rewritable()) {
478 // It is impossible to spoil this load by putting stores before it,
479 // because we know that the stores will never update the value
480 // which 'load' must witness.
481 return LCA;
482 }
483
484 node_idx_t load_index = load->_idx;
485
486 // Note the earliest legal placement of 'load', as determined
487 // by the unique point in the dom tree where all memory effects
488 // and other inputs are first available. (Computed by schedule_early.)
489 // For normal loads, 'early' is the shallowest place (dom graph wise)
490 // to look for anti-deps between this load and any store.
491 Block* early = _bbs[load_index];
492
493 // If we are subsuming loads, compute an "early" block that only considers
494 // memory or address inputs. This block may be different from the
495 // schedule_early block in that it could be at an even shallower depth in the
496 // dominator tree, and allow for a broader discovery of anti-dependences.
497 if (C->subsume_loads()) {
498 early = memory_early_block(load, early, _bbs);
499 }
500
501 ResourceArea *area = Thread::current()->resource_area();
502 Node_List worklist_mem(area); // prior memory state to store
503 Node_List worklist_store(area); // possible-def to explore
504 Node_List worklist_visited(area); // visited mergemem nodes
505 Node_List non_early_stores(area); // all relevant stores outside of early
506 bool must_raise_LCA = false;
507
508 #ifdef TRACK_PHI_INPUTS
509 // %%% This extra checking fails because MergeMem nodes are not GVNed.
510 // Provide "phi_inputs" to check if every input to a PhiNode is from the
511 // original memory state. This indicates a PhiNode which should not
512 // prevent the load from sinking. For such a block, set_raise_LCA_mark
513 // may be overly conservative.
514 // Mechanism: count inputs seen for each Phi encountered in worklist_store.
515 DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
516 #endif
517
518 // 'load' uses some memory state; look for users of the same state.
602 // instead of control + memory.
603 if (mstore->ideal_Opcode() == Op_SafePoint)
604 continue;
605 } else {
606 // Some raw memory, such as the load of "top" at an allocation,
607 // can be control dependent on the previous safepoint. See
608 // comments in GraphKit::allocate_heap() about control input.
609 // Inserting an anti-dep between such a safepoint and a use
610 // creates a cycle, and will cause a subsequent failure in
611 // local scheduling. (BugId 4919904)
612 // (%%% How can a control input be a safepoint and not a projection??)
613 if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
614 continue;
615 }
616 }
617
618 // Identify a block that the current load must be above,
619 // or else observe that 'store' is all the way up in the
620 // earliest legal block for 'load'. In the latter case,
621 // immediately insert an anti-dependence edge.
622 Block* store_block = _bbs[store->_idx];
623 assert(store_block != NULL, "unused killing projections skipped above");
624
625 if (store->is_Phi()) {
626 // 'load' uses memory which is one (or more) of the Phi's inputs.
627 // It must be scheduled not before the Phi, but rather before
628 // each of the relevant Phi inputs.
629 //
630 // Instead of finding the LCA of all inputs to a Phi that match 'mem',
631 // we mark each corresponding predecessor block and do a combined
632 // hoisting operation later (raise_LCA_above_marks).
633 //
634 // Do not assert(store_block != early, "Phi merging memory after access")
635 // PhiNode may be at start of block 'early' with backedge to 'early'
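// Each matching predecessor block is marked with the load's node index so
// that a single raise_LCA_above_marks() call can later hoist the LCA above
// all of them at once.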
636 DEBUG_ONLY(bool found_match = false);
637 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
638 if (store->in(j) == mem) { // Found matching input?
639 DEBUG_ONLY(found_match = true);
640 Block* pred_block = _bbs[store_block->pred(j)->_idx];
641 if (pred_block != early) {
642 // If any predecessor of the Phi matches the load's "early block",
643 // we do not need a precedence edge between the Phi and 'load'
644 // since the load will be forced into a block preceding the Phi.
645 pred_block->set_raise_LCA_mark(load_index);
646 assert(!LCA_orig->dominates(pred_block) ||
647 early->dominates(pred_block), "early is high enough");
648 must_raise_LCA = true;
649 } else {
650 // anti-dependent upon PHI pinned below 'early', no edge needed
651 LCA = early; // but can not schedule below 'early'
652 }
653 }
654 }
655 assert(found_match, "no worklist bug");
656 #ifdef TRACK_PHI_INPUTS
657 #ifdef ASSERT
658 // This assert asks about correct handling of PhiNodes, which may not
659 // have all input edges directly from 'mem'. See BugId 4621264
660 int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
694 }
695 }
696 // (Worklist is now empty; all nearby stores have been visited.)
697
698 // Finished if 'load' must be scheduled in its 'early' block.
699 // If we found any stores there, they have already been given
700 // precedence edges.
701 if (LCA == early) return LCA;
702
703 // We get here only if there are no possibly-interfering stores
704 // in the load's 'early' block. Move LCA up above all predecessors
705 // which contain stores we have noted.
706 //
707 // The raised LCA block can be a home to such interfering stores,
708 // but its predecessors must not contain any such stores.
709 //
710 // The raised LCA will be a lower bound for placing the load,
711 // preventing the load from sinking past any block containing
712 // a store that may invalidate the memory state required by 'load'.
713 if (must_raise_LCA)
714 LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs);
715 if (LCA == early) return LCA;
716
717 // Insert anti-dependence edges from 'load' to each store
718 // in the non-early LCA block.
719 // Mine the non_early_stores list for such stores.
720 if (LCA->raise_LCA_mark() == load_index) {
721 while (non_early_stores.size() > 0) {
722 Node* store = non_early_stores.pop();
723 Block* store_block = _bbs[store->_idx];
724 if (store_block == LCA) {
725 // add anti_dependence from store to load in its own block
726 assert(store != load->in(0), "dependence cycle found");
727 if (verify) {
728 assert(store->find_edge(load) != -1, "missing precedence edge");
729 } else {
730 store->add_prec(load);
731 }
732 } else {
733 assert(store_block->raise_LCA_mark() == load_index, "block was marked");
734 // Any other stores we found must be either inside the new LCA
735 // or else outside the original LCA. In the latter case, they
736 // did not interfere with any use of 'load'.
737 assert(LCA->dominates(store_block)
738 || !LCA_orig->dominates(store_block), "no stray stores");
739 }
740 }
741 }
742
743 // Return the highest block containing stores; any stores
744 // within that block have been given anti-dependence edges.
745 return LCA;
746 }
747
748 // This class is used to iterate backwards over the nodes in the graph.
749
750 class Node_Backward_Iterator {
751
752 private:
753 Node_Backward_Iterator();
754
755 public:
756 // Constructor for the iterator
757 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs);
758
759 // Return the next node in the backward traversal (NULL when done)
760 Node *next();
761
762 private:
763 VectorSet &_visited;
764 Node_List &_stack;
765 Block_Array &_bbs;
766 };
767
768 // Constructor for the Node_Backward_Iterator
769 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs )
770 : _visited(visited), _stack(stack), _bbs(bbs) {
771 // The stack should contain exactly the root
772 stack.clear();
773 stack.push(root);
774
775 // Clear the visited bits
776 visited.Clear();
777 }
778
779 // Iterator for the Node_Backward_Iterator
780 Node *Node_Backward_Iterator::next() {
781
782 // If the _stack is empty, then just return NULL: finished.
783 if ( !_stack.size() )
784 return NULL;
785
786 // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
787 // made stateless, so I do not need to record the index 'i' on my _stack.
788 // Instead I visit all users each time, scanning for unvisited users.
789 // I visit unvisited not-anti-dependence users first, then anti-dependent
790 // children next.
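// A node is returned (post-visited) only after all of its qualifying uses
// have themselves been visited.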
791 Node *self = _stack.pop();
792
793 // I cycle here when I am entering a deeper level of recursion.
794 // The key variable 'self' was set prior to jumping here.
795 while( 1 ) {
796
797 _visited.set(self->_idx);
798
799 // Now schedule all uses as late as possible.
800 uint src = self->is_Proj() ? self->in(0)->_idx : self->_idx;
801 uint src_rpo = _bbs[src]->_rpo;
802
803 // Schedule all nodes in a post-order visit
804 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
805
806 // Scan for unvisited nodes
807 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
808 // For all uses, schedule late
809 Node* n = self->fast_out(i); // Use
810
811 // Skip already visited children
812 if ( _visited.test(n->_idx) )
813 continue;
814
815 // do not traverse backward control edges
816 Node *use = n->is_Proj() ? n->in(0) : n;
817 uint use_rpo = _bbs[use->_idx]->_rpo;
818
819 if ( use_rpo < src_rpo )
820 continue;
821
822 // Phi nodes always precede uses in a basic block
823 if ( use_rpo == src_rpo && use->is_Phi() )
824 continue;
825
826 unvisited = n; // Found unvisited
827
828 // Check for possible-anti-dependent
829 if( !n->needs_anti_dependence_check() )
830 break; // Not visited, not anti-dep; schedule it NOW
831 }
832
833 // Did I find an unvisited not-anti-dependent Node?
834 if ( !unvisited )
835 break; // All done with children; post-visit 'self'
836
837 // Visit the unvisited Node. Contains the obvious push to
838 // indicate I'm entering a deeper level of recursion. I push the
839 // old state onto the _stack and set a new state and loop (recurse).
840 _stack.push(self);
841 self = unvisited;
842 } // End recursion loop
843
844 return self;
845 }
846
847 //------------------------------ComputeLatenciesBackwards----------------------
848 // Compute the latency of all the instructions.
849 void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
850 #ifndef PRODUCT
851 if (trace_opto_pipelining())
852 tty->print("\n#---- ComputeLatenciesBackwards ----\n");
853 #endif
854
855 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
856 Node *n;
857
858 // Walk over all the nodes from last to first
859 while (n = iter.next()) {
860 // Set the latency for the definitions of this instruction
861 partial_latency_of_defs(n);
862 }
863 } // end ComputeLatenciesBackwards
864
865 //------------------------------partial_latency_of_defs------------------------
866 // Compute the latency impact of this node on all defs. This computes
867 // a number that increases as we approach the beginning of the routine.
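// In effect, for every use n of a def d reached through input edge j:
//   latency[d] = max(latency[d], latency[n] + n->latency(j))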
868 void PhaseCFG::partial_latency_of_defs(Node *n) {
869 // Set the latency for this instruction
870 #ifndef PRODUCT
871 if (trace_opto_pipelining()) {
872 tty->print("# partial_latency_of_defs: node_latency[%d] = %d for node",
873 n->_idx, _node_latency->at_grow(n->_idx));
874 dump();
875 }
876 #endif
877
878 if (n->is_Proj())
879 n = n->in(0);
880
881 if (n->is_Root())
882 return;
883
884 uint nlen = n->len();
885 uint use_latency = _node_latency->at_grow(n->_idx);
886 uint use_pre_order = _bbs[n->_idx]->_pre_order;
887
888 for ( uint j=0; j<nlen; j++ ) {
889 Node *def = n->in(j);
890
891 if (!def || def == n)
892 continue;
893
894 // Walk backwards thru projections
895 if (def->is_Proj())
896 def = def->in(0);
897
898 #ifndef PRODUCT
899 if (trace_opto_pipelining()) {
900 tty->print("# in(%2d): ", j);
901 def->dump();
902 }
903 #endif
904
905 // If the defining block is not known, assume it is ok
906 Block *def_block = _bbs[def->_idx];
907 uint def_pre_order = def_block ? def_block->_pre_order : 0;
908
909 if ( (use_pre_order < def_pre_order) ||
910 (use_pre_order == def_pre_order && n->is_Phi()) )
911 continue;
912
913 uint delta_latency = n->latency(j);
914 uint current_latency = delta_latency + use_latency;
915
916 if (_node_latency->at_grow(def->_idx) < current_latency) {
917 _node_latency->at_put_grow(def->_idx, current_latency);
918 }
919
920 #ifndef PRODUCT
921 if (trace_opto_pipelining()) {
922 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
923 use_latency, j, delta_latency, current_latency, def->_idx,
924 _node_latency->at_grow(def->_idx));
925 }
926 #endif
927 }
928 }
929
930 //------------------------------latency_from_use-------------------------------
931 // Compute the latency of a specific use
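// For a non-projection use this is the maximum, over input edges j with
// use->in(j) == n, of node_latency[use] + use->latency(j); uses that cannot
// constrain the def (self, Root, earlier blocks, same-block Phis) count as 0.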
932 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
933 // If self-reference, return no latency
934 if (use == n || use->is_Root())
935 return 0;
936
937 uint def_pre_order = _bbs[def->_idx]->_pre_order;
938 uint latency = 0;
939
940 // If the use is not a projection, then it is simple...
941 if (!use->is_Proj()) {
942 #ifndef PRODUCT
943 if (trace_opto_pipelining()) {
944 tty->print("# out(): ");
945 use->dump();
946 }
947 #endif
948
949 uint use_pre_order = _bbs[use->_idx]->_pre_order;
950
951 if (use_pre_order < def_pre_order)
952 return 0;
953
954 if (use_pre_order == def_pre_order && use->is_Phi())
955 return 0;
956
957 uint nlen = use->len();
958 uint nl = _node_latency->at_grow(use->_idx);
959
960 for ( uint j=0; j<nlen; j++ ) {
961 if (use->in(j) == n) {
962 // Change this if we want local latencies
963 uint ul = use->latency(j);
964 uint l = ul + nl;
965 if (latency < l) latency = l;
966 #ifndef PRODUCT
967 if (trace_opto_pipelining()) {
968 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
969 nl, j, ul, l, latency);
1001 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
1002 uint l = latency_from_use(n, def, n->fast_out(i));
1003
1004 if (latency < l) latency = l;
1005 }
1006
1007 _node_latency->at_put_grow(n->_idx, latency);
1008 }
1009
1010 //------------------------------hoist_to_cheaper_block-------------------------
1011 // Pick a block for node self, between early and LCA, that is a cheaper
1012 // alternative to LCA.
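// "Cheaper" here means a lower estimated execution frequency (Block::_freq);
// the node's latency is consulted as well, since hoisting can also help hide
// instruction latency.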
1013 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
1014 const double delta = 1+PROB_UNLIKELY_MAG(4);
1015 Block* least = LCA;
1016 double least_freq = least->_freq;
1017 uint target = _node_latency->at_grow(self->_idx);
1018 uint start_latency = _node_latency->at_grow(LCA->_nodes[0]->_idx);
1019 uint end_latency = _node_latency->at_grow(LCA->_nodes[LCA->end_idx()]->_idx);
1020 bool in_latency = (target <= start_latency);
1021 const Block* root_block = _bbs[_root->_idx];
1022
1023 // Turn off latency scheduling if scheduling is just plain off
1024 if (!C->do_scheduling())
1025 in_latency = true;
1026
1027 // Do not hoist (to cover latency) instructions which target a
1028 // single register. Hoisting stretches the live range of the
1029 // single register and may force spilling.
1030 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1031 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1032 in_latency = true;
1033
1034 #ifndef PRODUCT
1035 if (trace_opto_pipelining()) {
1036 tty->print("# Find cheaper block for latency %d: ",
1037 _node_latency->at_grow(self->_idx));
1038 self->dump();
1039 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1040 LCA->_pre_order,
1041 LCA->_nodes[0]->_idx,
1109 #endif
1110 _node_latency->at_put_grow(self->_idx, end_latency);
1111 partial_latency_of_defs(self);
1112 }
1113
1114 return least;
1115 }
1116
1117
1118 //------------------------------schedule_late-----------------------------------
1119 // Now schedule all nodes as LATE as possible. This is the LCA in the
1120 // dominator tree of all USES of a value. Pick the block with the least
1121 // loop nesting depth that is lowest in the dominator tree.
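// For each node this computes the dominator-tree LCA of all its uses and
// then, where allowed, picks a cheaper block between 'early' and that LCA
// (see hoist_to_cheaper_block above).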
1122 extern const char must_clone[];
1123 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1124 #ifndef PRODUCT
1125 if (trace_opto_pipelining())
1126 tty->print("\n#---- schedule_late ----\n");
1127 #endif
1128
1129 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
1130 Node *self;
1131
1132 // Walk over all the nodes from last to first
1133 while (self = iter.next()) {
1134 Block* early = _bbs[self->_idx]; // Earliest legal placement
1135
1136 if (self->is_top()) {
1137 // Top node goes in bb #2 with other constants.
1138 // It must be special-cased, because it has no out edges.
1139 early->add_inst(self);
1140 continue;
1141 }
1142
1143 // No uses, just terminate
1144 if (self->outcnt() == 0) {
1145 assert(self->is_MachProj(), "sanity");
1146 continue; // Must be a dead machine projection
1147 }
1148
1149 // If node is pinned in the block, then no scheduling can be done.
1150 if( self->pinned() ) // Pinned in block?
1151 continue;
1152
1153 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1154 if (mach) {
1162 // Don't move CheckCastPP nodes away from their input, if the input
1163 // is a rawptr (5071820).
1164 Node *def = self->in(1);
1165 if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1166 early->add_inst(self);
1167 #ifdef ASSERT
1168 _raw_oops.push(def);
1169 #endif
1170 continue;
1171 }
1172 break;
1173 }
1174 }
1175
1176 // Gather LCA of all uses
1177 Block *LCA = NULL;
1178 {
1179 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1180 // For all uses, find LCA
1181 Node* use = self->fast_out(i);
1182 LCA = raise_LCA_above_use(LCA, use, self, _bbs);
1183 }
1184 } // (Hide defs of imax, i from rest of block.)
1185
1186 // Place temps in the block of their use. This isn't a
1187 // requirement for correctness but it reduces useless
1188 // interference between temps and other nodes.
1189 if (mach != NULL && mach->is_MachTemp()) {
1190 _bbs.map(self->_idx, LCA);
1191 LCA->add_inst(self);
1192 continue;
1193 }
1194
1195 // Check if 'self' could be anti-dependent on memory
1196 if (self->needs_anti_dependence_check()) {
1197 // Hoist LCA above possible-defs and insert anti-dependences to
1198 // defs in new LCA block.
1199 LCA = insert_anti_dependences(LCA, self);
1200 }
1201
1202 if (early->_dom_depth > LCA->_dom_depth) {
1203 // Somehow the LCA has moved above the earliest legal point.
1204 // (One way this can happen is via memory_early_block.)
1205 if (C->subsume_loads() == true && !C->failing()) {
1206 // Retry with subsume_loads == false
1207 // If this is the first failure, the sentinel string will "stick"
1208 // to the Compile object, and the C2Compiler will see it and retry.
1209 C->record_failure(C2Compiler::retry_no_subsuming_loads());
1210 } else {
1245 // Since precedence edges are only inserted when we're sure they
1246 // are needed, make sure that after placement in a block we don't
1247 // need any new precedence edges.
1248 verify_anti_dependences(late, self);
1249 }
1250 #endif
1251 } // Loop until all nodes have been visited
1252
1253 } // end ScheduleLate
1254
1255 //------------------------------GlobalCodeMotion-------------------------------
1256 void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
1257 ResourceMark rm;
1258
1259 #ifndef PRODUCT
1260 if (trace_opto_pipelining()) {
1261 tty->print("\n---- Start GlobalCodeMotion ----\n");
1262 }
1263 #endif
1264
1265 // Initialize the bbs.map for things on the proj_list
1266 uint i;
1267 for( i=0; i < proj_list.size(); i++ )
1268 _bbs.map(proj_list[i]->_idx, NULL);
1269
1270 // Set the basic block for Nodes pinned into blocks
1271 Arena *a = Thread::current()->resource_area();
1272 VectorSet visited(a);
1273 schedule_pinned_nodes( visited );
1274
1275 // Find the earliest Block any instruction can be placed in. Some
1276 // instructions are pinned into Blocks. Unpinned instructions can
1277 // appear in last block in which all their inputs occur.
1278 visited.Clear();
1279 Node_List stack(a);
1280 stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
1281 if (!schedule_early(visited, stack)) {
1282 // Bailout without retry
1283 C->record_method_not_compilable("early schedule failed");
1284 return;
1285 }
1286
1287 // Build Def-Use edges.
1288 proj_list.push(_root); // Add real root as another root
1316
1317 // Detect implicit-null-check opportunities. Basically, find NULL checks
1318 // with suitable memory ops nearby. Use the memory op to do the NULL check.
1319 // I can generate a memory op if there is not one nearby.
1320 if (C->is_method_compilation()) {
1321 // Don't do it for natives, adapters, or runtime stubs
1322 int allowed_reasons = 0;
1323 // ...and don't do it when there have been too many traps, globally.
1324 for (int reason = (int)Deoptimization::Reason_none+1;
1325 reason < Compile::trapHistLength; reason++) {
1326 assert(reason < BitsPerInt, "recode bit map");
1327 if (!C->too_many_traps((Deoptimization::DeoptReason) reason))
1328 allowed_reasons |= nth_bit(reason);
1329 }
1330 // By reversing the loop direction we get a very minor gain on mpegaudio.
1331 // Feel free to revert to a forward loop for clarity.
1332 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1333 for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) {
1334 Node *proj = matcher._null_check_tests[i ];
1335 Node *val = matcher._null_check_tests[i+1];
1336 _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons);
1337 // The implicit_null_check will only perform the transformation
1338 // if the null branch is truly uncommon, *and* it leads to an
1339 // uncommon trap. Combined with the too_many_traps guards
1340 // above, this prevents SEGV storms reported in 6366351,
1341 // by recompiling offending methods without this optimization.
1342 }
1343 }
1344
1345 #ifndef PRODUCT
1346 if (trace_opto_pipelining()) {
1347 tty->print("\n---- Start Local Scheduling ----\n");
1348 }
1349 #endif
1350
1351 // Schedule locally. Right now a simple topological sort.
1352 // Later, do a real latency aware scheduler.
1353 uint max_idx = C->unique();
1354 GrowableArray<int> ready_cnt(max_idx, max_idx, -1);
1355 visited.Clear();
1356 for (i = 0; i < _num_blocks; i++) {
1357 if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) {
1358 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1359 C->record_method_not_compilable("local schedule failed");
1360 }
1361 return;
1362 }
1363 }
1364
1365 // If we inserted any instructions between a Call and its CatchNode,
1366 // clone the instructions on all paths below the Catch.
1367 for( i=0; i < _num_blocks; i++ )
1368 _blocks[i]->call_catch_cleanup(_bbs, C);
1369
1370 #ifndef PRODUCT
1371 if (trace_opto_pipelining()) {
1372 tty->print("\n---- After GlobalCodeMotion ----\n");
1373 for (uint i = 0; i < _num_blocks; i++) {
1374 _blocks[i]->dump();
1375 }
1376 }
1377 #endif
1378 // Dead.
1379 _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1380 }
1381
1382
1383 //------------------------------Estimate_Block_Frequency-----------------------
1384 // Estimate block frequencies based on IfNode probabilities.
1385 void PhaseCFG::Estimate_Block_Frequency() {
1386
1387 // Force conditional branches leading to uncommon traps to be unlikely,
1388 // not because we get to the uncommon_trap with less relative frequency,
1389 // but because an uncommon_trap typically causes a deopt, so we only get
1390 // there once.
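// With frequency-based layout the uncommon-trap paths are adjusted via
// update_uncommon_branch() below; otherwise their block frequencies are
// clamped to PROB_MIN further down.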
1391 if (C->do_freq_based_layout()) {
1392 Block_List worklist;
1393 Block* root_blk = _blocks[0];
1394 for (uint i = 1; i < root_blk->num_preds(); i++) {
1395 Block *pb = _bbs[root_blk->pred(i)->_idx];
1396 if (pb->has_uncommon_code()) {
1397 worklist.push(pb);
1398 }
1399 }
1400 while (worklist.size() > 0) {
1401 Block* uct = worklist.pop();
1402 if (uct == _broot) continue;
1403 for (uint i = 1; i < uct->num_preds(); i++) {
1404 Block *pb = _bbs[uct->pred(i)->_idx];
1405 if (pb->_num_succs == 1) {
1406 worklist.push(pb);
1407 } else if (pb->num_fall_throughs() == 2) {
1408 pb->update_uncommon_branch(uct);
1409 }
1410 }
1411 }
1412 }
1413
1414 // Create the loop tree and calculate loop depth.
1415 _root_loop = create_loop_tree();
1416 _root_loop->compute_loop_depth(0);
1417
1418 // Compute block frequency of each block, relative to a single loop entry.
1419 _root_loop->compute_freq();
1420
1421 // Adjust all frequencies to be relative to a single method entry
1422 _root_loop->_freq = 1.0;
1423 _root_loop->scale_freq();
1424
1425 // Save outermost loop frequency for LRG frequency threshold
1426 _outer_loop_freq = _root_loop->outer_loop_freq();
1427
1428 // force paths ending at uncommon traps to be infrequent
1429 if (!C->do_freq_based_layout()) {
1430 Block_List worklist;
1431 Block* root_blk = _blocks[0];
1432 for (uint i = 1; i < root_blk->num_preds(); i++) {
1433 Block *pb = _bbs[root_blk->pred(i)->_idx];
1434 if (pb->has_uncommon_code()) {
1435 worklist.push(pb);
1436 }
1437 }
1438 while (worklist.size() > 0) {
1439 Block* uct = worklist.pop();
1440 uct->_freq = PROB_MIN;
1441 for (uint i = 1; i < uct->num_preds(); i++) {
1442 Block *pb = _bbs[uct->pred(i)->_idx];
1443 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1444 worklist.push(pb);
1445 }
1446 }
1447 }
1448 }
1449
1450 #ifdef ASSERT
1451 for (uint i = 0; i < _num_blocks; i++ ) {
1452 Block *b = _blocks[i];
1453 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1454 }
1455 #endif
1456
1457 #ifndef PRODUCT
1458 if (PrintCFGBlockFreq) {
1459 tty->print_cr("CFG Block Frequencies");
1460 _root_loop->dump_tree();
1461 if (Verbose) {
1462 tty->print_cr("PhaseCFG dump");
1482 // It doesn't have to be for the loop tree to be built, but if it is not,
1483 // then the blocks have been reordered since dom graph building...which
1484 // may invalidate the RPO numbering
1485 assert(b->_rpo == i, "unexpected reverse post order number");
1486 }
1487 #endif
1488
1489 int idct = 0;
1490 CFGLoop* root_loop = new CFGLoop(idct++);
1491
1492 Block_List worklist;
1493
1494 // Assign blocks to loops
1495 for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
1496 Block *b = _blocks[i];
1497
1498 if (b->head()->is_Loop()) {
1499 Block* loop_head = b;
1500 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1501 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1502 Block* tail = _bbs[tail_n->_idx];
1503
1504 // Defensively filter out Loop nodes for non-single-entry loops.
1505 // For all reasonable loops, the head occurs before the tail in RPO.
1506 if (i <= tail->_rpo) {
1507
1508 // The tail and (recursive) predecessors of the tail
1509 // are made members of a new loop.
1510
1511 assert(worklist.size() == 0, "nonempty worklist");
1512 CFGLoop* nloop = new CFGLoop(idct++);
1513 assert(loop_head->_loop == NULL, "just checking");
1514 loop_head->_loop = nloop;
1515 // Add to nloop so push_pred() will skip over inner loops
1516 nloop->add_member(loop_head);
1517 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs);
1518
1519 while (worklist.size() > 0) {
1520 Block* member = worklist.pop();
1521 if (member != loop_head) {
1522 for (uint j = 1; j < member->num_preds(); j++) {
1523 nloop->push_pred(member, j, worklist, _bbs);
1524 }
1525 }
1526 }
1527 }
1528 }
1529 }
1530
1531 // Create a member list for each loop consisting
1532 // of both blocks and (immediate child) loops.
1533 for (uint i = 0; i < _num_blocks; i++) {
1534 Block *b = _blocks[i];
1535 CFGLoop* lp = b->_loop;
1536 if (lp == NULL) {
1537 // Not assigned to a loop. Add it to the method's pseudo loop.
1538 b->_loop = root_loop;
1539 lp = root_loop;
1540 }
1541 if (lp == root_loop || b != lp->head()) { // loop heads are already members
1542 lp->add_member(b);
1543 }
1544 if (lp != root_loop) {
1545 if (lp->parent() == NULL) {
1546 // Not a nested loop. Make it a child of the method's pseudo loop.
1547 root_loop->add_nested_loop(lp);
1548 }
1549 if (b == lp->head()) {
1550 // Add nested loop to member list of parent loop.
1551 lp->parent()->add_member(lp);
1552 }
1553 }
1554 }
1555
1556 return root_loop;
1557 }
1558
1559 //------------------------------push_pred--------------------------------------
1560 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) {
1561 Node* pred_n = blk->pred(i);
1562 Block* pred = node_to_blk[pred_n->_idx];
1563 CFGLoop *pred_loop = pred->_loop;
1564 if (pred_loop == NULL) {
1565 // Filter out blocks for non-single-entry loops.
1566 // For all reasonable loops, the head occurs before the tail in RPO.
1567 if (pred->_rpo > head()->_rpo) {
1568 pred->_loop = this;
1569 worklist.push(pred);
1570 }
1571 } else if (pred_loop != this) {
1572 // Nested loop.
1573 while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1574 pred_loop = pred_loop->_parent;
1575 }
1576 // Make pred's loop be a child
1577 if (pred_loop->_parent == NULL) {
1578 add_nested_loop(pred_loop);
1579 // Continue with loop entry predecessor.
1580 Block* pred_head = pred_loop->head();
1581 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1582 assert(pred_head != head(), "loop head in only one loop");
1583 push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk);
1584 } else {
1585 assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1586 }
1587 }
1588 }
1589
1590 //------------------------------add_nested_loop--------------------------------
1591 // Make cl a child of the current loop in the loop tree.
1592 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1593 assert(_parent == NULL, "no parent yet");
1594 assert(cl != this, "not my own parent");
1595 cl->_parent = this;
1596 CFGLoop* ch = _child;
1597 if (ch == NULL) {
1598 _child = cl;
1599 } else {
1600 while (ch->_sibling != NULL) { ch = ch->_sibling; }
1601 ch->_sibling = cl;
1602 }
1603 }
49 #endif
50 #ifdef TARGET_ARCH_MODEL_arm
51 # include "adfiles/ad_arm.hpp"
52 #endif
53 #ifdef TARGET_ARCH_MODEL_ppc
54 # include "adfiles/ad_ppc.hpp"
55 #endif
56
57 // Portions of code courtesy of Clifford Click
58
59 // Optimization - Graph Style
60
61 // To avoid float value underflow
62 #define MIN_BLOCK_FREQUENCY 1.e-35f
63
64 //----------------------------schedule_node_into_block-------------------------
65 // Insert node n into block b. Look for projections of n and make sure they
66 // are in b also.
67 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
68 // Set basic block of n, Add n to b,
69 map_node_to_block(n, b);
70 b->add_inst(n);
71
72 // After Matching, nearly any old Node may have projections trailing it.
73 // These are usually machine-dependent flags. In any case, they might
74 // float to another block below this one. Move them up.
75 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
76 Node* use = n->fast_out(i);
77 if (use->is_Proj()) {
78 Block* buse = get_block_for_node(use);
79 if (buse != b) { // In wrong block?
80 if (buse != NULL) {
81 buse->find_remove(use); // Remove from wrong block
82 }
83 map_node_to_block(use, b);
84 b->add_inst(use);
85 }
86 }
87 }
88 }
89
90 //----------------------------replace_block_proj_ctrl-------------------------
91 // Nodes that have is_block_proj() nodes as their control need to use
92 // the appropriate Region for their actual block as their control since
93 // the projection will be in a predecessor block.
94 void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
95 const Node *in0 = n->in(0);
96 assert(in0 != NULL, "Only control-dependent");
97 const Node *p = in0->is_block_proj();
98 if (p != NULL && p != n) { // Control from a block projection?
99 assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
100 // Find trailing Region
101 Block *pb = get_block_for_node(in0); // Block-projection already has basic block
102 uint j = 0;
103 if (pb->_num_succs != 1) { // More than 1 successor?
104 // Search for successor
105 uint max = pb->_nodes.size();
106 assert( max > 1, "" );
107 uint start = max - pb->_num_succs;
108 // Find which output path belongs to projection
109 for (j = start; j < max; j++) {
110 if( pb->_nodes[j] == in0 )
111 break;
112 }
113 assert( j < max, "must find" );
114 // Change control to match head of successor basic block
115 j -= start;
116 }
117 n->set_req(0, pb->_succs[j]->head());
118 }
119 }
120
121
122 //------------------------------schedule_pinned_nodes--------------------------
123 // Set the basic block for Nodes pinned into blocks
124 void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
125 // Allocate node stack of size C->unique()+8 to avoid frequent realloc
126 GrowableArray <Node *> spstack(C->unique()+8);
127 spstack.push(_root);
128 while ( spstack.is_nonempty() ) {
129 Node *n = spstack.pop();
130 if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited
131 if( n->pinned() && !has_block(n)) { // Pinned? Nail it down!
132 assert( n->in(0), "pinned Node must have Control" );
133 // Before setting block replace block_proj control edge
134 replace_block_proj_ctrl(n);
135 Node *input = n->in(0);
136 while (!input->is_block_start()) {
137 input = input->in(0);
138 }
139 Block *b = get_block_for_node(input); // Basic block of controlling input
140 schedule_node_into_block(n, b);
141 }
142 for( int i = n->req() - 1; i >= 0; --i ) { // For all inputs
143 if( n->in(i) != NULL )
144 spstack.push(n->in(i));
145 }
146 }
147 }
148 }
149
150 #ifdef ASSERT
151 // Assert that new input b2 is dominated by all previous inputs.
152 // Check this by seeing that it is dominated by b1, the deepest
153 // input observed before b2.
154 static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
155 if (b1 == NULL) return;
156 assert(b1->_dom_depth < b2->_dom_depth, "sanity");
157 Block* tmp = b2;
158 while (tmp != b1 && tmp != NULL) {
159 tmp = tmp->_idom;
160 }
161 if (tmp != b1) {
162 // Detected an unschedulable graph. Print some nice stuff and die.
163 tty->print_cr("!!! Unschedulable graph !!!");
164 for (uint j=0; j<n->len(); j++) { // For all inputs
165 Node* inn = n->in(j); // Get input
166 if (inn == NULL) continue; // Ignore NULL, missing inputs
167 Block* inb = cfg->get_block_for_node(inn);
168 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
169 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
170 inn->dump();
171 }
172 tty->print("Failing node: ");
173 n->dump();
174 assert(false, "unschedulable graph");
175 }
176 }
177 #endif
178
179 static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
180 // Find the last input dominated by all other inputs.
181 Block* deepb = NULL; // Deepest block so far
182 int deepb_dom_depth = 0;
183 for (uint k = 0; k < n->len(); k++) { // For all inputs
184 Node* inn = n->in(k); // Get input
185 if (inn == NULL) continue; // Ignore NULL, missing inputs
186 Block* inb = cfg->get_block_for_node(inn);
187 assert(inb != NULL, "must already have scheduled this input");
188 if (deepb_dom_depth < (int) inb->_dom_depth) {
189 // The new inb must be dominated by the previous deepb.
190 // The various inputs must be linearly ordered in the dom
191 // tree, or else there will not be a unique deepest block.
192 DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
193 deepb = inb; // Save deepest block
194 deepb_dom_depth = deepb->_dom_depth;
195 }
196 }
197 assert(deepb != NULL, "must be at least one input to n");
198 return deepb;
199 }
200
201
202 //------------------------------schedule_early---------------------------------
203 // Find the earliest Block any instruction can be placed in. Some instructions
204 // are pinned into Blocks. Unpinned instructions can appear in the last
205 // block in which all their inputs occur.
206 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
207 // Allocate stack with enough space to avoid frequent realloc
208 Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats
209 // roots.push(_root); _root will be processed among C->top() inputs
210 roots.push(C->top());
211 visited.set(C->top()->_idx);
212
228 const Node *in0 = n->in(0);
229 if (in0 != NULL) { // Control-dependent?
230 replace_block_proj_ctrl(n);
231 } else { // n->in(0) == NULL
232 if (n->req() == 1) { // This guy is a constant with NO inputs?
233 n->set_req(0, _root);
234 }
235 }
236 }
237
238 // First, visit all inputs and force them to get a block. If an
239 // input is already in a block we quit following inputs (to avoid
240 // cycles). Instead we put that Node on a worklist to be handled
241 // later (since ITS inputs may not have a block yet).
242 bool done = true; // Assume all n's inputs will be processed
243 while (i < n->len()) { // For all inputs
244 Node *in = n->in(i); // Get input
245 ++i;
246 if (in == NULL) continue; // Ignore NULL, missing inputs
247 int is_visited = visited.test_set(in->_idx);
248 if (!has_block(in)) { // Missing block selection?
249 if (is_visited) {
250 // assert( !visited.test(in->_idx), "did not schedule early" );
251 return false;
252 }
253 nstack.push(n, i); // Save parent node and next input's index.
254 nstack_top_n = in; // Process current input now.
255 nstack_top_i = 0;
256 done = false; // Not all n's inputs processed.
257 break; // continue while_nstack_nonempty;
258 } else if (!is_visited) { // Input not yet visited?
259 roots.push(in); // Visit this guy later, using worklist
260 }
261 }
262 if (done) {
263 // All of n's inputs have been processed, complete post-processing.
264
265 // Some instructions are pinned into a block. These include Region,
266 // Phi, Start, Return, and other control-dependent instructions and
267 // any projections which depend on them.
268 if (!n->pinned()) {
269 // Set earliest legal block.
270 map_node_to_block(n, find_deepest_input(n, this));
271 } else {
272 assert(get_block_for_node(n) == get_block_for_node(n->in(0)), "Pinned Node should be at the same block as its control edge");
273 }
274
275 if (nstack.is_empty()) {
276 // Finished all nodes on stack.
277 // Process next node on the worklist 'roots'.
278 break;
279 }
280 // Get saved parent node and next input's index.
281 nstack_top_n = nstack.node();
282 nstack_top_i = nstack.index();
283 nstack.pop();
284 } // if (done)
285 } // while (true)
286 } // while (roots.size() != 0)
287 return true;
288 }
289
290 //------------------------------dom_lca----------------------------------------
291 // Find least common ancestor in dominator tree
292 // LCA is a current notion of LCA, to be raised above 'this'.
298 Block* anc = this;
299 while (anc->_dom_depth > LCA->_dom_depth)
300 anc = anc->_idom; // Walk up till anc is as high as LCA
301
302 while (LCA->_dom_depth > anc->_dom_depth)
303 LCA = LCA->_idom; // Walk up till LCA is as high as anc
304
305 while (LCA != anc) { // Walk both up till they are the same
306 LCA = LCA->_idom;
307 anc = anc->_idom;
308 }
309
310 return LCA;
311 }
312
313 //--------------------------raise_LCA_above_use--------------------------------
314 // We are placing a definition, and have been given a def->use edge.
315 // The definition must dominate the use, so move the LCA upward in the
316 // dominator tree to dominate the use. If the use is a phi, adjust
317 // the LCA only with the phi input paths which actually use this def.
318 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
319 Block* buse = cfg->get_block_for_node(use);
320 if (buse == NULL) return LCA; // Unused killing Projs have no use block
321 if (!use->is_Phi()) return buse->dom_lca(LCA);
322 uint pmax = use->req(); // Number of Phi inputs
323 // Why doesn't this loop just break after finding the matching input to
324 // the Phi? Well...it's like this. I do not have true def-use/use-def
325 // chains. Means I cannot distinguish, from the def-use direction, which
326 // of many use-defs lead from the same use to the same def. That is, this
327 // Phi might have several uses of the same def. Each use appears in a
328 // different predecessor block. But when I enter here, I cannot distinguish
329 // which use-def edge I should find the predecessor block for. So I find
330 // them all. Means I do a little extra work if a Phi uses the same value
331 // more than once.
332 for (uint j=1; j<pmax; j++) { // For all inputs
333 if (use->in(j) == def) { // Found matching input?
334 Block* pred = cfg->get_block_for_node(buse->pred(j));
335 LCA = pred->dom_lca(LCA);
336 }
337 }
338 return LCA;
339 }
340
341 //----------------------------raise_LCA_above_marks----------------------------
342 // Return a new LCA that dominates LCA and any of its marked predecessors.
343 // Search all my parents up to 'early' (exclusive), looking for predecessors
344 // which are marked with the given index. Return the LCA (in the dom tree)
345 // of all marked blocks. If there are none marked, return the original
346 // LCA.
347 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
348 Block_List worklist;
349 worklist.push(LCA);
350 while (worklist.size() > 0) {
351 Block* mid = worklist.pop();
352 if (mid == early) continue; // stop searching here
353
354 // Test and set the visited bit.
355 if (mid->raise_LCA_visited() == mark) continue; // already visited
356
357 // Don't process the current LCA, otherwise the search may terminate early
358 if (mid != LCA && mid->raise_LCA_mark() == mark) {
359 // Raise the LCA.
360 LCA = mid->dom_lca(LCA);
361 if (LCA == early) break; // stop searching everywhere
362 assert(early->dominates(LCA), "early is high enough");
363 // Resume searching at that point, skipping intermediate levels.
364 worklist.push(LCA);
365 if (LCA == mid)
366 continue; // Don't mark as visited to avoid early termination.
367 } else {
368 // Keep searching through this block's predecessors.
369 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
370 Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
371 worklist.push(mid_parent);
372 }
373 }
374 mid->set_raise_LCA_visited(mark);
375 }
376 return LCA;
377 }
378
379 //--------------------------memory_early_block--------------------------------
380 // This is a variation of find_deepest_input, the heart of schedule_early.
381 // Find the "early" block for a load, if we considered only memory and
382 // address inputs, that is, if other data inputs were ignored.
383 //
384 // Because a subset of edges are considered, the resulting block will
385 // be earlier (at a shallower dom_depth) than the true schedule_early
386 // point of the node. We compute this earlier block as a more permissive
387 // site for anti-dependency insertion, but only if subsume_loads is enabled.
388 static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
389 Node* base;
390 Node* index;
391 Node* store = load->in(MemNode::Memory);
392 load->as_Mach()->memory_inputs(base, index);
393
394 assert(base != NodeSentinel && index != NodeSentinel,
395 "unexpected base/index inputs");
396
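  // Collect the inputs that matter for the memory-early computation: the
  // address base and index and the prior memory state; the control input is
  // added below only when the load has additional inputs to ignore.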
397 Node* mem_inputs[4];
398 int mem_inputs_length = 0;
399 if (base != NULL) mem_inputs[mem_inputs_length++] = base;
400 if (index != NULL) mem_inputs[mem_inputs_length++] = index;
401 if (store != NULL) mem_inputs[mem_inputs_length++] = store;
402
403   // In the comparison below, add one to account for the control input,
404 // which may be null, but always takes up a spot in the in array.
405 if (mem_inputs_length + 1 < (int) load->req()) {
406 // This "load" has more inputs than just the memory, base and index inputs.
407 // For purposes of checking anti-dependences, we need to start
408 // from the early block of only the address portion of the instruction,
409 // and ignore other blocks that may have factored into the wider
410 // schedule_early calculation.
411 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
412
413 Block* deepb = NULL; // Deepest block so far
414 int deepb_dom_depth = 0;
415 for (int i = 0; i < mem_inputs_length; i++) {
416 Block* inb = cfg->get_block_for_node(mem_inputs[i]);
417 if (deepb_dom_depth < (int) inb->_dom_depth) {
418 // The new inb must be dominated by the previous deepb.
419 // The various inputs must be linearly ordered in the dom
420 // tree, or else there will not be a unique deepest block.
421 DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
422 deepb = inb; // Save deepest block
423 deepb_dom_depth = deepb->_dom_depth;
424 }
425 }
426 early = deepb;
427 }
428
429 return early;
430 }
431
432 //--------------------------insert_anti_dependences---------------------------
433 // A load may need to witness memory that nearby stores can overwrite.
434 // For each nearby store, either insert an "anti-dependence" edge
435 // from the load to the store, or else move LCA upward to force the
436 // load to (eventually) be scheduled in a block above the store.
437 //
438 // Do not add edges to stores on distinct control-flow paths;
439 // only add edges to stores which might interfere.
440 //
441 // Return the (updated) LCA. There will not be any possibly interfering
472 "String equals is a 'load' that does not conflict with any stores");
473 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
474 "String indexOf is a 'load' that does not conflict with any stores");
475 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
476          "Arrays equals is a 'load' that does not conflict with any stores");
477
478 if (!C->alias_type(load_alias_idx)->is_rewritable()) {
479 // It is impossible to spoil this load by putting stores before it,
480 // because we know that the stores will never update the value
481 // which 'load' must witness.
482 return LCA;
483 }
484
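  // The load's node index doubles as the 'mark' value for the LCA-raising
  // machinery: blocks the load must stay above are tagged with it via
  // set_raise_LCA_mark and later collected by raise_LCA_above_marks.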
485 node_idx_t load_index = load->_idx;
486
487 // Note the earliest legal placement of 'load', as determined by
488   // the unique point in the dom tree where all memory effects
489 // and other inputs are first available. (Computed by schedule_early.)
490 // For normal loads, 'early' is the shallowest place (dom graph wise)
491 // to look for anti-deps between this load and any store.
492 Block* early = get_block_for_node(load);
493
494 // If we are subsuming loads, compute an "early" block that only considers
495 // memory or address inputs. This block may be different than the
496 // schedule_early block in that it could be at an even shallower depth in the
497 // dominator tree, and allow for a broader discovery of anti-dependences.
498 if (C->subsume_loads()) {
499 early = memory_early_block(load, early, this);
500 }
501
502 ResourceArea *area = Thread::current()->resource_area();
503 Node_List worklist_mem(area); // prior memory state to store
504 Node_List worklist_store(area); // possible-def to explore
505 Node_List worklist_visited(area); // visited mergemem nodes
506 Node_List non_early_stores(area); // all relevant stores outside of early
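  // Set to true whenever a predecessor block gets marked below; it tells the
  // tail of this method to raise the LCA above all marked blocks.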
507 bool must_raise_LCA = false;
508
509 #ifdef TRACK_PHI_INPUTS
510 // %%% This extra checking fails because MergeMem nodes are not GVNed.
511 // Provide "phi_inputs" to check if every input to a PhiNode is from the
512   // original memory state.  Such a PhiNode should not, by itself,
513   // prevent the load from sinking.  For such a block, set_raise_LCA_mark
514 // may be overly conservative.
515 // Mechanism: count inputs seen for each Phi encountered in worklist_store.
516 DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
517 #endif
518
519 // 'load' uses some memory state; look for users of the same state.
603 // instead of control + memory.
604 if (mstore->ideal_Opcode() == Op_SafePoint)
605 continue;
606 } else {
607 // Some raw memory, such as the load of "top" at an allocation,
608 // can be control dependent on the previous safepoint. See
609 // comments in GraphKit::allocate_heap() about control input.
610 // Inserting an anti-dep between such a safepoint and a use
611 // creates a cycle, and will cause a subsequent failure in
612 // local scheduling. (BugId 4919904)
613 // (%%% How can a control input be a safepoint and not a projection??)
614 if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
615 continue;
616 }
617 }
618
619 // Identify a block that the current load must be above,
620 // or else observe that 'store' is all the way up in the
621 // earliest legal block for 'load'. In the latter case,
622 // immediately insert an anti-dependence edge.
623 Block* store_block = get_block_for_node(store);
624 assert(store_block != NULL, "unused killing projections skipped above");
625
626 if (store->is_Phi()) {
627 // 'load' uses memory which is one (or more) of the Phi's inputs.
628 // It must be scheduled not before the Phi, but rather before
629 // each of the relevant Phi inputs.
630 //
631 // Instead of finding the LCA of all inputs to a Phi that match 'mem',
632 // we mark each corresponding predecessor block and do a combined
633 // hoisting operation later (raise_LCA_above_marks).
634 //
635 // Do not assert(store_block != early, "Phi merging memory after access")
636 // PhiNode may be at start of block 'early' with backedge to 'early'
637 DEBUG_ONLY(bool found_match = false);
638 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
639 if (store->in(j) == mem) { // Found matching input?
640 DEBUG_ONLY(found_match = true);
641 Block* pred_block = get_block_for_node(store_block->pred(j));
642 if (pred_block != early) {
643 // If any predecessor of the Phi matches the load's "early block",
644 // we do not need a precedence edge between the Phi and 'load'
645 // since the load will be forced into a block preceding the Phi.
646 pred_block->set_raise_LCA_mark(load_index);
647 assert(!LCA_orig->dominates(pred_block) ||
648 early->dominates(pred_block), "early is high enough");
649 must_raise_LCA = true;
650 } else {
651 // anti-dependent upon PHI pinned below 'early', no edge needed
652             LCA = early;             // but cannot schedule below 'early'
653 }
654 }
655 }
656 assert(found_match, "no worklist bug");
657 #ifdef TRACK_PHI_INPUTS
658 #ifdef ASSERT
659 // This assert asks about correct handling of PhiNodes, which may not
660 // have all input edges directly from 'mem'. See BugId 4621264
661 int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
695 }
696 }
697 // (Worklist is now empty; all nearby stores have been visited.)
698
699 // Finished if 'load' must be scheduled in its 'early' block.
700 // If we found any stores there, they have already been given
701 // precedence edges.
702 if (LCA == early) return LCA;
703
704 // We get here only if there are no possibly-interfering stores
705 // in the load's 'early' block. Move LCA up above all predecessors
706 // which contain stores we have noted.
707 //
708 // The raised LCA block can be a home to such interfering stores,
709 // but its predecessors must not contain any such stores.
710 //
711 // The raised LCA will be a lower bound for placing the load,
712 // preventing the load from sinking past any block containing
713 // a store that may invalidate the memory state required by 'load'.
714 if (must_raise_LCA)
715 LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
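  // Raising the LCA above the marks may push it all the way up to 'early';
  // stores in 'early' already received precedence edges above, so in that
  // case there is nothing more to do.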
716 if (LCA == early) return LCA;
717
718 // Insert anti-dependence edges from 'load' to each store
719 // in the non-early LCA block.
720 // Mine the non_early_stores list for such stores.
721 if (LCA->raise_LCA_mark() == load_index) {
722 while (non_early_stores.size() > 0) {
723 Node* store = non_early_stores.pop();
724 Block* store_block = get_block_for_node(store);
725 if (store_block == LCA) {
726 // add anti_dependence from store to load in its own block
727 assert(store != load->in(0), "dependence cycle found");
728 if (verify) {
729 assert(store->find_edge(load) != -1, "missing precedence edge");
730 } else {
731 store->add_prec(load);
732 }
733 } else {
734 assert(store_block->raise_LCA_mark() == load_index, "block was marked");
735 // Any other stores we found must be either inside the new LCA
736 // or else outside the original LCA. In the latter case, they
737 // did not interfere with any use of 'load'.
738 assert(LCA->dominates(store_block)
739 || !LCA_orig->dominates(store_block), "no stray stores");
740 }
741 }
742 }
743
744 // Return the highest block containing stores; any stores
745 // within that block have been given anti-dependence edges.
746 return LCA;
747 }
748
749 // This class is used to iterate backwards over the nodes in the graph.
750
751 class Node_Backward_Iterator {
752
753 private:
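  // Default construction is not allowed: the iterator must always be given a
  // root, a visited set, a work stack, and the CFG (see the constructor below).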
754 Node_Backward_Iterator();
755
756 public:
757 // Constructor for the iterator
758 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);
759
760   // Return the next node in the backward traversal, or NULL when done
761 Node *next();
762
763 private:
764 VectorSet &_visited;
765 Node_List &_stack;
766 PhaseCFG &_cfg;
767 };
768
769 // Constructor for the Node_Backward_Iterator
770 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
771 : _visited(visited), _stack(stack), _cfg(cfg) {
772 // The stack should contain exactly the root
773 stack.clear();
774 stack.push(root);
775
776 // Clear the visited bits
777 visited.Clear();
778 }
779
780 // Iterator for the Node_Backward_Iterator
781 Node *Node_Backward_Iterator::next() {
782
783 // If the _stack is empty, then just return NULL: finished.
784 if ( !_stack.size() )
785 return NULL;
786
787 // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
788 // made stateless, so I do not need to record the index 'i' on my _stack.
789 // Instead I visit all users each time, scanning for unvisited users.
790   // I visit unvisited non-anti-dependence users first, and anti-dependent
791   // children after that.
792 Node *self = _stack.pop();
793
794 // I cycle here when I am entering a deeper level of recursion.
795 // The key variable 'self' was set prior to jumping here.
796 while( 1 ) {
797
798 _visited.set(self->_idx);
799
800 // Now schedule all uses as late as possible.
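    // A projection lives in the same block as the node it projects from, so
    // look through Projs to the parent node when fetching the block's RPO.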
801 const Node* src = self->is_Proj() ? self->in(0) : self;
802 uint src_rpo = _cfg.get_block_for_node(src)->_rpo;
803
804 // Schedule all nodes in a post-order visit
805 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
806
807 // Scan for unvisited nodes
808 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
809 // For all uses, schedule late
810 Node* n = self->fast_out(i); // Use
811
812 // Skip already visited children
813 if ( _visited.test(n->_idx) )
814 continue;
815
816 // do not traverse backward control edges
817 Node *use = n->is_Proj() ? n->in(0) : n;
818 uint use_rpo = _cfg.get_block_for_node(use)->_rpo;
819
820 if ( use_rpo < src_rpo )
821 continue;
822
823 // Phi nodes always precede uses in a basic block
824 if ( use_rpo == src_rpo && use->is_Phi() )
825 continue;
826
827 unvisited = n; // Found unvisited
828
829 // Check for possible-anti-dependent
830 if( !n->needs_anti_dependence_check() )
831 break; // Not visited, not anti-dep; schedule it NOW
832 }
833
834 // Did I find an unvisited not-anti-dependent Node?
835 if ( !unvisited )
836 break; // All done with children; post-visit 'self'
837
838 // Visit the unvisited Node. Contains the obvious push to
839 // indicate I'm entering a deeper level of recursion. I push the
840 // old state onto the _stack and set a new state and loop (recurse).
841 _stack.push(self);
842 self = unvisited;
843 } // End recursion loop
844
845 return self;
846 }
847
848 //------------------------------ComputeLatenciesBackwards----------------------
849 // Compute the latency of all the instructions.
850 void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
851 #ifndef PRODUCT
852 if (trace_opto_pipelining())
853 tty->print("\n#---- ComputeLatenciesBackwards ----\n");
854 #endif
855
856 Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
857 Node *n;
858
859 // Walk over all the nodes from last to first
860   while ((n = iter.next()) != NULL) {
861 // Set the latency for the definitions of this instruction
862 partial_latency_of_defs(n);
863 }
864 } // end ComputeLatenciesBackwards
865
866 //------------------------------partial_latency_of_defs------------------------
867 // Compute the latency impact of this node on all defs. This computes
868 // a number that increases as we approach the beginning of the routine.
869 void PhaseCFG::partial_latency_of_defs(Node *n) {
870 // Set the latency for this instruction
871 #ifndef PRODUCT
872 if (trace_opto_pipelining()) {
873 tty->print("# latency_to_inputs: node_latency[%d] = %d for node",
874 n->_idx, _node_latency->at_grow(n->_idx));
875 dump();
876 }
877 #endif
878
879 if (n->is_Proj())
880 n = n->in(0);
881
882 if (n->is_Root())
883 return;
884
885 uint nlen = n->len();
886 uint use_latency = _node_latency->at_grow(n->_idx);
887 uint use_pre_order = get_block_for_node(n)->_pre_order;
888
889 for ( uint j=0; j<nlen; j++ ) {
890 Node *def = n->in(j);
891
892 if (!def || def == n)
893 continue;
894
895 // Walk backwards thru projections
896 if (def->is_Proj())
897 def = def->in(0);
898
899 #ifndef PRODUCT
900 if (trace_opto_pipelining()) {
901 tty->print("# in(%2d): ", j);
902 def->dump();
903 }
904 #endif
905
906 // If the defining block is not known, assume it is ok
907 Block *def_block = get_block_for_node(def);
908 uint def_pre_order = def_block ? def_block->_pre_order : 0;
909
910 if ( (use_pre_order < def_pre_order) ||
911 (use_pre_order == def_pre_order && n->is_Phi()) )
912 continue;
913
914 uint delta_latency = n->latency(j);
915 uint current_latency = delta_latency + use_latency;
916
917 if (_node_latency->at_grow(def->_idx) < current_latency) {
918 _node_latency->at_put_grow(def->_idx, current_latency);
919 }
920
921 #ifndef PRODUCT
922 if (trace_opto_pipelining()) {
923 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
924 use_latency, j, delta_latency, current_latency, def->_idx,
925 _node_latency->at_grow(def->_idx));
926 }
927 #endif
928 }
929 }
930
931 //------------------------------latency_from_use-------------------------------
932 // Compute the latency of a specific use
933 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
934 // If self-reference, return no latency
935 if (use == n || use->is_Root()) {
936 return 0;
937 }
938
939 uint def_pre_order = get_block_for_node(def)->_pre_order;
940 uint latency = 0;
941
942 // If the use is not a projection, then it is simple...
943 if (!use->is_Proj()) {
944 #ifndef PRODUCT
945 if (trace_opto_pipelining()) {
946 tty->print("# out(): ");
947 use->dump();
948 }
949 #endif
950
951 uint use_pre_order = get_block_for_node(use)->_pre_order;
952
953 if (use_pre_order < def_pre_order)
954 return 0;
955
956 if (use_pre_order == def_pre_order && use->is_Phi())
957 return 0;
958
959 uint nlen = use->len();
960 uint nl = _node_latency->at_grow(use->_idx);
961
962 for ( uint j=0; j<nlen; j++ ) {
963 if (use->in(j) == n) {
964 // Change this if we want local latencies
965 uint ul = use->latency(j);
966 uint l = ul + nl;
967 if (latency < l) latency = l;
968 #ifndef PRODUCT
969 if (trace_opto_pipelining()) {
970 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
971 nl, j, ul, l, latency);
1003 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
1004 uint l = latency_from_use(n, def, n->fast_out(i));
1005
1006 if (latency < l) latency = l;
1007 }
1008
1009 _node_latency->at_put_grow(n->_idx, latency);
1010 }
1011
1012 //------------------------------hoist_to_cheaper_block-------------------------
1013 // Pick a block for node self, between early and LCA, that is a cheaper
1014 // alternative to LCA.
1015 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
1016 const double delta = 1+PROB_UNLIKELY_MAG(4);
1017 Block* least = LCA;
1018 double least_freq = least->_freq;
1019 uint target = _node_latency->at_grow(self->_idx);
1020 uint start_latency = _node_latency->at_grow(LCA->_nodes[0]->_idx);
1021 uint end_latency = _node_latency->at_grow(LCA->_nodes[LCA->end_idx()]->_idx);
1022 bool in_latency = (target <= start_latency);
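  // in_latency means the value this node defines is not needed before the
  // start of the LCA block, so there is presumably no latency reason to hoist
  // it to an earlier block.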
1023 const Block* root_block = get_block_for_node(_root);
1024
1025 // Turn off latency scheduling if scheduling is just plain off
1026 if (!C->do_scheduling())
1027 in_latency = true;
1028
1029 // Do not hoist (to cover latency) instructions which target a
1030 // single register. Hoisting stretches the live range of the
1031 // single register and may force spilling.
1032 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1033 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1034 in_latency = true;
1035
1036 #ifndef PRODUCT
1037 if (trace_opto_pipelining()) {
1038 tty->print("# Find cheaper block for latency %d: ",
1039 _node_latency->at_grow(self->_idx));
1040 self->dump();
1041 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1042 LCA->_pre_order,
1043 LCA->_nodes[0]->_idx,
1111 #endif
1112 _node_latency->at_put_grow(self->_idx, end_latency);
1113 partial_latency_of_defs(self);
1114 }
1115
1116 return least;
1117 }
1118
1119
1120 //------------------------------schedule_late-----------------------------------
1121 // Now schedule all nodes as LATE as possible. This is the LCA in the
1122 // dominator tree of all USES of a value. Pick the block with the least
1123 // loop nesting depth that is lowest in the dominator tree.
1124 extern const char must_clone[];
1125 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1126 #ifndef PRODUCT
1127 if (trace_opto_pipelining())
1128 tty->print("\n#---- schedule_late ----\n");
1129 #endif
1130
1131 Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
1132 Node *self;
1133
1134 // Walk over all the nodes from last to first
1135   while ((self = iter.next()) != NULL) {
1136 Block* early = get_block_for_node(self); // Earliest legal placement
1137
1138 if (self->is_top()) {
1139 // Top node goes in bb #2 with other constants.
1140 // It must be special-cased, because it has no out edges.
1141 early->add_inst(self);
1142 continue;
1143 }
1144
1145 // No uses, just terminate
1146 if (self->outcnt() == 0) {
1147 assert(self->is_MachProj(), "sanity");
1148 continue; // Must be a dead machine projection
1149 }
1150
1151 // If node is pinned in the block, then no scheduling can be done.
1152 if( self->pinned() ) // Pinned in block?
1153 continue;
1154
1155 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1156 if (mach) {
1164 // Don't move CheckCastPP nodes away from their input, if the input
1165 // is a rawptr (5071820).
1166 Node *def = self->in(1);
1167 if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1168 early->add_inst(self);
1169 #ifdef ASSERT
1170 _raw_oops.push(def);
1171 #endif
1172 continue;
1173 }
1174 break;
1175 }
1176 }
1177
1178 // Gather LCA of all uses
1179 Block *LCA = NULL;
1180 {
1181 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1182 // For all uses, find LCA
1183 Node* use = self->fast_out(i);
1184 LCA = raise_LCA_above_use(LCA, use, self, this);
1185 }
1186 } // (Hide defs of imax, i from rest of block.)
1187
1188 // Place temps in the block of their use. This isn't a
1189 // requirement for correctness but it reduces useless
1190 // interference between temps and other nodes.
1191 if (mach != NULL && mach->is_MachTemp()) {
1192 map_node_to_block(self, LCA);
1193 LCA->add_inst(self);
1194 continue;
1195 }
1196
1197 // Check if 'self' could be anti-dependent on memory
1198 if (self->needs_anti_dependence_check()) {
1199 // Hoist LCA above possible-defs and insert anti-dependences to
1200 // defs in new LCA block.
1201 LCA = insert_anti_dependences(LCA, self);
1202 }
1203
1204 if (early->_dom_depth > LCA->_dom_depth) {
1205 // Somehow the LCA has moved above the earliest legal point.
1206 // (One way this can happen is via memory_early_block.)
1207       if (C->subsume_loads() && !C->failing()) {
1208 // Retry with subsume_loads == false
1209 // If this is the first failure, the sentinel string will "stick"
1210 // to the Compile object, and the C2Compiler will see it and retry.
1211 C->record_failure(C2Compiler::retry_no_subsuming_loads());
1212 } else {
1247 // since precedence edges are only inserted when we're sure they
1248       // are needed, make sure that after placement in a block we don't
1249 // need any new precedence edges.
1250 verify_anti_dependences(late, self);
1251 }
1252 #endif
1253 } // Loop until all nodes have been visited
1254
1255 } // end ScheduleLate
1256
1257 //------------------------------GlobalCodeMotion-------------------------------
1258 void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
1259 ResourceMark rm;
1260
1261 #ifndef PRODUCT
1262 if (trace_opto_pipelining()) {
1263 tty->print("\n---- Start GlobalCodeMotion ----\n");
1264 }
1265 #endif
1266
1267 // Initialize the node to block mapping for things on the proj_list
1268 for (uint i = 0; i < proj_list.size(); i++) {
1269 unmap_node_from_block(proj_list[i]);
1270 }
1271
1272 // Set the basic block for Nodes pinned into blocks
1273 Arena *a = Thread::current()->resource_area();
1274 VectorSet visited(a);
1275 schedule_pinned_nodes( visited );
1276
1277 // Find the earliest Block any instruction can be placed in. Some
1278 // instructions are pinned into Blocks. Unpinned instructions can
1279 // appear in the last block in which all their inputs occur.
1280 visited.Clear();
1281 Node_List stack(a);
1282 stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
1283 if (!schedule_early(visited, stack)) {
1284 // Bailout without retry
1285 C->record_method_not_compilable("early schedule failed");
1286 return;
1287 }
1288
1289 // Build Def-Use edges.
1290 proj_list.push(_root); // Add real root as another root
1318
1319 // Detect implicit-null-check opportunities. Basically, find NULL checks
1320 // with suitable memory ops nearby. Use the memory op to do the NULL check.
1321 // I can generate a memory op if there is not one nearby.
1322 if (C->is_method_compilation()) {
1323 // Don't do it for natives, adapters, or runtime stubs
1324 int allowed_reasons = 0;
1325 // ...and don't do it when there have been too many traps, globally.
1326 for (int reason = (int)Deoptimization::Reason_none+1;
1327 reason < Compile::trapHistLength; reason++) {
1328 assert(reason < BitsPerInt, "recode bit map");
1329 if (!C->too_many_traps((Deoptimization::DeoptReason) reason))
1330 allowed_reasons |= nth_bit(reason);
1331 }
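    // allowed_reasons is a bit mask over DeoptReason values: a set bit means
    // that reason has not trapped too often, so an implicit null check that
    // deopts for it is still acceptable.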
1332 // By reversing the loop direction we get a very minor gain on mpegaudio.
1333 // Feel free to revert to a forward loop for clarity.
1334 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1335 for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) {
1336 Node *proj = matcher._null_check_tests[i ];
1337 Node *val = matcher._null_check_tests[i+1];
1338 get_block_for_node(proj)->implicit_null_check(this, proj, val, allowed_reasons);
1339 // The implicit_null_check will only perform the transformation
1340 // if the null branch is truly uncommon, *and* it leads to an
1341 // uncommon trap. Combined with the too_many_traps guards
1342 // above, this prevents SEGV storms reported in 6366351,
1343 // by recompiling offending methods without this optimization.
1344 }
1345 }
1346
1347 #ifndef PRODUCT
1348 if (trace_opto_pipelining()) {
1349 tty->print("\n---- Start Local Scheduling ----\n");
1350 }
1351 #endif
1352
1353 // Schedule locally. Right now a simple topological sort.
1354 // Later, do a real latency aware scheduler.
1355 uint max_idx = C->unique();
1356 GrowableArray<int> ready_cnt(max_idx, max_idx, -1);
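  // ready_cnt is indexed by node _idx; schedule_local presumably uses it to
  // count how many inputs of each node remain unscheduled (-1 means the entry
  // has not yet been initialized for that node).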
1357 visited.Clear();
1358 for (uint i = 0; i < _num_blocks; i++) {
1359 if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) {
1360 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1361 C->record_method_not_compilable("local schedule failed");
1362 }
1363 return;
1364 }
1365 }
1366
1367   // If we inserted any instructions between a Call and its CatchNode,
1368 // clone the instructions on all paths below the Catch.
1369 for (uint i = 0; i < _num_blocks; i++) {
1370 _blocks[i]->call_catch_cleanup(this, C);
1371 }
1372
1373 #ifndef PRODUCT
1374 if (trace_opto_pipelining()) {
1375 tty->print("\n---- After GlobalCodeMotion ----\n");
1376 for (uint i = 0; i < _num_blocks; i++) {
1377 _blocks[i]->dump();
1378 }
1379 }
1380 #endif
1381 // Dead.
1382 _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1383 }
1384
1385
1386 //------------------------------Estimate_Block_Frequency-----------------------
1387 // Estimate block frequencies based on IfNode probabilities.
1388 void PhaseCFG::Estimate_Block_Frequency() {
1389
1390 // Force conditional branches leading to uncommon traps to be unlikely,
1391 // not because we get to the uncommon_trap with less relative frequency,
1392 // but because an uncommon_trap typically causes a deopt, so we only get
1393 // there once.
1394 if (C->do_freq_based_layout()) {
1395 Block_List worklist;
1396 Block* root_blk = _blocks[0];
1397 for (uint i = 1; i < root_blk->num_preds(); i++) {
1398 Block *pb = get_block_for_node(root_blk->pred(i));
1399 if (pb->has_uncommon_code()) {
1400 worklist.push(pb);
1401 }
1402 }
1403 while (worklist.size() > 0) {
1404 Block* uct = worklist.pop();
1405 if (uct == _broot) continue;
1406 for (uint i = 1; i < uct->num_preds(); i++) {
1407 Block *pb = get_block_for_node(uct->pred(i));
1408 if (pb->_num_succs == 1) {
1409 worklist.push(pb);
1410 } else if (pb->num_fall_throughs() == 2) {
1411 pb->update_uncommon_branch(uct);
1412 }
1413 }
1414 }
1415 }
1416
1417 // Create the loop tree and calculate loop depth.
1418 _root_loop = create_loop_tree();
1419 _root_loop->compute_loop_depth(0);
1420
1421 // Compute block frequency of each block, relative to a single loop entry.
1422 _root_loop->compute_freq();
1423
1424 // Adjust all frequencies to be relative to a single method entry
1425 _root_loop->_freq = 1.0;
1426 _root_loop->scale_freq();
1427
1428   // Save outermost loop frequency for LRG frequency threshold
1429 _outer_loop_freq = _root_loop->outer_loop_freq();
1430
1431 // force paths ending at uncommon traps to be infrequent
1432 if (!C->do_freq_based_layout()) {
1433 Block_List worklist;
1434 Block* root_blk = _blocks[0];
1435 for (uint i = 1; i < root_blk->num_preds(); i++) {
1436 Block *pb = get_block_for_node(root_blk->pred(i));
1437 if (pb->has_uncommon_code()) {
1438 worklist.push(pb);
1439 }
1440 }
1441 while (worklist.size() > 0) {
1442 Block* uct = worklist.pop();
1443 uct->_freq = PROB_MIN;
1444 for (uint i = 1; i < uct->num_preds(); i++) {
1445 Block *pb = get_block_for_node(uct->pred(i));
1446 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1447 worklist.push(pb);
1448 }
1449 }
1450 }
1451 }
1452
1453 #ifdef ASSERT
1454 for (uint i = 0; i < _num_blocks; i++ ) {
1455 Block *b = _blocks[i];
1456 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1457 }
1458 #endif
1459
1460 #ifndef PRODUCT
1461 if (PrintCFGBlockFreq) {
1462 tty->print_cr("CFG Block Frequencies");
1463 _root_loop->dump_tree();
1464 if (Verbose) {
1465 tty->print_cr("PhaseCFG dump");
1485 // It doesn't have to be for the loop tree to be built, but if it is not,
1486 // then the blocks have been reordered since dom graph building...which
1487   // may invalidate the RPO numbering
1488 assert(b->_rpo == i, "unexpected reverse post order number");
1489 }
1490 #endif
1491
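  // Loop ids are assigned in creation order: id 0 is the method's pseudo loop
  // created here, and each real loop discovered below gets the next id.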
1492 int idct = 0;
1493 CFGLoop* root_loop = new CFGLoop(idct++);
1494
1495 Block_List worklist;
1496
1497 // Assign blocks to loops
1498 for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
1499 Block *b = _blocks[i];
1500
1501 if (b->head()->is_Loop()) {
1502 Block* loop_head = b;
1503 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1504 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1505 Block* tail = get_block_for_node(tail_n);
1506
1507 // Defensively filter out Loop nodes for non-single-entry loops.
1508 // For all reasonable loops, the head occurs before the tail in RPO.
1509 if (i <= tail->_rpo) {
1510
1511 // The tail and (recursive) predecessors of the tail
1512 // are made members of a new loop.
1513
1514 assert(worklist.size() == 0, "nonempty worklist");
1515 CFGLoop* nloop = new CFGLoop(idct++);
1516 assert(loop_head->_loop == NULL, "just checking");
1517 loop_head->_loop = nloop;
1518 // Add to nloop so push_pred() will skip over inner loops
1519 nloop->add_member(loop_head);
1520 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);
1521
1522 while (worklist.size() > 0) {
1523 Block* member = worklist.pop();
1524 if (member != loop_head) {
1525 for (uint j = 1; j < member->num_preds(); j++) {
1526 nloop->push_pred(member, j, worklist, this);
1527 }
1528 }
1529 }
1530 }
1531 }
1532 }
1533
1534 // Create a member list for each loop consisting
1535 // of both blocks and (immediate child) loops.
1536 for (uint i = 0; i < _num_blocks; i++) {
1537 Block *b = _blocks[i];
1538 CFGLoop* lp = b->_loop;
1539 if (lp == NULL) {
1540 // Not assigned to a loop. Add it to the method's pseudo loop.
1541 b->_loop = root_loop;
1542 lp = root_loop;
1543 }
1544 if (lp == root_loop || b != lp->head()) { // loop heads are already members
1545 lp->add_member(b);
1546 }
1547 if (lp != root_loop) {
1548 if (lp->parent() == NULL) {
1549 // Not a nested loop. Make it a child of the method's pseudo loop.
1550 root_loop->add_nested_loop(lp);
1551 }
1552 if (b == lp->head()) {
1553 // Add nested loop to member list of parent loop.
1554 lp->parent()->add_member(lp);
1555 }
1556 }
1557 }
1558
1559 return root_loop;
1560 }
1561
1562 //------------------------------push_pred--------------------------------------
1563 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {
1564 Node* pred_n = blk->pred(i);
1565 Block* pred = cfg->get_block_for_node(pred_n);
1566 CFGLoop *pred_loop = pred->_loop;
1567 if (pred_loop == NULL) {
1568 // Filter out blocks for non-single-entry loops.
1569 // For all reasonable loops, the head occurs before the tail in RPO.
1570 if (pred->_rpo > head()->_rpo) {
1571 pred->_loop = this;
1572 worklist.push(pred);
1573 }
1574 } else if (pred_loop != this) {
1575 // Nested loop.
1576 while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1577 pred_loop = pred_loop->_parent;
1578 }
1579       // Make pred's loop a child of this loop
1580 if (pred_loop->_parent == NULL) {
1581 add_nested_loop(pred_loop);
1582 // Continue with loop entry predecessor.
1583 Block* pred_head = pred_loop->head();
1584 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1585 assert(pred_head != head(), "loop head in only one loop");
1586 push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);
1587 } else {
1588 assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1589 }
1590 }
1591 }
1592
1593 //------------------------------add_nested_loop--------------------------------
1594 // Make cl a child of the current loop in the loop tree.
1595 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1596 assert(_parent == NULL, "no parent yet");
1597 assert(cl != this, "not my own parent");
1598 cl->_parent = this;
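  // Children of a loop form a singly linked list threaded through _sibling;
  // append the new child at the tail of that list.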
1599 CFGLoop* ch = _child;
1600 if (ch == NULL) {
1601 _child = cl;
1602 } else {
1603 while (ch->_sibling != NULL) { ch = ch->_sibling; }
1604 ch->_sibling = cl;
1605 }
1606 }
|