/*
 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "opto/block.hpp"
#include "opto/c2compiler.hpp"
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/machnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/phaseX.hpp"
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "runtime/deoptimization.hpp"

// Portions of code courtesy of Clifford Click

// Optimization - Graph Style

// To avoid float value underflow
#define MIN_BLOCK_FREQUENCY 1.e-35f

//----------------------------schedule_node_into_block-------------------------
// Insert node n into block b. Look for projections of n and make sure they
// are in b also.
void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
  // Set basic block of n, Add n to b,
  map_node_to_block(n, b);
  b->add_inst(n);

  // After Matching, nearly any old Node may have projections trailing it.
  // These are usually machine-dependent flags. In any case, they might
  // float to another block below this one. Move them up.
  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
    Node* use = n->fast_out(i);
    if (use->is_Proj()) {
      Block* buse = get_block_for_node(use);
      if (buse != b) {              // In wrong block?
        if (buse != NULL) {
          buse->find_remove(use);   // Remove from wrong block
        }
        map_node_to_block(use, b);
        b->add_inst(use);
      }
    }
  }
}

//----------------------------replace_block_proj_ctrl-------------------------
// Nodes that have is_block_proj() nodes as their control need to use
// the appropriate Region for their actual block as their control since
// the projection will be in a predecessor block.
void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
  const Node *in0 = n->in(0);
  assert(in0 != NULL, "Only control-dependent");
  const Node *p = in0->is_block_proj();
  if (p != NULL && p != n) {    // Control from a block projection?
    assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
    // Find trailing Region
    Block *pb = get_block_for_node(in0); // Block-projection already has basic block
    uint j = 0;
    if (pb->_num_succs != 1) {  // More than 1 successor?
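      // Block-ending projections live in the last _num_succs slots of the
      // block's node array, so the scan below starts at max - _num_succs.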
      // Search for successor
      uint max = pb->number_of_nodes();
      assert( max > 1, "" );
      uint start = max - pb->_num_succs;
      // Find which output path belongs to projection
      for (j = start; j < max; j++) {
        if( pb->get_node(j) == in0 )
          break;
      }
      assert( j < max, "must find" );
      // Change control to match head of successor basic block
      j -= start;
    }
    n->set_req(0, pb->_succs[j]->head());
  }
}

static bool is_dominator(Block* d, Block* n) {
  return d->dom_lca(n) == d;
}

//------------------------------schedule_pinned_nodes--------------------------
// Set the basic block for Nodes pinned into blocks
void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {
  // Allocate node stack of size C->unique()+8 to avoid frequent realloc
  GrowableArray <Node *> spstack(C->unique() + 8);
  spstack.push(_root);
  while (spstack.is_nonempty()) {
    Node* node = spstack.pop();
    if (!visited.test_set(node->_idx)) { // Test node and flag it as visited
      if (node->pinned() && !has_block(node)) { // Pinned? Nail it down!
        assert(node->in(0), "pinned Node must have Control");
        // Before setting block replace block_proj control edge
        replace_block_proj_ctrl(node);
        Node* input = node->in(0);
        while (!input->is_block_start()) {
          input = input->in(0);
        }
        Block* block = get_block_for_node(input); // Basic block of controlling input
        schedule_node_into_block(node, block);
      }

      // If the node has precedence edges (added when CastPP nodes are
      // removed in final_graph_reshaping), fix the control of the
      // node to cover the precedence edges and remove the
      // dependencies.
      Node* n = NULL;
      for (uint i = node->len()-1; i >= node->req(); i--) {
        Node* m = node->in(i);
        if (m == NULL) continue;
        if (m->is_block_proj()) {
          node->rm_prec(i);
          if (n == NULL) {
            n = m;
          } else {
            Block* bn = get_block_for_node(n);
            Block* bm = get_block_for_node(m);
            n = is_dominator(bn, bm) ? m : n;
          }
        }
      }
      if (n != NULL) {
        assert(node->in(0), "control should have been set");
        Block* bn = get_block_for_node(n);
        Block* bnode = get_block_for_node(node->in(0));
        if (!is_dominator(bn, bnode)) {
          node->set_req(0, n);
        }
      }

      // process all inputs that are non NULL
      for (int i = node->req() - 1; i >= 0; --i) {
        if (node->in(i) != NULL) {
          spstack.push(node->in(i));
        }
      }
    }
  }
}

#ifdef ASSERT
// Assert that new input b2 is dominated by all previous inputs.
// Check this by seeing that it is dominated by b1, the deepest
// input observed until b2.
static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
  if (b1 == NULL) return;
  assert(b1->_dom_depth < b2->_dom_depth, "sanity");
  Block* tmp = b2;
  while (tmp != b1 && tmp != NULL) {
    tmp = tmp->_idom;
  }
  if (tmp != b1) {
    // Detected an unschedulable graph. Print some nice stuff and die.
    tty->print_cr("!!! Unschedulable graph !!!");
    for (uint j=0; j<n->len(); j++) { // For all inputs
      Node* inn = n->in(j); // Get input
      if (inn == NULL) continue; // Ignore NULL, missing inputs
      Block* inb = cfg->get_block_for_node(inn);
      tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
                 inb->_idom ?
                 inb->_idom->_pre_order : 0, inb->_dom_depth);
      inn->dump();
    }
    tty->print("Failing node: ");
    n->dump();
    assert(false, "unschedulable graph");
  }
}
#endif

static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
  // Find the last input dominated by all other inputs.
  Block* deepb = NULL;        // Deepest block so far
  int deepb_dom_depth = 0;
  for (uint k = 0; k < n->len(); k++) { // For all inputs
    Node* inn = n->in(k);     // Get input
    if (inn == NULL) continue; // Ignore NULL, missing inputs
    Block* inb = cfg->get_block_for_node(inn);
    assert(inb != NULL, "must already have scheduled this input");
    if (deepb_dom_depth < (int) inb->_dom_depth) {
      // The new inb must be dominated by the previous deepb.
      // The various inputs must be linearly ordered in the dom
      // tree, or else there will not be a unique deepest block.
      DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
      deepb = inb;            // Save deepest block
      deepb_dom_depth = deepb->_dom_depth;
    }
  }
  assert(deepb != NULL, "must be at least one input to n");
  return deepb;
}


//------------------------------schedule_early---------------------------------
// Find the earliest Block any instruction can be placed in. Some instructions
// are pinned into Blocks. Unpinned instructions can appear in the last block in
// which all their inputs occur.
bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
  // Allocate stack with enough space to avoid frequent realloc
  Node_Stack nstack(roots.Size() + 8);
  // _root will be processed among C->top() inputs
  roots.push(C->top());
  visited.set(C->top()->_idx);

  while (roots.size() != 0) {
    // Use local variables nstack_top_n & nstack_top_i to cache values
    // on stack's top.
    Node* parent_node = roots.pop();
    uint input_index = 0;

    while (true) {
      if (input_index == 0) {
        // Fixup some control. Constants without control get attached
        // to root and nodes that use is_block_proj() nodes should be attached
        // to the region that starts their block.
        const Node* control_input = parent_node->in(0);
        if (control_input != NULL) {
          replace_block_proj_ctrl(parent_node);
        } else {
          // Is a constant with NO inputs?
          if (parent_node->req() == 1) {
            parent_node->set_req(0, _root);
          }
        }
      }

      // First, visit all inputs and force them to get a block. If an
      // input is already in a block we quit following inputs (to avoid
      // cycles). Instead we put that Node on a worklist to be handled
      // later (since ITS inputs may not have a block yet).

      // Assume all n's inputs will be processed
      bool done = true;

      while (input_index < parent_node->len()) {
        Node* in = parent_node->in(input_index++);
        if (in == NULL) {
          continue;
        }

        int is_visited = visited.test_set(in->_idx);
        if (!has_block(in)) {
          if (is_visited) {
            return false;
          }
          // Save parent node and next input's index.
          nstack.push(parent_node, input_index);
          // Process current input now.
          parent_node = in;
          input_index = 0;
          // Not all n's inputs processed.
          done = false;
          break;
        } else if (!is_visited) {
          // Visit this guy later, using worklist
          roots.push(in);
        }
      }

      if (done) {
        // All of n's inputs have been processed, complete post-processing.

        // Some instructions are pinned into a block.
        // These include Region,
        // Phi, Start, Return, and other control-dependent instructions and
        // any projections which depend on them.
        if (!parent_node->pinned()) {
          // Set earliest legal block.
          Block* earliest_block = find_deepest_input(parent_node, this);
          map_node_to_block(parent_node, earliest_block);
        } else {
          assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");
        }

        if (nstack.is_empty()) {
          // Finished all nodes on stack.
          // Process next node on the worklist 'roots'.
          break;
        }
        // Get saved parent node and next input's index.
        parent_node = nstack.node();
        input_index = nstack.index();
        nstack.pop();
      }
    }
  }
  return true;
}

//------------------------------dom_lca----------------------------------------
// Find least common ancestor in dominator tree
// LCA is a current notion of LCA, to be raised above 'this'.
// As a convenient boundary condition, return 'this' if LCA is NULL.
// Find the LCA of those two nodes.
Block* Block::dom_lca(Block* LCA) {
  if (LCA == NULL || LCA == this) return this;

  Block* anc = this;
  while (anc->_dom_depth > LCA->_dom_depth)
    anc = anc->_idom;           // Walk up till anc is as high as LCA

  while (LCA->_dom_depth > anc->_dom_depth)
    LCA = LCA->_idom;           // Walk up till LCA is as high as anc

  while (LCA != anc) {          // Walk both up till they are the same
    LCA = LCA->_idom;
    anc = anc->_idom;
  }

  return LCA;
}

//--------------------------raise_LCA_above_use--------------------------------
// We are placing a definition, and have been given a def->use edge.
// The definition must dominate the use, so move the LCA upward in the
// dominator tree to dominate the use. If the use is a phi, adjust
// the LCA only with the phi input paths which actually use this def.
static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
  Block* buse = cfg->get_block_for_node(use);
  if (buse == NULL) return LCA;   // Unused killing Projs have no use block
  if (!use->is_Phi()) return buse->dom_lca(LCA);
  uint pmax = use->req();       // Number of Phi inputs
  // Why does not this loop just break after finding the matching input to
  // the Phi? Well...it's like this. I do not have true def-use/use-def
  // chains. Means I cannot distinguish, from the def-use direction, which
  // of many use-defs lead from the same use to the same def. That is, this
  // Phi might have several uses of the same def. Each use appears in a
  // different predecessor block. But when I enter here, I cannot distinguish
  // which use-def edge I should find the predecessor block for. So I find
  // them all. Means I do a little extra work if a Phi uses the same value
  // more than once.
  for (uint j=1; j<pmax; j++) { // For all inputs
    if (use->in(j) == def) {    // Found matching input?
      Block* pred = cfg->get_block_for_node(buse->pred(j));
      LCA = pred->dom_lca(LCA);
    }
  }
  return LCA;
}

//----------------------------raise_LCA_above_marks----------------------------
// Return a new LCA that dominates LCA and any of its marked predecessors.
// Search all my parents up to 'early' (exclusive), looking for predecessors
// which are marked with the given index. Return the LCA (in the dom tree)
// of all marked blocks.
// If there are none marked, return the original
// LCA.
static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
  Block_List worklist;
  worklist.push(LCA);
  while (worklist.size() > 0) {
    Block* mid = worklist.pop();
    if (mid == early) continue; // stop searching here

    // Test and set the visited bit.
    if (mid->raise_LCA_visited() == mark) continue; // already visited

    // Don't process the current LCA, otherwise the search may terminate early
    if (mid != LCA && mid->raise_LCA_mark() == mark) {
      // Raise the LCA.
      LCA = mid->dom_lca(LCA);
      if (LCA == early) break; // stop searching everywhere
      assert(early->dominates(LCA), "early is high enough");
      // Resume searching at that point, skipping intermediate levels.
      worklist.push(LCA);
      if (LCA == mid)
        continue; // Don't mark as visited to avoid early termination.
    } else {
      // Keep searching through this block's predecessors.
      for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
        Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
        worklist.push(mid_parent);
      }
    }
    mid->set_raise_LCA_visited(mark);
  }
  return LCA;
}

//--------------------------memory_early_block--------------------------------
// This is a variation of find_deepest_input, the heart of schedule_early.
// Find the "early" block for a load, if we considered only memory and
// address inputs, that is, if other data inputs were ignored.
//
// Because a subset of edges are considered, the resulting block will
// be earlier (at a shallower dom_depth) than the true schedule_early
// point of the node. We compute this earlier block as a more permissive
// site for anti-dependency insertion, but only if subsume_loads is enabled.
static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
  Node* base;
  Node* index;
  Node* store = load->in(MemNode::Memory);
  load->as_Mach()->memory_inputs(base, index);

  assert(base != NodeSentinel && index != NodeSentinel,
         "unexpected base/index inputs");

  Node* mem_inputs[4];
  int mem_inputs_length = 0;
  if (base != NULL)  mem_inputs[mem_inputs_length++] = base;
  if (index != NULL) mem_inputs[mem_inputs_length++] = index;
  if (store != NULL) mem_inputs[mem_inputs_length++] = store;

  // In the comparison below, add one to account for the control input,
  // which may be null, but always takes up a spot in the in array.
  if (mem_inputs_length + 1 < (int) load->req()) {
    // This "load" has more inputs than just the memory, base and index inputs.
    // For purposes of checking anti-dependences, we need to start
    // from the early block of only the address portion of the instruction,
    // and ignore other blocks that may have factored into the wider
    // schedule_early calculation.
    if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);

    Block* deepb = NULL;        // Deepest block so far
    int deepb_dom_depth = 0;
    for (int i = 0; i < mem_inputs_length; i++) {
      Block* inb = cfg->get_block_for_node(mem_inputs[i]);
      if (deepb_dom_depth < (int) inb->_dom_depth) {
        // The new inb must be dominated by the previous deepb.
        // The various inputs must be linearly ordered in the dom
        // tree, or else there will not be a unique deepest block.
        DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
        deepb = inb;            // Save deepest block
        deepb_dom_depth = deepb->_dom_depth;
      }
    }
    early = deepb;
  }

  return early;
}

//--------------------------insert_anti_dependences---------------------------
// A load may need to witness memory that nearby stores can overwrite.
// For each nearby store, either insert an "anti-dependence" edge
// from the load to the store, or else move LCA upward to force the
// load to (eventually) be scheduled in a block above the store.
//
// Do not add edges to stores on distinct control-flow paths;
// only add edges to stores which might interfere.
//
// Return the (updated) LCA. There will not be any possibly interfering
// store between the load's "early block" and the updated LCA.
// Any stores in the updated LCA will have new precedence edges
// back to the load. The caller is expected to schedule the load
// in the LCA, in which case the precedence edges will make LCM
// preserve anti-dependences. The caller may also hoist the load
// above the LCA, if it is not the early block.
Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
  assert(load->needs_anti_dependence_check(), "must be a load of some sort");
  assert(LCA != NULL, "");
  DEBUG_ONLY(Block* LCA_orig = LCA);

  // Compute the alias index. Loads and stores with different alias indices
  // do not need anti-dependence edges.
  uint load_alias_idx = C->get_alias_index(load->adr_type());
#ifdef ASSERT
  if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
      (PrintOpto || VerifyAliases ||
       PrintMiscellaneous && (WizardMode || Verbose))) {
    // Load nodes should not consume all of memory.
    // Reporting a bottom type indicates a bug in adlc.
    // If some particular type of node validly consumes all of memory,
    // sharpen the preceding "if" to exclude it, so we can catch bugs here.
    tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory.");
    load->dump(2);
    if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "");
  }
#endif
  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
         "String compare is only known 'load' that does not conflict with any stores");
  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
         "String equals is a 'load' that does not conflict with any stores");
  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
         "String indexOf is a 'load' that does not conflict with any stores");
  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
         "Arrays equals is a 'load' that does not conflict with any stores");

  if (!C->alias_type(load_alias_idx)->is_rewritable()) {
    // It is impossible to spoil this load by putting stores before it,
    // because we know that the stores will never update the value
    // which 'load' must witness.
    return LCA;
  }

  node_idx_t load_index = load->_idx;

  // Note the earliest legal placement of 'load', as determined by
  // the unique point in the dom tree where all memory effects
  // and other inputs are first available. (Computed by schedule_early.)
  // For normal loads, 'early' is the shallowest place (dom graph wise)
  // to look for anti-deps between this load and any store.
  Block* early = get_block_for_node(load);

  // If we are subsuming loads, compute an "early" block that only considers
  // memory or address inputs. This block may be different than the
  // schedule_early block in that it could be at an even shallower depth in the
  // dominator tree, and allow for a broader discovery of anti-dependences.
  if (C->subsume_loads()) {
    early = memory_early_block(load, early, this);
  }

  ResourceArea *area = Thread::current()->resource_area();
  Node_List worklist_mem(area);     // prior memory state to store
  Node_List worklist_store(area);   // possible-def to explore
  Node_List worklist_visited(area); // visited mergemem nodes
  Node_List non_early_stores(area); // all relevant stores outside of early
  bool must_raise_LCA = false;

#ifdef TRACK_PHI_INPUTS
  // %%% This extra checking fails because MergeMem nodes are not GVNed.
  // Provide "phi_inputs" to check if every input to a PhiNode is from the
  // original memory state. This indicates a PhiNode which should not
  // prevent the load from sinking. For such a block, set_raise_LCA_mark
  // may be overly conservative.
  // Mechanism: count inputs seen for each Phi encountered in worklist_store.
  DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
#endif

  // 'load' uses some memory state; look for users of the same state.
  // Recurse through MergeMem nodes to the stores that use them.

  // Each of these stores is a possible definition of memory
  // that 'load' needs to use. We need to force 'load'
  // to occur before each such store. When the store is in
  // the same block as 'load', we insert an anti-dependence
  // edge load->store.

  // The relevant stores "nearby" the load consist of a tree rooted
  // at initial_mem, with internal nodes of type MergeMem.
  // Therefore, the branches visited by the worklist are of this form:
  //    initial_mem -> (MergeMem ->)* store
  // The anti-dependence constraints apply only to the fringe of this tree.

  Node* initial_mem = load->in(MemNode::Memory);
  worklist_store.push(initial_mem);
  worklist_visited.push(initial_mem);
  worklist_mem.push(NULL);
  while (worklist_store.size() > 0) {
    // Examine a nearby store to see if it might interfere with our load.
    Node* mem   = worklist_mem.pop();
    Node* store = worklist_store.pop();
    uint op = store->Opcode();

    // MergeMems do not directly have anti-deps.
    // Treat them as internal nodes in a forward tree of memory states,
    // the leaves of which are each a 'possible-def'.
    if (store == initial_mem   // root (exclusive) of tree we are searching
        || op == Op_MergeMem   // internal node of tree we are searching
        ) {
      mem = store;   // It's not a possibly interfering store.
      if (store == initial_mem)
        initial_mem = NULL;  // only process initial memory once

      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        store = mem->fast_out(i);
        if (store->is_MergeMem()) {
          // Be sure we don't get into combinatorial problems.
          // (Allow phis to be repeated; they can merge two relevant states.)
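          // Linear search of the visited list: each MergeMem is queued for
          // processing at most once.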
          uint j = worklist_visited.size();
          for (; j > 0; j--) {
            if (worklist_visited.at(j-1) == store) break;
          }
          if (j > 0) continue; // already on work list; do not repeat
          worklist_visited.push(store);
        }
        worklist_mem.push(mem);
        worklist_store.push(store);
      }
      continue;
    }

    if (op == Op_MachProj || op == Op_Catch) continue;
    if (store->needs_anti_dependence_check()) continue; // not really a store

    // Compute the alias index. Loads and stores with different alias
    // indices do not need anti-dependence edges. Wide MemBar's are
    // anti-dependent on everything (except immutable memories).
    const TypePtr* adr_type = store->adr_type();
    if (!C->can_alias(adr_type, load_alias_idx)) continue;

    // Most slow-path runtime calls do NOT modify Java memory, but
    // they can block and so write Raw memory.
    if (store->is_Mach()) {
      MachNode* mstore = store->as_Mach();
      if (load_alias_idx != Compile::AliasIdxRaw) {
        // Check for call into the runtime using the Java calling
        // convention (and from there into a wrapper); it has no
        // _method. Can't do this optimization for Native calls because
        // they CAN write to Java memory.
        if (mstore->ideal_Opcode() == Op_CallStaticJava) {
          assert(mstore->is_MachSafePoint(), "");
          MachSafePointNode* ms = (MachSafePointNode*) mstore;
          assert(ms->is_MachCallJava(), "");
          MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
          if (mcj->_method == NULL) {
            // These runtime calls do not write to Java visible memory
            // (other than Raw) and so do not require anti-dependence edges.
            continue;
          }
        }
        // Same for SafePoints: they read/write Raw but only read otherwise.
        // This is basically a workaround for SafePoints only defining control
        // instead of control + memory.
        if (mstore->ideal_Opcode() == Op_SafePoint)
          continue;
      } else {
        // Some raw memory, such as the load of "top" at an allocation,
        // can be control dependent on the previous safepoint. See
        // comments in GraphKit::allocate_heap() about control input.
        // Inserting an anti-dep between such a safepoint and a use
        // creates a cycle, and will cause a subsequent failure in
        // local scheduling. (BugId 4919904)
        // (%%% How can a control input be a safepoint and not a projection??)
        if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
          continue;
      }
    }

    // Identify a block that the current load must be above,
    // or else observe that 'store' is all the way up in the
    // earliest legal block for 'load'. In the latter case,
    // immediately insert an anti-dependence edge.
    Block* store_block = get_block_for_node(store);
    assert(store_block != NULL, "unused killing projections skipped above");

    if (store->is_Phi()) {
      // 'load' uses memory which is one (or more) of the Phi's inputs.
      // It must be scheduled not before the Phi, but rather before
      // each of the relevant Phi inputs.
      //
      // Instead of finding the LCA of all inputs to a Phi that match 'mem',
      // we mark each corresponding predecessor block and do a combined
      // hoisting operation later (raise_LCA_above_marks).
      //
      // Do not assert(store_block != early, "Phi merging memory after access")
      // PhiNode may be at start of block 'early' with backedge to 'early'
      DEBUG_ONLY(bool found_match = false);
      for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
        if (store->in(j) == mem) {   // Found matching input?
          DEBUG_ONLY(found_match = true);
          Block* pred_block = get_block_for_node(store_block->pred(j));
          if (pred_block != early) {
            // If any predecessor of the Phi matches the load's "early block",
            // we do not need a precedence edge between the Phi and 'load'
            // since the load will be forced into a block preceding the Phi.
            pred_block->set_raise_LCA_mark(load_index);
            assert(!LCA_orig->dominates(pred_block) ||
                   early->dominates(pred_block), "early is high enough");
            must_raise_LCA = true;
          } else {
            // anti-dependent upon PHI pinned below 'early', no edge needed
            LCA = early;             // but can not schedule below 'early'
          }
        }
      }
      assert(found_match, "no worklist bug");
#ifdef TRACK_PHI_INPUTS
#ifdef ASSERT
      // This assert asks about correct handling of PhiNodes, which may not
      // have all input edges directly from 'mem'. See BugId 4621264
      int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
      // Increment by exactly one even if there are multiple copies of 'mem'
      // coming into the phi, because we will run this block several times
      // if there are several copies of 'mem'. (That's how DU iterators work.)
      phi_inputs.at_put(store->_idx, num_mem_inputs);
      assert(PhiNode::Input + num_mem_inputs < store->req(),
             "Expect at least one phi input will not be from original memory state");
#endif //ASSERT
#endif //TRACK_PHI_INPUTS
    } else if (store_block != early) {
      // 'store' is between the current LCA and earliest possible block.
      // Label its block, and decide later on how to raise the LCA
      // to include the effect on LCA of this store.
      // If this store's block gets chosen as the raised LCA, we
      // will find him on the non_early_stores list and stick him
      // with a precedence edge.
      // (But, don't bother if LCA is already raised all the way.)
      if (LCA != early) {
        store_block->set_raise_LCA_mark(load_index);
        must_raise_LCA = true;
        non_early_stores.push(store);
      }
    } else {
      // Found a possibly-interfering store in the load's 'early' block.
      // This means 'load' cannot sink at all in the dominator tree.
      // Add an anti-dep edge, and squeeze 'load' into the highest block.
      assert(store != load->in(0), "dependence cycle found");
      if (verify) {
        assert(store->find_edge(load) != -1, "missing precedence edge");
      } else {
        store->add_prec(load);
      }
      LCA = early;
      // This turns off the process of gathering non_early_stores.
    }
  }
  // (Worklist is now empty; all nearby stores have been visited.)

  // Finished if 'load' must be scheduled in its 'early' block.
  // If we found any stores there, they have already been given
  // precedence edges.
  if (LCA == early) return LCA;

  // We get here only if there are no possibly-interfering stores
  // in the load's 'early' block. Move LCA up above all predecessors
  // which contain stores we have noted.
  //
  // The raised LCA block can be a home to such interfering stores,
  // but its predecessors must not contain any such stores.
  //
  // The raised LCA will be a lower bound for placing the load,
  // preventing the load from sinking past any block containing
  // a store that may invalidate the memory state required by 'load'.
  if (must_raise_LCA)
    LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
  if (LCA == early) return LCA;

  // Insert anti-dependence edges from 'load' to each store
  // in the non-early LCA block.
  // Mine the non_early_stores list for such stores.
  if (LCA->raise_LCA_mark() == load_index) {
    while (non_early_stores.size() > 0) {
      Node* store = non_early_stores.pop();
      Block* store_block = get_block_for_node(store);
      if (store_block == LCA) {
        // add anti_dependence from store to load in its own block
        assert(store != load->in(0), "dependence cycle found");
        if (verify) {
          assert(store->find_edge(load) != -1, "missing precedence edge");
        } else {
          store->add_prec(load);
        }
      } else {
        assert(store_block->raise_LCA_mark() == load_index, "block was marked");
        // Any other stores we found must be either inside the new LCA
        // or else outside the original LCA. In the latter case, they
        // did not interfere with any use of 'load'.
        assert(LCA->dominates(store_block)
               || !LCA_orig->dominates(store_block), "no stray stores");
      }
    }
  }

  // Return the highest block containing stores; any stores
  // within that block have been given anti-dependence edges.
  return LCA;
}

// This class is used to iterate backwards over the nodes in the graph.

class Node_Backward_Iterator {

private:
  Node_Backward_Iterator();

public:
  // Constructor for the iterator
  Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);

  // Postincrement operator to iterate over the nodes
  Node *next();

private:
  VectorSet &_visited;
  Node_List &_stack;
  PhaseCFG &_cfg;
};

// Constructor for the Node_Backward_Iterator
Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
  : _visited(visited), _stack(stack), _cfg(cfg) {
  // The stack should contain exactly the root
  stack.clear();
  stack.push(root);

  // Clear the visited bits
  visited.Clear();
}

// Iterator for the Node_Backward_Iterator
Node *Node_Backward_Iterator::next() {

  // If the _stack is empty, then just return NULL: finished.
  if ( !_stack.size() )
    return NULL;

  // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
  // made stateless, so I do not need to record the index 'i' on my _stack.
  // Instead I visit all users each time, scanning for unvisited users.
  // I visit unvisited not-anti-dependence users first, then anti-dependent
  // children next.
  Node *self = _stack.pop();

  // I cycle here when I am entering a deeper level of recursion.
  // The key variable 'self' was set prior to jumping here.
  while( 1 ) {

    _visited.set(self->_idx);

    // Now schedule all uses as late as possible.
    const Node* src = self->is_Proj() ?
                      self->in(0) : self;
    uint src_rpo = _cfg.get_block_for_node(src)->_rpo;

    // Schedule all nodes in a post-order visit
    Node *unvisited = NULL;  // Unvisited anti-dependent Node, if any

    // Scan for unvisited nodes
    for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
      // For all uses, schedule late
      Node* n = self->fast_out(i); // Use

      // Skip already visited children
      if ( _visited.test(n->_idx) )
        continue;

      // do not traverse backward control edges
      Node *use = n->is_Proj() ? n->in(0) : n;
      uint use_rpo = _cfg.get_block_for_node(use)->_rpo;

      if ( use_rpo < src_rpo )
        continue;

      // Phi nodes always precede uses in a basic block
      if ( use_rpo == src_rpo && use->is_Phi() )
        continue;

      unvisited = n;      // Found unvisited

      // Check for possible-anti-dependent
      if( !n->needs_anti_dependence_check() )
        break;            // Not visited, not anti-dep; schedule it NOW
    }

    // Did I find an unvisited not-anti-dependent Node?
    if ( !unvisited )
      break;                  // All done with children; post-visit 'self'

    // Visit the unvisited Node. Contains the obvious push to
    // indicate I'm entering a deeper level of recursion. I push the
    // old state onto the _stack and set a new state and loop (recurse).
    _stack.push(self);
    self = unvisited;
  } // End recursion loop

  return self;
}

//------------------------------ComputeLatenciesBackwards----------------------
// Compute the latency of all the instructions.
void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {
#ifndef PRODUCT
  if (trace_opto_pipelining())
    tty->print("\n#---- ComputeLatenciesBackwards ----\n");
#endif

  Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
  Node *n;

  // Walk over all the nodes from last to first
  while (n = iter.next()) {
    // Set the latency for the definitions of this instruction
    partial_latency_of_defs(n);
  }
} // end ComputeLatenciesBackwards

//------------------------------partial_latency_of_defs------------------------
// Compute the latency impact of this node on all defs. This computes
// a number that increases as we approach the beginning of the routine.
void PhaseCFG::partial_latency_of_defs(Node *n) {
  // Set the latency for this instruction
#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
    dump();
  }
#endif

  if (n->is_Proj()) {
    n = n->in(0);
  }

  if (n->is_Root()) {
    return;
  }

  uint nlen = n->len();
  uint use_latency = get_latency_for_node(n);
  uint use_pre_order = get_block_for_node(n)->_pre_order;

  for (uint j = 0; j < nlen; j++) {
    Node *def = n->in(j);

    if (!def || def == n) {
      continue;
    }

    // Walk backwards thru projections
    if (def->is_Proj()) {
      def = def->in(0);
    }

#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("# in(%2d): ", j);
      def->dump();
    }
#endif

    // If the defining block is not known, assume it is ok
    Block *def_block = get_block_for_node(def);
    uint def_pre_order = def_block ?
                         def_block->_pre_order : 0;

    if ((use_pre_order < def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) {
      continue;
    }

    uint delta_latency = n->latency(j);
    uint current_latency = delta_latency + use_latency;

    if (get_latency_for_node(def) < current_latency) {
      set_latency_for_node(def, current_latency);
    }

#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def));
    }
#endif
  }
}

//------------------------------latency_from_use-------------------------------
// Compute the latency of a specific use
int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
  // If self-reference, return no latency
  if (use == n || use->is_Root()) {
    return 0;
  }

  uint def_pre_order = get_block_for_node(def)->_pre_order;
  uint latency = 0;

  // If the use is not a projection, then it is simple...
  if (!use->is_Proj()) {
#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("# out(): ");
      use->dump();
    }
#endif

    uint use_pre_order = get_block_for_node(use)->_pre_order;

    if (use_pre_order < def_pre_order)
      return 0;

    if (use_pre_order == def_pre_order && use->is_Phi())
      return 0;

    uint nlen = use->len();
    uint nl = get_latency_for_node(use);

    for ( uint j=0; j<nlen; j++ ) {
      if (use->in(j) == n) {
        // Change this if we want local latencies
        uint ul = use->latency(j);
        uint l = ul + nl;
        if (latency < l) latency = l;
#ifndef PRODUCT
        if (trace_opto_pipelining()) {
          tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
                        nl, j, ul, l, latency);
        }
#endif
      }
    }
  } else {
    // This is a projection, just grab the latency of the use(s)
    for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
      uint l = latency_from_use(use, def, use->fast_out(j));
      if (latency < l) latency = l;
    }
  }

  return latency;
}

//------------------------------latency_from_uses------------------------------
// Compute the latency of this instruction relative to all of its uses.
// This computes a number that increases as we approach the beginning of the
// routine.
void PhaseCFG::latency_from_uses(Node *n) {
  // Set the latency for this instruction
#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("# latency_from_outputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
    dump();
  }
#endif
  uint latency=0;
  const Node *def = n->is_Proj() ? n->in(0): n;

  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
    uint l = latency_from_use(n, def, n->fast_out(i));

    if (latency < l) latency = l;
  }

  set_latency_for_node(n, latency);
}

//------------------------------hoist_to_cheaper_block-------------------------
// Pick a block for node self, between early and LCA, that is a cheaper
// alternative to LCA.
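// Here, 'cheaper' means a lower execution frequency; when latency-based
// scheduling is active, a slightly more frequent block may still be chosen
// if placing the node there better covers its latency target.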
Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
  const double delta = 1+PROB_UNLIKELY_MAG(4);
  Block* least       = LCA;
  double least_freq  = least->_freq;
  uint target        = get_latency_for_node(self);
  uint start_latency = get_latency_for_node(LCA->head());
  uint end_latency   = get_latency_for_node(LCA->get_node(LCA->end_idx()));
  bool in_latency    = (target <= start_latency);
  const Block* root_block = get_block_for_node(_root);

  // Turn off latency scheduling if scheduling is just plain off
  if (!C->do_scheduling())
    in_latency = true;

  // Do not hoist (to cover latency) instructions which target a
  // single register. Hoisting stretches the live range of the
  // single register and may force spilling.
  MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
  if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
    in_latency = true;

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("# Find cheaper block for latency %d: ", get_latency_for_node(self));
    self->dump();
    tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
      LCA->_pre_order,
      LCA->head()->_idx,
      start_latency,
      LCA->get_node(LCA->end_idx())->_idx,
      end_latency,
      least_freq);
  }
#endif

  int cand_cnt = 0;  // number of candidates tried

  // Walk up the dominator tree from LCA (Lowest common ancestor) to
  // the earliest legal location. Capture the least execution frequency.
  while (LCA != early) {
    LCA = LCA->_idom;         // Follow up the dominator tree

    if (LCA == NULL) {
      // Bailout without retry
      C->record_method_not_compilable("late schedule failed: LCA == NULL");
      return least;
    }

    // Don't hoist machine instructions to the root basic block
    if (mach && LCA == root_block)
      break;

    uint start_lat = get_latency_for_node(LCA->head());
    uint end_idx   = LCA->end_idx();
    uint end_lat   = get_latency_for_node(LCA->get_node(end_idx));
    double LCA_freq = LCA->_freq;
#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
        LCA->_pre_order, LCA->head()->_idx, start_lat, end_idx, end_lat, LCA_freq);
    }
#endif
    cand_cnt++;
    if (LCA_freq < least_freq              || // Better Frequency
        (StressGCM && Compile::randomized_select(cand_cnt)) || // Should be randomly accepted in stress mode
         (!StressGCM                    &&    // Otherwise, choose with latency
          !in_latency                   &&    // No block containing latency
          LCA_freq < least_freq * delta &&    // No worse frequency
          target >= end_lat             &&    // within latency range
          !self->is_iteratively_computed() )  // But don't hoist IV increments
             // because they may end up above other uses of their phi forcing
             // their result register to be different from their input.
       ) {
      least = LCA;            // Found cheaper block
      least_freq = LCA_freq;
      start_latency = start_lat;
      end_latency = end_lat;
      if (target <= start_lat)
        in_latency = true;
    }
  }

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print_cr("# Choose block B%d with start latency=%d and freq=%g",
      least->_pre_order, start_latency, least_freq);
  }
#endif

  // See if the latency needs to be updated
  if (target < end_latency) {
#ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
    }
#endif
    set_latency_for_node(self, end_latency);
    partial_latency_of_defs(self);
  }

  return least;
}


//------------------------------schedule_late-----------------------------------
// Now schedule all codes as LATE as possible. This is the LCA in the
// dominator tree of all USES of a value. Pick the block with the least
// loop nesting depth that is lowest in the dominator tree.
extern const char must_clone[];
void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
#ifndef PRODUCT
  if (trace_opto_pipelining())
    tty->print("\n#---- schedule_late ----\n");
#endif

  Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
  Node *self;

  // Walk over all the nodes from last to first
  while (self = iter.next()) {
    Block* early = get_block_for_node(self); // Earliest legal placement

    if (self->is_top()) {
      // Top node goes in bb #2 with other constants.
      // It must be special-cased, because it has no out edges.
      early->add_inst(self);
      continue;
    }

    // No uses, just terminate
    if (self->outcnt() == 0) {
      assert(self->is_MachProj(), "sanity");
      continue;                   // Must be a dead machine projection
    }

    // If node is pinned in the block, then no scheduling can be done.
    if( self->pinned() )          // Pinned in block?
      continue;

    MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
    if (mach) {
      switch (mach->ideal_Opcode()) {
      case Op_CreateEx:
        // Don't move exception creation
        early->add_inst(self);
        continue;
        break;
      case Op_CheckCastPP:
        // Don't move CheckCastPP nodes away from their input, if the input
        // is a rawptr (5071820).
        Node *def = self->in(1);
        if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
          early->add_inst(self);
#ifdef ASSERT
          _raw_oops.push(def);
#endif
          continue;
        }
        break;
      }
    }

    // Gather LCA of all uses
    Block *LCA = NULL;
    {
      for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
        // For all uses, find LCA
        Node* use = self->fast_out(i);
        LCA = raise_LCA_above_use(LCA, use, self, this);
      }
    }  // (Hide defs of imax, i from rest of block.)

    // Place temps in the block of their use. This isn't a
    // requirement for correctness but it reduces useless
    // interference between temps and other nodes.
    if (mach != NULL && mach->is_MachTemp()) {
      map_node_to_block(self, LCA);
      LCA->add_inst(self);
      continue;
    }

    // Check if 'self' could be anti-dependent on memory
    if (self->needs_anti_dependence_check()) {
      // Hoist LCA above possible-defs and insert anti-dependences to
      // defs in new LCA block.
      LCA = insert_anti_dependences(LCA, self);
    }

    if (early->_dom_depth > LCA->_dom_depth) {
      // Somehow the LCA has moved above the earliest legal point.
      // (One way this can happen is via memory_early_block.)
      if (C->subsume_loads() == true && !C->failing()) {
        // Retry with subsume_loads == false
        // If this is the first failure, the sentinel string will "stick"
        // to the Compile object, and the C2Compiler will see it and retry.
        C->record_failure(C2Compiler::retry_no_subsuming_loads());
      } else {
        // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
        C->record_method_not_compilable("late schedule failed: incorrect graph");
      }
      return;
    }

    // If there is no opportunity to hoist, then we're done.
    // In stress mode, try to hoist even the single operations.
    bool try_to_hoist = StressGCM || (LCA != early);

    // Must clone guys stay next to use; no hoisting allowed.
    // Also cannot hoist guys that alter memory or are otherwise not
    // allocatable (hoisting can make a value live longer, leading to
    // anti and output dependency problems which are normally resolved
    // by the register allocator giving everyone a different register).
    if (mach != NULL && must_clone[mach->ideal_Opcode()])
      try_to_hoist = false;

    Block* late = NULL;
    if (try_to_hoist) {
      // Now find the block with the least execution frequency.
      // Start at the latest schedule and work up to the earliest schedule
      // in the dominator tree. Thus the Node will dominate all its uses.
      late = hoist_to_cheaper_block(LCA, early, self);
    } else {
      // Just use the LCA of the uses.
      late = LCA;
    }

    // Put the node into target block
    schedule_node_into_block(self, late);

#ifdef ASSERT
    if (self->needs_anti_dependence_check()) {
      // since precedence edges are only inserted when we're sure they
      // are needed make sure that after placement in a block we don't
      // need any new precedence edges.
      verify_anti_dependences(late, self);
    }
#endif
  } // Loop until all nodes have been visited

} // end ScheduleLate

//------------------------------GlobalCodeMotion-------------------------------
void PhaseCFG::global_code_motion() {
  ResourceMark rm;

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Start GlobalCodeMotion ----\n");
  }
#endif

  // Initialize the node to block mapping for things on the proj_list
  for (uint i = 0; i < _matcher.number_of_projections(); i++) {
    unmap_node_from_block(_matcher.get_projection(i));
  }

  // Set the basic block for Nodes pinned into blocks
  Arena* arena = Thread::current()->resource_area();
  VectorSet visited(arena);
  schedule_pinned_nodes(visited);

  // Find the earliest Block any instruction can be placed in. Some
  // instructions are pinned into Blocks. Unpinned instructions can
  // appear in the last block in which all their inputs occur.
  visited.Clear();
  Node_List stack(arena);
  // Pre-grow the list
  stack.map((C->unique() >> 1) + 16, NULL);
  if (!schedule_early(visited, stack)) {
    // Bailout without retry
    C->record_method_not_compilable("early schedule failed");
    return;
  }

  // Build Def-Use edges.
  // Compute the latency information (via backwards walk) for all the
  // instructions in the graph
  _node_latency = new GrowableArray<uint>(); // resource_area allocation

  if (C->do_scheduling()) {
    compute_latencies_backwards(visited, stack);
  }

  // Now schedule all codes as LATE as possible. This is the LCA in the
  // dominator tree of all USES of a value. Pick the block with the least
  // loop nesting depth that is lowest in the dominator tree.
  // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
  schedule_late(visited, stack);
  if (C->failing()) {
    // schedule_late fails only when graph is incorrect.
    assert(!VerifyGraphEdges, "verification should have failed");
    return;
  }

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Detect implicit null checks ----\n");
  }
#endif

  // Detect implicit-null-check opportunities. Basically, find NULL checks
  // with suitable memory ops nearby. Use the memory op to do the NULL check.
  // I can generate a memory op if there is not one nearby.
  if (C->is_method_compilation()) {
    // By reversing the loop direction we get a very minor gain on mpegaudio.
    // Feel free to revert to a forward loop for clarity.
    // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
    for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) {
      Node* proj = _matcher._null_check_tests[i];
      Node* val  = _matcher._null_check_tests[i + 1];
      Block* block = get_block_for_node(proj);
      implicit_null_check(block, proj, val, C->allowed_deopt_reasons());
      // The implicit_null_check will only perform the transformation
      // if the null branch is truly uncommon, *and* it leads to an
      // uncommon trap. Combined with the too_many_traps guards
      // above, this prevents SEGV storms reported in 6366351,
      // by recompiling offending methods without this optimization.
    }
  }

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Start Local Scheduling ----\n");
  }
#endif

  // Schedule locally. Right now a simple topological sort.
  // Later, do a real latency aware scheduler.
  GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1);
  visited.Clear();
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
    if (!schedule_local(block, ready_cnt, visited)) {
      if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
        C->record_method_not_compilable("local schedule failed");
      }
      return;
    }
  }

  // If we inserted any instructions between a Call and his CatchNode,
  // clone the instructions on all paths below the Catch.
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
    call_catch_cleanup(block);
  }

#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- After GlobalCodeMotion ----\n");
    for (uint i = 0; i < number_of_blocks(); i++) {
      Block* block = get_block(i);
      block->dump();
    }
  }
#endif
  // Dead.
  _node_latency = (GrowableArray<uint> *)0xdeadbeef;
}

bool PhaseCFG::do_global_code_motion() {

  build_dominator_tree();
  if (C->failing()) {
    return false;
  }

  NOT_PRODUCT( C->verify_graph_edges(); )

  estimate_block_frequency();

  global_code_motion();

  if (C->failing()) {
    return false;
  }

  return true;
}

//------------------------------Estimate_Block_Frequency-----------------------
// Estimate block frequencies based on IfNode probabilities.
void PhaseCFG::estimate_block_frequency() {

  // Force conditional branches leading to uncommon traps to be unlikely,
  // not because we get to the uncommon_trap with less relative frequency,
  // but because an uncommon_trap typically causes a deopt, so we only get
  // there once.
  if (C->do_freq_based_layout()) {
    Block_List worklist;
    Block* root_blk = get_block(0);
    for (uint i = 1; i < root_blk->num_preds(); i++) {
      Block *pb = get_block_for_node(root_blk->pred(i));
      if (pb->has_uncommon_code()) {
        worklist.push(pb);
      }
    }
    while (worklist.size() > 0) {
      Block* uct = worklist.pop();
      if (uct == get_root_block()) {
        continue;
      }
      for (uint i = 1; i < uct->num_preds(); i++) {
        Block *pb = get_block_for_node(uct->pred(i));
        if (pb->_num_succs == 1) {
          worklist.push(pb);
        } else if (pb->num_fall_throughs() == 2) {
          pb->update_uncommon_branch(uct);
        }
      }
    }
  }

  // Create the loop tree and calculate loop depth.
  _root_loop = create_loop_tree();
  _root_loop->compute_loop_depth(0);

  // Compute block frequency of each block, relative to a single loop entry.
  _root_loop->compute_freq();

  // Adjust all frequencies to be relative to a single method entry
  _root_loop->_freq = 1.0;
  _root_loop->scale_freq();

  // Save outmost loop frequency for LRG frequency threshold
  _outer_loop_frequency = _root_loop->outer_loop_freq();

  // force paths ending at uncommon traps to be infrequent
  if (!C->do_freq_based_layout()) {
    Block_List worklist;
    Block* root_blk = get_block(0);
    for (uint i = 1; i < root_blk->num_preds(); i++) {
      Block *pb = get_block_for_node(root_blk->pred(i));
      if (pb->has_uncommon_code()) {
        worklist.push(pb);
      }
    }
    while (worklist.size() > 0) {
      Block* uct = worklist.pop();
      uct->_freq = PROB_MIN;
      for (uint i = 1; i < uct->num_preds(); i++) {
        Block *pb = get_block_for_node(uct->pred(i));
        if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
          worklist.push(pb);
        }
      }
    }
  }

#ifdef ASSERT
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* b = get_block(i);
    assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
  }
#endif

#ifndef PRODUCT
  if (PrintCFGBlockFreq) {
    tty->print_cr("CFG Block Frequencies");
    _root_loop->dump_tree();
    if (Verbose) {
      tty->print_cr("PhaseCFG dump");
      dump();
      tty->print_cr("Node dump");
      _root->dump(99999);
    }
  }
#endif
}

//----------------------------create_loop_tree--------------------------------
// Create a loop tree from the CFG
CFGLoop* PhaseCFG::create_loop_tree() {

#ifdef ASSERT
  assert(get_block(0) == get_root_block(), "first block should be root block");

//----------------------------create_loop_tree--------------------------------
// Create a loop tree from the CFG
CFGLoop* PhaseCFG::create_loop_tree() {

#ifdef ASSERT
  assert(get_block(0) == get_root_block(), "first block should be root block");
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
    // Check that the _loop fields are clear...we could clear them if not.
    assert(block->_loop == NULL, "clear _loop expected");
    // Sanity check that the RPO numbering is reflected in the _blocks array.
    // It doesn't have to be for the loop tree to be built, but if it is not,
    // then the blocks have been reordered since dom graph building...which
    // may call the RPO numbering into question.
    assert(block->_rpo == i, "unexpected reverse post order number");
  }
#endif

  int idct = 0;
  CFGLoop* root_loop = new CFGLoop(idct++);

  Block_List worklist;

  // Assign blocks to loops
  for (uint i = number_of_blocks() - 1; i > 0; i--) { // skip Root block
    Block* block = get_block(i);

    if (block->head()->is_Loop()) {
      Block* loop_head = block;
      assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
      Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
      Block* tail = get_block_for_node(tail_n);

      // Defensively filter out Loop nodes for non-single-entry loops.
      // For all reasonable loops, the head occurs before the tail in RPO.
      if (i <= tail->_rpo) {

        // The tail and (recursive) predecessors of the tail
        // are made members of a new loop.

        assert(worklist.size() == 0, "nonempty worklist");
        CFGLoop* nloop = new CFGLoop(idct++);
        assert(loop_head->_loop == NULL, "just checking");
        loop_head->_loop = nloop;
        // Add to nloop so push_pred() will skip over inner loops
        nloop->add_member(loop_head);
        nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);

        while (worklist.size() > 0) {
          Block* member = worklist.pop();
          if (member != loop_head) {
            for (uint j = 1; j < member->num_preds(); j++) {
              nloop->push_pred(member, j, worklist, this);
            }
          }
        }
      }
    }
  }

  // Create a member list for each loop consisting
  // of both blocks and (immediate child) loops.
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
    CFGLoop* lp = block->_loop;
    if (lp == NULL) {
      // Not assigned to a loop. Add it to the method's pseudo loop.
      block->_loop = root_loop;
      lp = root_loop;
    }
    if (lp == root_loop || block != lp->head()) { // loop heads are already members
      lp->add_member(block);
    }
    if (lp != root_loop) {
      if (lp->parent() == NULL) {
        // Not a nested loop. Make it a child of the method's pseudo loop.
        root_loop->add_nested_loop(lp);
      }
      if (block == lp->head()) {
        // Add nested loop to member list of parent loop.
        lp->parent()->add_member(lp);
      }
    }
  }

  return root_loop;
}
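
// Illustrative example: for a method containing an outer loop that encloses a
// single inner loop, create_loop_tree() above produces
//   root_loop (the method pseudo loop)
//     +- outer CFGLoop
//          +- inner CFGLoop
// Once compute_loop_depth(0) has run, their depths are 0, 1 and 2. Each
// block's _loop field designates its innermost enclosing CFGLoop, and a
// nested loop appears as a single member element of its parent loop.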

//------------------------------push_pred--------------------------------------
void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {
  Node* pred_n = blk->pred(i);
  Block* pred = cfg->get_block_for_node(pred_n);
  CFGLoop *pred_loop = pred->_loop;
  if (pred_loop == NULL) {
    // Filter out blocks for non-single-entry loops.
    // For all reasonable loops, the head occurs before the tail in RPO.
    if (pred->_rpo > head()->_rpo) {
      pred->_loop = this;
      worklist.push(pred);
    }
  } else if (pred_loop != this) {
    // Nested loop.
    while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
      pred_loop = pred_loop->_parent;
    }
    // Make pred's loop be a child
    if (pred_loop->_parent == NULL) {
      add_nested_loop(pred_loop);
      // Continue with loop entry predecessor.
      Block* pred_head = pred_loop->head();
      assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
      assert(pred_head != head(), "loop head in only one loop");
      push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);
    } else {
      assert(pred_loop->_parent == this && _parent == NULL, "just checking");
    }
  }
}

//------------------------------add_nested_loop--------------------------------
// Make cl a child of the current loop in the loop tree.
void CFGLoop::add_nested_loop(CFGLoop* cl) {
  assert(_parent == NULL, "no parent yet");
  assert(cl != this, "not my own parent");
  cl->_parent = this;
  CFGLoop* ch = _child;
  if (ch == NULL) {
    _child = cl;
  } else {
    while (ch->_sibling != NULL) { ch = ch->_sibling; }
    ch->_sibling = cl;
  }
}

//------------------------------compute_loop_depth-----------------------------
// Store the loop depth in each CFGLoop object.
// Recursively walk the children to do the same for them.
void CFGLoop::compute_loop_depth(int depth) {
  _depth = depth;
  CFGLoop* ch = _child;
  while (ch != NULL) {
    ch->compute_loop_depth(depth + 1);
    ch = ch->_sibling;
  }
}

//------------------------------compute_freq-----------------------------------
// Compute the frequency of each block and loop, relative to a single entry
// into the dominating loop head.
void CFGLoop::compute_freq() {
  // Bottom up traversal of loop tree (visit inner loops first.)
  // Set loop head frequency to 1.0, then transitively
  // compute frequency for all successors in the loop,
  // as well as for each exit edge.  Inner loops are
  // treated as single blocks with loop exit targets
  // as the successor blocks.

  // Nested loops first
  CFGLoop* ch = _child;
  while (ch != NULL) {
    ch->compute_freq();
    ch = ch->_sibling;
  }
  assert(_members.length() > 0, "no empty loops");
  Block* hd = head();
  hd->_freq = 1.0;
  for (int i = 0; i < _members.length(); i++) {
    CFGElement* s = _members.at(i);
    double freq = s->_freq;
    if (s->is_block()) {
      Block* b = s->as_Block();
      for (uint j = 0; j < b->_num_succs; j++) {
        Block* sb = b->_succs[j];
        update_succ_freq(sb, freq * b->succ_prob(j));
      }
    } else {
      CFGLoop* lp = s->as_CFGLoop();
      assert(lp->_parent == this, "immediate child");
      for (int k = 0; k < lp->_exits.length(); k++) {
        Block* eb = lp->_exits.at(k).get_target();
        double prob = lp->_exits.at(k).get_prob();
        update_succ_freq(eb, freq * prob);
      }
    }
  }

  // For all loops other than the outer, "method" loop,
  // sum and normalize the exit probability. The "method" loop
  // should keep the initial exit probability of 1, so that
  // inner blocks do not get erroneously scaled.
  if (_depth != 0) {
    // Total the exit probabilities for this loop.
    double exits_sum = 0.0f;
    for (int i = 0; i < _exits.length(); i++) {
      exits_sum += _exits.at(i).get_prob();
    }

    // Normalize the exit probabilities.  Until now, the
    // probabilities estimate the chance of exit per single
    // loop iteration; afterward, they estimate the
    // probability of exit per loop entry.
    for (int i = 0; i < _exits.length(); i++) {
      Block* et = _exits.at(i).get_target();
      float new_prob = 0.0f;
      if (_exits.at(i).get_prob() > 0.0f) {
        new_prob = _exits.at(i).get_prob() / exits_sum;
      }
      BlockProbPair bpp(et, new_prob);
      _exits.at_put(i, bpp);
    }

    // Save the total, but guard against unreasonable probability,
    // as the value is used to estimate the loop trip count.
    // An infinite trip count would blur relative block
    // frequencies.
    if (exits_sum > 1.0f) exits_sum = 1.0;
    if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
    _exit_prob = exits_sum;
  }
}
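
// Worked example for compute_freq() above (illustrative numbers only): a loop
// with two exits whose per-iteration probabilities are 0.01 and 0.03 gets
// exits_sum = 0.04; after normalization the exits carry 0.25 and 0.75 of the
// loop's outgoing frequency, and _exit_prob = 0.04 is later used to estimate
// the trip count (roughly 1/0.04 = 25 iterations) when scale_freq() runs.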

//------------------------------succ_prob-------------------------------------
// Determine the probability of reaching successor 'i' from the receiver block.
float Block::succ_prob(uint i) {
  int eidx = end_idx();
  Node *n = get_node(eidx);  // Get ending Node

  int op = n->Opcode();
  if (n->is_Mach()) {
    if (n->is_MachNullCheck()) {
      // Can only reach here if called after lcm. The original Op_If is gone,
      // so we attempt to infer the probability from one or both of the
      // successor blocks.
      assert(_num_succs == 2, "expecting 2 successors of a null check");
      // If either successor has only one predecessor, then the
      // probability estimate can be derived using the
      // relative frequency of the successor and this block.
      if (_succs[i]->num_preds() == 2) {
        return _succs[i]->_freq / _freq;
      } else if (_succs[1-i]->num_preds() == 2) {
        return 1 - (_succs[1-i]->_freq / _freq);
      } else {
        // Estimate using both successor frequencies
        float freq = _succs[i]->_freq;
        return freq / (freq + _succs[1-i]->_freq);
      }
    }
    op = n->as_Mach()->ideal_Opcode();
  }

  // Switch on branch type
  switch( op ) {
  case Op_CountedLoopEnd:
  case Op_If: {
    assert(i < 2, "just checking");
    // Conditionals pass on only part of their frequency
    float prob = n->as_MachIf()->_prob;
    assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
    // If succ[i] is the FALSE branch, invert path info
    if( get_node(i + eidx + 1)->Opcode() == Op_IfFalse ) {
      return 1.0f - prob; // not taken
    } else {
      return prob; // taken
    }
  }

  case Op_Jump:
    // Divide the frequency between all successors evenly
    return 1.0f/_num_succs;

  case Op_Catch: {
    const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
    if (ci->_con == CatchProjNode::fall_through_index) {
      // Fall-thru path gets the lion's share.
      return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
    } else {
      // Presume exceptional paths are equally unlikely
      return PROB_UNLIKELY_MAG(5);
    }
  }

  case Op_Root:
  case Op_Goto:
    // Pass frequency straight thru to target
    return 1.0f;

  case Op_NeverBranch:
    return 0.0f;

  case Op_TailCall:
  case Op_TailJump:
  case Op_Return:
  case Op_Halt:
  case Op_Rethrow:
    // Do not push out freq to root block
    return 0.0f;

  default:
    ShouldNotReachHere();
  }

  return 0.0f;
}
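
// For instance (illustrative values): a MachIf with _prob == 0.7 reports 0.7
// for the successor reached through its IfTrue projection and 1.0 - 0.7 = 0.3
// for the IfFalse one; a Jump with four successors reports 0.25 for each.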

//------------------------------num_fall_throughs-----------------------------
// Return the number of fall-through candidates for a block
int Block::num_fall_throughs() {
  int eidx = end_idx();
  Node *n = get_node(eidx);  // Get ending Node

  int op = n->Opcode();
  if (n->is_Mach()) {
    if (n->is_MachNullCheck()) {
      // In theory, either side can fall-thru; for simplicity's sake,
      // let's say only the false branch can, for now.
      return 1;
    }
    op = n->as_Mach()->ideal_Opcode();
  }

  // Switch on branch type
  switch( op ) {
  case Op_CountedLoopEnd:
  case Op_If:
    return 2;

  case Op_Root:
  case Op_Goto:
    return 1;

  case Op_Catch: {
    for (uint i = 0; i < _num_succs; i++) {
      const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
      if (ci->_con == CatchProjNode::fall_through_index) {
        return 1;
      }
    }
    return 0;
  }

  case Op_Jump:
  case Op_NeverBranch:
  case Op_TailCall:
  case Op_TailJump:
  case Op_Return:
  case Op_Halt:
  case Op_Rethrow:
    return 0;

  default:
    ShouldNotReachHere();
  }

  return 0;
}

//------------------------------succ_fall_through-----------------------------
// Return true if a specific successor could be fall-through target.
bool Block::succ_fall_through(uint i) {
  int eidx = end_idx();
  Node *n = get_node(eidx);  // Get ending Node

  int op = n->Opcode();
  if (n->is_Mach()) {
    if (n->is_MachNullCheck()) {
      // In theory, either side can fall-thru; for simplicity's sake,
      // let's say only the false branch can, for now.
      return get_node(i + eidx + 1)->Opcode() == Op_IfFalse;
    }
    op = n->as_Mach()->ideal_Opcode();
  }

  // Switch on branch type
  switch( op ) {
  case Op_CountedLoopEnd:
  case Op_If:
  case Op_Root:
  case Op_Goto:
    return true;

  case Op_Catch: {
    const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
    return ci->_con == CatchProjNode::fall_through_index;
  }

  case Op_Jump:
  case Op_NeverBranch:
  case Op_TailCall:
  case Op_TailJump:
  case Op_Return:
  case Op_Halt:
  case Op_Rethrow:
    return false;

  default:
    ShouldNotReachHere();
  }

  return false;
}

//------------------------------update_uncommon_branch------------------------
// Update the probability of a two-way branch so that the path to 'ub' is uncommon
void Block::update_uncommon_branch(Block* ub) {
  int eidx = end_idx();
  Node *n = get_node(eidx);  // Get ending Node

  int op = n->as_Mach()->ideal_Opcode();

  assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If");
  assert(num_fall_throughs() == 2, "must be a two way branch block");

  // Which successor is ub?
  uint s;
  for (s = 0; s < _num_succs; s++) {
    if (_succs[s] == ub) break;
  }
  assert(s < 2, "uncommon successor must be found");

  // If ub is the true path, make the probability small, else
  // ub is the false path, and make the probability large
  bool invert = (get_node(s + eidx + 1)->Opcode() == Op_IfFalse);

  // Get existing probability
  float p = n->as_MachIf()->_prob;

  if (invert) p = 1.0 - p;
  if (p > PROB_MIN) {
    p = PROB_MIN;
  }
  if (invert) p = 1.0 - p;

  n->as_MachIf()->_prob = p;
}
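
// Example (illustrative): if the uncommon successor 'ub' sits behind the
// IfFalse projection and the branch currently has _prob == 0.5, the false
// path is clamped to PROB_MIN, i.e. the MachIf ends up with
// _prob == 1.0 - PROB_MIN in favour of the common (true) path.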

//------------------------------update_succ_freq-------------------------------
// Update the appropriate frequency associated with block 'b', a successor of
// a block in this loop.
void CFGLoop::update_succ_freq(Block* b, double freq) {
  if (b->_loop == this) {
    if (b == head()) {
      // back branch within the loop
      // Do nothing now, the loop carried frequency will be
      // adjusted later in scale_freq().
    } else {
      // simple branch within the loop
      b->_freq += freq;
    }
  } else if (!in_loop_nest(b)) {
    // branch is exit from this loop
    BlockProbPair bpp(b, freq);
    _exits.append(bpp);
  } else {
    // branch into nested loop
    CFGLoop* ch = b->_loop;
    ch->_freq += freq;
  }
}

//------------------------------in_loop_nest-----------------------------------
// Determine if block b is in the receiver's loop nest.
bool CFGLoop::in_loop_nest(Block* b) {
  int depth = _depth;
  CFGLoop* b_loop = b->_loop;
  int b_depth = b_loop->_depth;
  if (depth == b_depth) {
    return true;
  }
  while (b_depth > depth) {
    b_loop = b_loop->_parent;
    b_depth = b_loop->_depth;
  }
  return b_loop == this;
}

//------------------------------scale_freq-------------------------------------
// Scale frequency of loops and blocks by trip counts from outer loops.
// Do a top down traversal of loop tree (visit outer loops first.)
void CFGLoop::scale_freq() {
  double loop_freq = _freq * trip_count();
  _freq = loop_freq;
  for (int i = 0; i < _members.length(); i++) {
    CFGElement* s = _members.at(i);
    double block_freq = s->_freq * loop_freq;
    if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY)
      block_freq = MIN_BLOCK_FREQUENCY;
    s->_freq = block_freq;
  }
  CFGLoop* ch = _child;
  while (ch != NULL) {
    ch->scale_freq();
    ch = ch->_sibling;
  }
}

// Frequency of outer loop
double CFGLoop::outer_loop_freq() const {
  if (_child != NULL) {
    return _child->_freq;
  }
  return _freq;
}

#ifndef PRODUCT
//------------------------------dump_tree--------------------------------------
void CFGLoop::dump_tree() const {
  dump();
  if (_child != NULL)   _child->dump_tree();
  if (_sibling != NULL) _sibling->dump_tree();
}

//------------------------------dump-------------------------------------------
void CFGLoop::dump() const {
  for (int i = 0; i < _depth; i++) tty->print(" ");
  tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n",
             _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
  for (int i = 0; i < _depth; i++) tty->print(" ");
  tty->print(" members:");
  int k = 0;
  for (int i = 0; i < _members.length(); i++) {
    if (k++ >= 6) {
      tty->print("\n ");
      for (int j = 0; j < _depth+1; j++) tty->print(" ");
      k = 0;
    }
    CFGElement *s = _members.at(i);
    if (s->is_block()) {
      Block *b = s->as_Block();
      tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
    } else {
      CFGLoop* lp = s->as_CFGLoop();
      tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
    }
  }
  tty->print("\n");
  for (int i = 0; i < _depth; i++) tty->print(" ");
  tty->print(" exits: ");
  k = 0;
  for (int i = 0; i < _exits.length(); i++) {
    if (k++ >= 7) {
      tty->print("\n ");
      for (int j = 0; j < _depth+1; j++) tty->print(" ");
      k = 0;
    }
    Block *blk = _exits.at(i).get_target();
    double prob = _exits.at(i).get_prob();
    tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
  }
  tty->print("\n");
}
#endif