1 /*
   2  * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "libadt/vectset.hpp"
  27 #include "memory/allocation.inline.hpp"
  28 #include "opto/block.hpp"
  29 #include "opto/c2compiler.hpp"
  30 #include "opto/callnode.hpp"
  31 #include "opto/cfgnode.hpp"
  32 #include "opto/machnode.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/phaseX.hpp"
  35 #include "opto/rootnode.hpp"
  36 #include "opto/runtime.hpp"
  37 #include "runtime/deoptimization.hpp"
  38 
  39 // Portions of code courtesy of Clifford Click
  40 
  41 // Optimization - Graph Style
  42 
  43 // To avoid float value underflow
  44 #define MIN_BLOCK_FREQUENCY 1.e-35f
  45 
  46 //----------------------------schedule_node_into_block-------------------------
  47 // Insert node n into block b. Look for projections of n and make sure they
  48 // are in b also.
  49 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
  50   // Set the basic block of n and add n to b.
  51   map_node_to_block(n, b);
  52   b->add_inst(n);
  53 
  54   // After Matching, nearly any old Node may have projections trailing it.
  55   // These are usually machine-dependent flags.  In any case, they might
  56   // float to another block below this one.  Move them up.
  57   for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
  58     Node*  use  = n->fast_out(i);
  59     if (use->is_Proj()) {
  60       Block* buse = get_block_for_node(use);
  61       if (buse != b) {              // In wrong block?
  62         if (buse != NULL) {
  63           buse->find_remove(use);   // Remove from wrong block
  64         }
  65         map_node_to_block(use, b);
  66         b->add_inst(use);
  67       }
  68     }
  69   }
  70 }
  71 
  72 //----------------------------replace_block_proj_ctrl-------------------------
  73 // Nodes that have is_block_proj() nodes as their control need to use
  74 // the appropriate Region for their actual block as their control since
  75 // the projection will be in a predecessor block.
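// A minimal sketch of the rewrite this performs (hypothetical node and block
// names, not taken from this file):
//
//   B3:  ends with an If whose IfTrue/IfFalse projections are block_projs
//   B7:  the successor block reached through the IfTrue path
//
// A node n with n->in(0) == IfTrue is rewritten to n->set_req(0, B7->head()),
// so its control is the head of the block it will actually live in rather
// than the projection sitting at the bottom of the predecessor block.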
  76 void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
  77   const Node *in0 = n->in(0);
  78   assert(in0 != NULL, "Only control-dependent");
  79   const Node *p = in0->is_block_proj();
  80   if (p != NULL && p != n) {    // Control from a block projection?
  81     assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
  82     // Find trailing Region
  83     Block *pb = get_block_for_node(in0); // Block-projection already has basic block
  84     uint j = 0;
  85     if (pb->_num_succs != 1) {  // More than 1 successor?
  86       // Search for successor
  87       uint max = pb->number_of_nodes();
  88       assert( max > 1, "" );
  89       uint start = max - pb->_num_succs;
  90       // Find which output path belongs to projection
  91       for (j = start; j < max; j++) {
  92         if( pb->get_node(j) == in0 )
  93           break;
  94       }
  95       assert( j < max, "must find" );
  96       // Change control to match head of successor basic block
  97       j -= start;
  98     }
  99     n->set_req(0, pb->_succs[j]->head());
 100   }
 101 }
 102 
 103 
 104 //------------------------------schedule_pinned_nodes--------------------------
 105 // Set the basic block for Nodes pinned into blocks
 106 void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {
 107   // Allocate node stack of size C->unique()+8 to avoid frequent realloc
 108   GrowableArray <Node *> spstack(C->unique() + 8);
 109   spstack.push(_root);
 110   while (spstack.is_nonempty()) {
 111     Node* node = spstack.pop();
 112     if (!visited.test_set(node->_idx)) { // Test node and flag it as visited
 113       if (node->pinned() && !has_block(node)) {  // Pinned?  Nail it down!
 114         assert(node->in(0), "pinned Node must have Control");
 115         // Before setting block replace block_proj control edge
 116         replace_block_proj_ctrl(node);
 117         Node* input = node->in(0);
 118         while (!input->is_block_start()) {
 119           input = input->in(0);
 120         }
 121         Block* block = get_block_for_node(input); // Basic block of controlling input
 122         schedule_node_into_block(node, block);
 123       }
 124 
 125       // Process all inputs that are non-NULL
 126       for (int i = node->req() - 1; i >= 0; --i) {
 127         if (node->in(i) != NULL) {
 128           spstack.push(node->in(i));
 129         }
 130       }
 131     }
 132   }
 133 }
 134 
 135 #ifdef ASSERT
 136 // Assert that new input b2 is dominated by all previous inputs.
 137 // Check this by seeing that it is dominated by b1, the deepest
 138 // input block observed so far.
 139 static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
 140   if (b1 == NULL)  return;
 141   assert(b1->_dom_depth < b2->_dom_depth, "sanity");
 142   Block* tmp = b2;
 143   while (tmp != b1 && tmp != NULL) {
 144     tmp = tmp->_idom;
 145   }
 146   if (tmp != b1) {
 147     // Detected an unschedulable graph.  Print some nice stuff and die.
 148     tty->print_cr("!!! Unschedulable graph !!!");
 149     for (uint j=0; j<n->len(); j++) { // For all inputs
 150       Node* inn = n->in(j); // Get input
 151       if (inn == NULL)  continue;  // Ignore NULL, missing inputs
 152       Block* inb = cfg->get_block_for_node(inn);
 153       tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
 154                  inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
 155       inn->dump();
 156     }
 157     tty->print("Failing node: ");
 158     n->dump();
 159     assert(false, "unschedulable graph");
 160   }
 161 }
 162 #endif
 163 
 164 static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
 165   // Find the last input dominated by all other inputs.
 166   Block* deepb           = NULL;        // Deepest block so far
 167   int    deepb_dom_depth = 0;
 168   for (uint k = 0; k < n->len(); k++) { // For all inputs
 169     Node* inn = n->in(k);               // Get input
 170     if (inn == NULL)  continue;         // Ignore NULL, missing inputs
 171     Block* inb = cfg->get_block_for_node(inn);
 172     assert(inb != NULL, "must already have scheduled this input");
 173     if (deepb_dom_depth < (int) inb->_dom_depth) {
 174       // The new inb must be dominated by the previous deepb.
 175       // The various inputs must be linearly ordered in the dom
 176       // tree, or else there will not be a unique deepest block.
 177       DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
 178       deepb = inb;                      // Save deepest block
 179       deepb_dom_depth = deepb->_dom_depth;
 180     }
 181   }
 182   assert(deepb != NULL, "must be at least one input to n");
 183   return deepb;
 184 }
 185 
 186 
 187 //------------------------------schedule_early---------------------------------
 188 // Find the earliest Block any instruction can be placed in.  Some instructions
 189 // are pinned into Blocks.  Unpinned instructions can appear in the last block
 190 // in which all their inputs occur.
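// Illustrative sketch (hypothetical nodes and dom depths, not from this file):
//
//   B2 (dom_depth 2):  x = LoadI ...
//   B5 (dom_depth 4):  y = LoadI ...     // B2 dominates B5
//                      z = AddI x, y
//
// find_deepest_input(z) returns B5, so schedule_early maps the unpinned AddI
// to B5 -- the deepest block in which both of its inputs are available.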
 191 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
 192   // Allocate stack with enough space to avoid frequent realloc
 193   Node_Stack nstack(roots.Size() + 8);
 194   // _root will be processed among C->top() inputs
 195   roots.push(C->top());
 196   visited.set(C->top()->_idx);
 197 
 198   while (roots.size() != 0) {
 199     // Use local variables parent_node & input_index to cache the values
 200     // at the top of the stack.
 201     Node* parent_node = roots.pop();
 202     uint  input_index = 0;
 203 
 204     while (true) {
 205       if (input_index == 0) {
 206         // Fixup some control.  Constants without control get attached
 207         // to root and nodes that use is_block_proj() nodes should be attached
 208         // to the region that starts their block.
 209         const Node* control_input = parent_node->in(0);
 210         if (control_input != NULL) {
 211           replace_block_proj_ctrl(parent_node);
 212         } else {
 213           // Is a constant with NO inputs?
 214           if (parent_node->req() == 1) {
 215             parent_node->set_req(0, _root);
 216           }
 217         }
 218       }
 219 
 220       // First, visit all inputs and force them to get a block.  If an
 221       // input is already in a block we quit following inputs (to avoid
 222       // cycles). Instead we put that Node on a worklist to be handled
 223       // later (since ITS inputs may not have a block yet).
 224 
 225       // Assume all n's inputs will be processed
 226       bool done = true;
 227 
 228       while (input_index < parent_node->len()) {
 229         Node* in = parent_node->in(input_index++);
 230         if (in == NULL) {
 231           continue;
 232         }
 233 
 234         int is_visited = visited.test_set(in->_idx);
 235         if (!has_block(in)) {
 236           if (is_visited) {
 237             return false;
 238           }
 239           // Save parent node and next input's index.
 240           nstack.push(parent_node, input_index);
 241           // Process current input now.
 242           parent_node = in;
 243           input_index = 0;
 244           // Not all n's inputs processed.
 245           done = false;
 246           break;
 247         } else if (!is_visited) {
 248           // Visit this guy later, using worklist
 249           roots.push(in);
 250         }
 251       }
 252 
 253       if (done) {
 254         // All of n's inputs have been processed, complete post-processing.
 255 
 256         // Some instructions are pinned into a block.  These include Region,
 257         // Phi, Start, Return, and other control-dependent instructions and
 258         // any projections which depend on them.
 259         if (!parent_node->pinned()) {
 260           // Set earliest legal block.
 261           Block* earliest_block = find_deepest_input(parent_node, this);
 262           map_node_to_block(parent_node, earliest_block);
 263         } else {
 264           assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");
 265         }
 266 
 267         if (nstack.is_empty()) {
 268           // Finished all nodes on stack.
 269           // Process next node on the worklist 'roots'.
 270           break;
 271         }
 272         // Get saved parent node and next input's index.
 273         parent_node = nstack.node();
 274         input_index = nstack.index();
 275         nstack.pop();
 276       }
 277     }
 278   }
 279   return true;
 280 }
 281 
 282 //------------------------------dom_lca----------------------------------------
 283 // Find the least common ancestor in the dominator tree.
 284 // The 'LCA' argument is the current notion of the LCA, to be raised above 'this'.
 285 // As a convenient boundary condition, return 'this' if LCA is NULL.
 286 // Find the LCA of those two blocks.
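// Illustrative sketch (hypothetical dominator tree, not from this file):
//
//        B1 (_dom_depth 1)
//       /               \
//   B2 (_dom_depth 2)   B5 (_dom_depth 2)
//       |
//   B3 (_dom_depth 3)
//
// B3->dom_lca(B5): B3 walks up to B2 so the depths match; B2 != B5, so both
// step to their _idom and meet at B1, which is returned.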
 287 Block* Block::dom_lca(Block* LCA) {
 288   if (LCA == NULL || LCA == this)  return this;
 289 
 290   Block* anc = this;
 291   while (anc->_dom_depth > LCA->_dom_depth)
 292     anc = anc->_idom;           // Walk up till anc is as high as LCA
 293 
 294   while (LCA->_dom_depth > anc->_dom_depth)
 295     LCA = LCA->_idom;           // Walk up till LCA is as high as anc
 296 
 297   while (LCA != anc) {          // Walk both up till they are the same
 298     LCA = LCA->_idom;
 299     anc = anc->_idom;
 300   }
 301 
 302   return LCA;
 303 }
 304 
 305 //--------------------------raise_LCA_above_use--------------------------------
 306 // We are placing a definition, and have been given a def->use edge.
 307 // The definition must dominate the use, so move the LCA upward in the
 308 // dominator tree to dominate the use.  If the use is a phi, adjust
 309 // the LCA only with the phi input paths which actually use this def.
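// Illustrative sketch (hypothetical blocks, not from this file): suppose 'def'
// feeds a Phi in block B4 only through Phi input 2, and B4->pred(2) lives in
// block B3.  Then the LCA is combined with B3, not with B4:
//
//   B2: x1 = ...            B3: x2 = def
//         \                  /
//   B4:  phi = Phi(region, x1, x2)
//
// The def only needs to dominate the predecessor path that actually carries
// its value into the Phi.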
 310 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
 311   Block* buse = cfg->get_block_for_node(use);
 312   if (buse == NULL)    return LCA;   // Unused killing Projs have no use block
 313   if (!use->is_Phi())  return buse->dom_lca(LCA);
 314   uint pmax = use->req();       // Number of Phi inputs
 315   // Why doesn't this loop just break after finding the matching input to
 316   // the Phi?  Well...it's like this.  I do not have true def-use/use-def
 317   // chains.  Means I cannot distinguish, from the def-use direction, which
 318   // of many use-defs lead from the same use to the same def.  That is, this
 319   // Phi might have several uses of the same def.  Each use appears in a
 320   // different predecessor block.  But when I enter here, I cannot distinguish
 321   // which use-def edge I should find the predecessor block for.  So I find
 322   // them all.  Means I do a little extra work if a Phi uses the same value
 323   // more than once.
 324   for (uint j=1; j<pmax; j++) { // For all inputs
 325     if (use->in(j) == def) {    // Found matching input?
 326       Block* pred = cfg->get_block_for_node(buse->pred(j));
 327       LCA = pred->dom_lca(LCA);
 328     }
 329   }
 330   return LCA;
 331 }
 332 
 333 //----------------------------raise_LCA_above_marks----------------------------
 334 // Return a new LCA that dominates LCA and any of its marked predecessors.
 335 // Search all my parents up to 'early' (exclusive), looking for predecessors
 336 // which are marked with the given index.  Return the LCA (in the dom tree)
 337 // of all marked blocks.  If there are none marked, return the original
 338 // LCA.
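// Illustrative sketch (hypothetical blocks): if the upward search from LCA
// reaches predecessors B3 and B7 that were marked via set_raise_LCA_mark(mark),
// the returned block dominates LCA, B3 and B7 -- but the walk never climbs
// above 'early', where searching stops.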
 339 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
 340   Block_List worklist;
 341   worklist.push(LCA);
 342   while (worklist.size() > 0) {
 343     Block* mid = worklist.pop();
 344     if (mid == early)  continue;  // stop searching here
 345 
 346     // Test and set the visited bit.
 347     if (mid->raise_LCA_visited() == mark)  continue;  // already visited
 348 
 349     // Don't process the current LCA, otherwise the search may terminate early
 350     if (mid != LCA && mid->raise_LCA_mark() == mark) {
 351       // Raise the LCA.
 352       LCA = mid->dom_lca(LCA);
 353       if (LCA == early)  break;   // stop searching everywhere
 354       assert(early->dominates(LCA), "early is high enough");
 355       // Resume searching at that point, skipping intermediate levels.
 356       worklist.push(LCA);
 357       if (LCA == mid)
 358         continue; // Don't mark as visited to avoid early termination.
 359     } else {
 360       // Keep searching through this block's predecessors.
 361       for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
 362         Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
 363         worklist.push(mid_parent);
 364       }
 365     }
 366     mid->set_raise_LCA_visited(mark);
 367   }
 368   return LCA;
 369 }
 370 
 371 //--------------------------memory_early_block--------------------------------
 372 // This is a variation of find_deepest_input, the heart of schedule_early.
 373 // Find the "early" block for a load, if we considered only memory and
 374 // address inputs, that is, if other data inputs were ignored.
 375 //
 376 // Because a subset of edges are considered, the resulting block will
 377 // be earlier (at a shallower dom_depth) than the true schedule_early
 378 // point of the node. We compute this earlier block as a more permissive
 379 // site for anti-dependency insertion, but only if subsume_loads is enabled.
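// Illustrative sketch (hypothetical situation, assuming subsume_loads folded a
// load into a compare): such a "load" mach node may carry a non-memory data
// operand defined in a deep block B6 while its base, index and memory inputs
// are all available in B2.  schedule_early then says B6, but this routine
// returns B2, widening the region searched for anti-dependent stores.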
 380 static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
 381   Node* base;
 382   Node* index;
 383   Node* store = load->in(MemNode::Memory);
 384   load->as_Mach()->memory_inputs(base, index);
 385 
 386   assert(base != NodeSentinel && index != NodeSentinel,
 387          "unexpected base/index inputs");
 388 
 389   Node* mem_inputs[4];
 390   int mem_inputs_length = 0;
 391   if (base != NULL)  mem_inputs[mem_inputs_length++] = base;
 392   if (index != NULL) mem_inputs[mem_inputs_length++] = index;
 393   if (store != NULL) mem_inputs[mem_inputs_length++] = store;
 394 
 395   // In the comparison below, add one to account for the control input,
 396   // which may be null, but always takes up a spot in the in array.
 397   if (mem_inputs_length + 1 < (int) load->req()) {
 398     // This "load" has more inputs than just the memory, base and index inputs.
 399     // For purposes of checking anti-dependences, we need to start
 400     // from the early block of only the address portion of the instruction,
 401     // and ignore other blocks that may have factored into the wider
 402     // schedule_early calculation.
 403     if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
 404 
 405     Block* deepb           = NULL;        // Deepest block so far
 406     int    deepb_dom_depth = 0;
 407     for (int i = 0; i < mem_inputs_length; i++) {
 408       Block* inb = cfg->get_block_for_node(mem_inputs[i]);
 409       if (deepb_dom_depth < (int) inb->_dom_depth) {
 410         // The new inb must be dominated by the previous deepb.
 411         // The various inputs must be linearly ordered in the dom
 412         // tree, or else there will not be a unique deepest block.
 413         DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
 414         deepb = inb;                      // Save deepest block
 415         deepb_dom_depth = deepb->_dom_depth;
 416       }
 417     }
 418     early = deepb;
 419   }
 420 
 421   return early;
 422 }
 423 
 424 //--------------------------insert_anti_dependences---------------------------
 425 // A load may need to witness memory that nearby stores can overwrite.
 426 // For each nearby store, either insert an "anti-dependence" edge
 427 // from the load to the store, or else move LCA upward to force the
 428 // load to (eventually) be scheduled in a block above the store.
 429 //
 430 // Do not add edges to stores on distinct control-flow paths;
 431 // only add edges to stores which might interfere.
 432 //
 433 // Return the (updated) LCA.  There will not be any possibly interfering
 434 // store between the load's "early block" and the updated LCA.
 435 // Any stores in the updated LCA will have new precedence edges
 436 // back to the load.  The caller is expected to schedule the load
 437 // in the LCA, in which case the precedence edges will make LCM
 438 // preserve anti-dependences.  The caller may also hoist the load
 439 // above the LCA, if it is not the early block.
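// Illustrative sketch (hypothetical block contents, not from this file):
//
//   early:  x = LoadI [adr]          <-- 'load'
//   ...
//   LCA:    StoreI [adr], y          <-- possibly interfering store
//
// If the store sits in the load's early block, store->add_prec(load) pins the
// load before it; otherwise the store's block is marked and the LCA is raised
// (via raise_LCA_above_marks) so the load cannot sink past the store.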
 440 Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
 441   assert(load->needs_anti_dependence_check(), "must be a load of some sort");
 442   assert(LCA != NULL, "");
 443   DEBUG_ONLY(Block* LCA_orig = LCA);
 444 
 445   // Compute the alias index.  Loads and stores with different alias indices
 446   // do not need anti-dependence edges.
 447   uint load_alias_idx = C->get_alias_index(load->adr_type());
 448 #ifdef ASSERT
 449   if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
 450       (PrintOpto || VerifyAliases ||
 451        (PrintMiscellaneous && (WizardMode || Verbose)))) {
 452     // Load nodes should not consume all of memory.
 453     // Reporting a bottom type indicates a bug in adlc.
 454     // If some particular type of node validly consumes all of memory,
 455     // sharpen the preceding "if" to exclude it, so we can catch bugs here.
 456     tty->print_cr("*** Possible Anti-Dependence Bug:  Load consumes all of memory.");
 457     load->dump(2);
 458     if (VerifyAliases)  assert(load_alias_idx != Compile::AliasIdxBot, "");
 459   }
 460 #endif
 461   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
 462          "String compare is the only known 'load' that does not conflict with any stores");
 463   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
 464          "String equals is a 'load' that does not conflict with any stores");
 465   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
 466          "String indexOf is a 'load' that does not conflict with any stores");
 467   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
 468          "Arrays equals is a 'load' that does not conflict with any stores");
 469 
 470   if (!C->alias_type(load_alias_idx)->is_rewritable()) {
 471     // It is impossible to spoil this load by putting stores before it,
 472     // because we know that the stores will never update the value
 473     // which 'load' must witness.
 474     return LCA;
 475   }
 476 
 477   node_idx_t load_index = load->_idx;
 478 
 479   // Note the earliest legal placement of 'load', as determined
 480   // by the unique point in the dom tree where all memory effects
 481   // and other inputs are first available.  (Computed by schedule_early.)
 482   // For normal loads, 'early' is the shallowest place (dom graph wise)
 483   // to look for anti-deps between this load and any store.
 484   Block* early = get_block_for_node(load);
 485 
 486   // If we are subsuming loads, compute an "early" block that only considers
 487   // memory or address inputs. This block may be different than the
 488   // schedule_early block in that it could be at an even shallower depth in the
 489   // dominator tree, and allow for a broader discovery of anti-dependences.
 490   if (C->subsume_loads()) {
 491     early = memory_early_block(load, early, this);
 492   }
 493 
 494   ResourceArea *area = Thread::current()->resource_area();
 495   Node_List worklist_mem(area);     // prior memory state to store
 496   Node_List worklist_store(area);   // possible-def to explore
 497   Node_List worklist_visited(area); // visited mergemem nodes
 498   Node_List non_early_stores(area); // all relevant stores outside of early
 499   bool must_raise_LCA = false;
 500 
 501 #ifdef TRACK_PHI_INPUTS
 502   // %%% This extra checking fails because MergeMem nodes are not GVNed.
 503   // Provide "phi_inputs" to check if every input to a PhiNode is from the
 504   // original memory state.  This indicates a PhiNode which should not
 505   // prevent the load from sinking.  For such a block, set_raise_LCA_mark
 506   // may be overly conservative.
 507   // Mechanism: count inputs seen for each Phi encountered in worklist_store.
 508   DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
 509 #endif
 510 
 511   // 'load' uses some memory state; look for users of the same state.
 512   // Recurse through MergeMem nodes to the stores that use them.
 513 
 514   // Each of these stores is a possible definition of memory
 515   // that 'load' needs to use.  We need to force 'load'
 516   // to occur before each such store.  When the store is in
 517   // the same block as 'load', we insert an anti-dependence
 518   // edge load->store.
 519 
 520   // The relevant stores "nearby" the load consist of a tree rooted
 521   // at initial_mem, with internal nodes of type MergeMem.
 522   // Therefore, the branches visited by the worklist are of this form:
 523   //    initial_mem -> (MergeMem ->)* store
 524   // The anti-dependence constraints apply only to the fringe of this tree.
 525 
 526   Node* initial_mem = load->in(MemNode::Memory);
 527   worklist_store.push(initial_mem);
 528   worklist_visited.push(initial_mem);
 529   worklist_mem.push(NULL);
 530   while (worklist_store.size() > 0) {
 531     // Examine a nearby store to see if it might interfere with our load.
 532     Node* mem   = worklist_mem.pop();
 533     Node* store = worklist_store.pop();
 534     uint op = store->Opcode();
 535 
 536     // MergeMems do not directly have anti-deps.
 537     // Treat them as internal nodes in a forward tree of memory states,
 538     // the leaves of which are each a 'possible-def'.
 539     if (store == initial_mem    // root (exclusive) of tree we are searching
 540         || op == Op_MergeMem    // internal node of tree we are searching
 541         ) {
 542       mem = store;   // It's not a possibly interfering store.
 543       if (store == initial_mem)
 544         initial_mem = NULL;  // only process initial memory once
 545 
 546       for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
 547         store = mem->fast_out(i);
 548         if (store->is_MergeMem()) {
 549           // Be sure we don't get into combinatorial problems.
 550           // (Allow phis to be repeated; they can merge two relevant states.)
 551           uint j = worklist_visited.size();
 552           for (; j > 0; j--) {
 553             if (worklist_visited.at(j-1) == store)  break;
 554           }
 555           if (j > 0)  continue; // already on work list; do not repeat
 556           worklist_visited.push(store);
 557         }
 558         worklist_mem.push(mem);
 559         worklist_store.push(store);
 560       }
 561       continue;
 562     }
 563 
 564     if (op == Op_MachProj || op == Op_Catch)   continue;
 565     if (store->needs_anti_dependence_check())  continue;  // not really a store
 566 
 567     // Compute the alias index.  Loads and stores with different alias
 568     // indices do not need anti-dependence edges.  Wide MemBars are
 569     // anti-dependent on everything (except immutable memories).
 570     const TypePtr* adr_type = store->adr_type();
 571     if (!C->can_alias(adr_type, load_alias_idx))  continue;
 572 
 573     // Most slow-path runtime calls do NOT modify Java memory, but
 574     // they can block and so write Raw memory.
 575     if (store->is_Mach()) {
 576       MachNode* mstore = store->as_Mach();
 577       if (load_alias_idx != Compile::AliasIdxRaw) {
 578         // Check for call into the runtime using the Java calling
 579         // convention (and from there into a wrapper); it has no
 580         // _method.  Can't do this optimization for Native calls because
 581         // they CAN write to Java memory.
 582         if (mstore->ideal_Opcode() == Op_CallStaticJava) {
 583           assert(mstore->is_MachSafePoint(), "");
 584           MachSafePointNode* ms = (MachSafePointNode*) mstore;
 585           assert(ms->is_MachCallJava(), "");
 586           MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
 587           if (mcj->_method == NULL) {
 588             // These runtime calls do not write to Java visible memory
 589             // (other than Raw) and so do not require anti-dependence edges.
 590             continue;
 591           }
 592         }
 593         // Same for SafePoints: they read/write Raw but only read otherwise.
 594         // This is basically a workaround for SafePoints only defining control
 595         // instead of control + memory.
 596         if (mstore->ideal_Opcode() == Op_SafePoint)
 597           continue;
 598       } else {
 599         // Some raw memory, such as the load of "top" at an allocation,
 600         // can be control dependent on the previous safepoint. See
 601         // comments in GraphKit::allocate_heap() about control input.
 602         // Inserting an anti-dep between such a safepoint and a use
 603         // creates a cycle, and will cause a subsequent failure in
 604         // local scheduling.  (BugId 4919904)
 605         // (%%% How can a control input be a safepoint and not a projection??)
 606         if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
 607           continue;
 608       }
 609     }
 610 
 611     // Identify a block that the current load must be above,
 612     // or else observe that 'store' is all the way up in the
 613     // earliest legal block for 'load'.  In the latter case,
 614     // immediately insert an anti-dependence edge.
 615     Block* store_block = get_block_for_node(store);
 616     assert(store_block != NULL, "unused killing projections skipped above");
 617 
 618     if (store->is_Phi()) {
 619       // 'load' uses memory which is one (or more) of the Phi's inputs.
 620       // It must be scheduled not before the Phi, but rather before
 621       // each of the relevant Phi inputs.
 622       //
 623       // Instead of finding the LCA of all inputs to a Phi that match 'mem',
 624       // we mark each corresponding predecessor block and do a combined
 625       // hoisting operation later (raise_LCA_above_marks).
 626       //
 627       // Do not assert(store_block != early, "Phi merging memory after access")
 628       // PhiNode may be at start of block 'early' with backedge to 'early'
 629       DEBUG_ONLY(bool found_match = false);
 630       for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
 631         if (store->in(j) == mem) {   // Found matching input?
 632           DEBUG_ONLY(found_match = true);
 633           Block* pred_block = get_block_for_node(store_block->pred(j));
 634           if (pred_block != early) {
 635             // If any predecessor of the Phi matches the load's "early block",
 636             // we do not need a precedence edge between the Phi and 'load'
 637             // since the load will be forced into a block preceding the Phi.
 638             pred_block->set_raise_LCA_mark(load_index);
 639             assert(!LCA_orig->dominates(pred_block) ||
 640                    early->dominates(pred_block), "early is high enough");
 641             must_raise_LCA = true;
 642           } else {
 643             // anti-dependent upon PHI pinned below 'early', no edge needed
 644             LCA = early;             // but can not schedule below 'early'
 645           }
 646         }
 647       }
 648       assert(found_match, "no worklist bug");
 649 #ifdef TRACK_PHI_INPUTS
 650 #ifdef ASSERT
 651       // This assert asks about correct handling of PhiNodes, which may not
 652       // have all input edges directly from 'mem'. See BugId 4621264
 653       int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
 654       // Increment by exactly one even if there are multiple copies of 'mem'
 655       // coming into the phi, because we will run this block several times
 656       // if there are several copies of 'mem'.  (That's how DU iterators work.)
 657       phi_inputs.at_put(store->_idx, num_mem_inputs);
 658       assert(PhiNode::Input + num_mem_inputs < store->req(),
 659              "Expect at least one phi input will not be from original memory state");
 660 #endif //ASSERT
 661 #endif //TRACK_PHI_INPUTS
 662     } else if (store_block != early) {
 663       // 'store' is between the current LCA and earliest possible block.
 664       // Label its block, and decide later on how to raise the LCA
 665       // to include the effect on LCA of this store.
 666       // If this store's block gets chosen as the raised LCA, we
 667       // will find him on the non_early_stores list and stick him
 668       // with a precedence edge.
 669       // (But, don't bother if LCA is already raised all the way.)
 670       if (LCA != early) {
 671         store_block->set_raise_LCA_mark(load_index);
 672         must_raise_LCA = true;
 673         non_early_stores.push(store);
 674       }
 675     } else {
 676       // Found a possibly-interfering store in the load's 'early' block.
 677       // This means 'load' cannot sink at all in the dominator tree.
 678       // Add an anti-dep edge, and squeeze 'load' into the highest block.
 679       assert(store != load->in(0), "dependence cycle found");
 680       if (verify) {
 681         assert(store->find_edge(load) != -1, "missing precedence edge");
 682       } else {
 683         store->add_prec(load);
 684       }
 685       LCA = early;
 686       // This turns off the process of gathering non_early_stores.
 687     }
 688   }
 689   // (Worklist is now empty; all nearby stores have been visited.)
 690 
 691   // Finished if 'load' must be scheduled in its 'early' block.
 692   // If we found any stores there, they have already been given
 693   // precedence edges.
 694   if (LCA == early)  return LCA;
 695 
 696   // We get here only if there are no possibly-interfering stores
 697   // in the load's 'early' block.  Move LCA up above all predecessors
 698   // which contain stores we have noted.
 699   //
 700   // The raised LCA block can be a home to such interfering stores,
 701   // but its predecessors must not contain any such stores.
 702   //
 703   // The raised LCA will be a lower bound for placing the load,
 704   // preventing the load from sinking past any block containing
 705   // a store that may invalidate the memory state required by 'load'.
 706   if (must_raise_LCA)
 707     LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
 708   if (LCA == early)  return LCA;
 709 
 710   // Insert anti-dependence edges from 'load' to each store
 711   // in the non-early LCA block.
 712   // Mine the non_early_stores list for such stores.
 713   if (LCA->raise_LCA_mark() == load_index) {
 714     while (non_early_stores.size() > 0) {
 715       Node* store = non_early_stores.pop();
 716       Block* store_block = get_block_for_node(store);
 717       if (store_block == LCA) {
 718         // add anti_dependence from store to load in its own block
 719         assert(store != load->in(0), "dependence cycle found");
 720         if (verify) {
 721           assert(store->find_edge(load) != -1, "missing precedence edge");
 722         } else {
 723           store->add_prec(load);
 724         }
 725       } else {
 726         assert(store_block->raise_LCA_mark() == load_index, "block was marked");
 727         // Any other stores we found must be either inside the new LCA
 728         // or else outside the original LCA.  In the latter case, they
 729         // did not interfere with any use of 'load'.
 730         assert(LCA->dominates(store_block)
 731                || !LCA_orig->dominates(store_block), "no stray stores");
 732       }
 733     }
 734   }
 735 
 736   // Return the highest block containing stores; any stores
 737   // within that block have been given anti-dependence edges.
 738   return LCA;
 739 }
 740 
 741 // This class is used to iterate backwards over the nodes in the graph.
 742 
 743 class Node_Backward_Iterator {
 744 
 745 private:
 746   Node_Backward_Iterator();
 747 
 748 public:
 749   // Constructor for the iterator
 750   Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);
 751 
 752   // Return the next node in the backwards traversal, or NULL when done
 753   Node *next();
 754 
 755 private:
 756   VectorSet   &_visited;
 757   Node_List   &_stack;
 758   PhaseCFG &_cfg;
 759 };
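// Typical use, mirroring compute_latencies_backwards() and schedule_late()
// below:
//
//   Node_Backward_Iterator iter(root, visited, stack, cfg);
//   Node* n;
//   while ((n = iter.next()) != NULL) {
//     // visit n only after all of its uses have been visited
//   }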
 760 
 761 // Constructor for the Node_Backward_Iterator
 762 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
 763   : _visited(visited), _stack(stack), _cfg(cfg) {
 764   // The stack should contain exactly the root
 765   stack.clear();
 766   stack.push(root);
 767 
 768   // Clear the visited bits
 769   visited.Clear();
 770 }
 771 
 772 // Advance the Node_Backward_Iterator and return the next node
 773 Node *Node_Backward_Iterator::next() {
 774 
 775   // If the _stack is empty, then just return NULL: finished.
 776   if ( !_stack.size() )
 777     return NULL;
 778 
 779   // '_stack' is emulating a real _stack.  The 'visit-all-users' loop has been
 780   // made stateless, so I do not need to record the index 'i' on my _stack.
 781   // Instead I visit all users each time, scanning for unvisited users.
 782   // I visit unvisited not-anti-dependence users first, then anti-dependent
 783   // children next.
 784   Node *self = _stack.pop();
 785 
 786   // I cycle here when I am entering a deeper level of recursion.
 787   // The key variable 'self' was set prior to jumping here.
 788   while( 1 ) {
 789 
 790     _visited.set(self->_idx);
 791 
 792     // Now schedule all uses as late as possible.
 793     const Node* src = self->is_Proj() ? self->in(0) : self;
 794     uint src_rpo = _cfg.get_block_for_node(src)->_rpo;
 795 
 796     // Schedule all nodes in a post-order visit
 797     Node *unvisited = NULL;  // Unvisited child Node, if any
 798 
 799     // Scan for unvisited nodes
 800     for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
 801       // For all uses, schedule late
 802       Node* n = self->fast_out(i); // Use
 803 
 804       // Skip already visited children
 805       if ( _visited.test(n->_idx) )
 806         continue;
 807 
 808       // do not traverse backward control edges
 809       Node *use = n->is_Proj() ? n->in(0) : n;
 810       uint use_rpo = _cfg.get_block_for_node(use)->_rpo;
 811 
 812       if ( use_rpo < src_rpo )
 813         continue;
 814 
 815       // Phi nodes always precede uses in a basic block
 816       if ( use_rpo == src_rpo && use->is_Phi() )
 817         continue;
 818 
 819       unvisited = n;      // Found unvisited
 820 
 821       // Check for possible-anti-dependent
 822       if( !n->needs_anti_dependence_check() )
 823         break;            // Not visited, not anti-dep; schedule it NOW
 824     }
 825 
 826     // Did I find an unvisited not-anti-dependent Node?
 827     if ( !unvisited )
 828       break;                  // All done with children; post-visit 'self'
 829 
 830     // Visit the unvisited Node.  Contains the obvious push to
 831     // indicate I'm entering a deeper level of recursion.  I push the
 832     // old state onto the _stack and set a new state and loop (recurse).
 833     _stack.push(self);
 834     self = unvisited;
 835   } // End recursion loop
 836 
 837   return self;
 838 }
 839 
 840 //------------------------------ComputeLatenciesBackwards----------------------
 841 // Compute the latency of all the instructions.
 842 void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {
 843 #ifndef PRODUCT
 844   if (trace_opto_pipelining())
 845     tty->print("\n#---- ComputeLatenciesBackwards ----\n");
 846 #endif
 847 
 848   Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
 849   Node *n;
 850 
 851   // Walk over all the nodes from last to first
 852   while ((n = iter.next()) != NULL) {
 853     // Set the latency for the definitions of this instruction
 854     partial_latency_of_defs(n);
 855   }
 856 } // end ComputeLatenciesBackwards
 857 
 858 //------------------------------partial_latency_of_defs------------------------
 859 // Compute the latency impact of this node on all defs.  This computes
 860 // a number that increases as we approach the beginning of the routine.
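// Illustrative sketch (made-up numbers): if node n has latency 7 and the edge
// from input j contributes n->latency(j) == 3, then the def feeding input j is
// raised to latency 10, unless another use of that def already demanded more.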
 861 void PhaseCFG::partial_latency_of_defs(Node *n) {
 862   // Set the latency for this instruction
 863 #ifndef PRODUCT
 864   if (trace_opto_pipelining()) {
 865     tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
 866     dump();
 867   }
 868 #endif
 869 
 870   if (n->is_Proj()) {
 871     n = n->in(0);
 872   }
 873 
 874   if (n->is_Root()) {
 875     return;
 876   }
 877 
 878   uint nlen = n->len();
 879   uint use_latency = get_latency_for_node(n);
 880   uint use_pre_order = get_block_for_node(n)->_pre_order;
 881 
 882   for (uint j = 0; j < nlen; j++) {
 883     Node *def = n->in(j);
 884 
 885     if (!def || def == n) {
 886       continue;
 887     }
 888 
 889     // Walk backwards thru projections
 890     if (def->is_Proj()) {
 891       def = def->in(0);
 892     }
 893 
 894 #ifndef PRODUCT
 895     if (trace_opto_pipelining()) {
 896       tty->print("#    in(%2d): ", j);
 897       def->dump();
 898     }
 899 #endif
 900 
 901     // If the defining block is not known, assume it is ok
 902     Block *def_block = get_block_for_node(def);
 903     uint def_pre_order = def_block ? def_block->_pre_order : 0;
 904 
 905     if ((use_pre_order <  def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) {
 906       continue;
 907     }
 908 
 909     uint delta_latency = n->latency(j);
 910     uint current_latency = delta_latency + use_latency;
 911 
 912     if (get_latency_for_node(def) < current_latency) {
 913       set_latency_for_node(def, current_latency);
 914     }
 915 
 916 #ifndef PRODUCT
 917     if (trace_opto_pipelining()) {
 918       tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def));
 919     }
 920 #endif
 921   }
 922 }
 923 
 924 //------------------------------latency_from_use-------------------------------
 925 // Compute the latency of a specific use
 926 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
 927   // If self-reference, return no latency
 928   if (use == n || use->is_Root()) {
 929     return 0;
 930   }
 931 
 932   uint def_pre_order = get_block_for_node(def)->_pre_order;
 933   uint latency = 0;
 934 
 935   // If the use is not a projection, then it is simple...
 936   if (!use->is_Proj()) {
 937 #ifndef PRODUCT
 938     if (trace_opto_pipelining()) {
 939       tty->print("#    out(): ");
 940       use->dump();
 941     }
 942 #endif
 943 
 944     uint use_pre_order = get_block_for_node(use)->_pre_order;
 945 
 946     if (use_pre_order < def_pre_order)
 947       return 0;
 948 
 949     if (use_pre_order == def_pre_order && use->is_Phi())
 950       return 0;
 951 
 952     uint nlen = use->len();
 953     uint nl = get_latency_for_node(use);
 954 
 955     for ( uint j=0; j<nlen; j++ ) {
 956       if (use->in(j) == n) {
 957         // Change this if we want local latencies
 958         uint ul = use->latency(j);
 959         uint  l = ul + nl;
 960         if (latency < l) latency = l;
 961 #ifndef PRODUCT
 962         if (trace_opto_pipelining()) {
 963           tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, latency = %d",
 964                         nl, j, ul, l, latency);
 965         }
 966 #endif
 967       }
 968     }
 969   } else {
 970     // This is a projection, just grab the latency of the use(s)
 971     for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
 972       uint l = latency_from_use(use, def, use->fast_out(j));
 973       if (latency < l) latency = l;
 974     }
 975   }
 976 
 977   return latency;
 978 }
 979 
 980 //------------------------------latency_from_uses------------------------------
 981 // Compute the latency of this instruction relative to all of its uses.
 982 // This computes a number that increases as we approach the beginning of the
 983 // routine.
 984 void PhaseCFG::latency_from_uses(Node *n) {
 985   // Set the latency for this instruction
 986 #ifndef PRODUCT
 987   if (trace_opto_pipelining()) {
 988     tty->print("# latency_from_outputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
 989     dump();
 990   }
 991 #endif
 992   uint latency=0;
 993   const Node *def = n->is_Proj() ? n->in(0): n;
 994 
 995   for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
 996     uint l = latency_from_use(n, def, n->fast_out(i));
 997 
 998     if (latency < l) latency = l;
 999   }
1000 
1001   set_latency_for_node(n, latency);
1002 }
1003 
1004 //------------------------------hoist_to_cheaper_block-------------------------
1005 // Pick a block for node self, between early and LCA, that is a cheaper
1006 // alternative to LCA.
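// Illustrative sketch (made-up frequencies): with LCA->_freq == 0.9, a
// dominating candidate with freq 0.1 is strictly cheaper and becomes 'least';
// a candidate with freq 0.9000001 can only be chosen on the latency path,
// since it is within the delta tolerance but not strictly cheaper.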
1007 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
1008   const double delta = 1+PROB_UNLIKELY_MAG(4);
1009   Block* least       = LCA;
1010   double least_freq  = least->_freq;
1011   uint target        = get_latency_for_node(self);
1012   uint start_latency = get_latency_for_node(LCA->head());
1013   uint end_latency   = get_latency_for_node(LCA->get_node(LCA->end_idx()));
1014   bool in_latency    = (target <= start_latency);
1015   const Block* root_block = get_block_for_node(_root);
1016 
1017   // Turn off latency scheduling if scheduling is just plain off
1018   if (!C->do_scheduling())
1019     in_latency = true;
1020 
1021   // Do not hoist (to cover latency) instructions which target a
1022   // single register.  Hoisting stretches the live range of the
1023   // single register and may force spilling.
1024   MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1025   if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1026     in_latency = true;
1027 
1028 #ifndef PRODUCT
1029   if (trace_opto_pipelining()) {
1030     tty->print("# Find cheaper block for latency %d: ", get_latency_for_node(self));
1031     self->dump();
1032     tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1033       LCA->_pre_order,
1034       LCA->head()->_idx,
1035       start_latency,
1036       LCA->get_node(LCA->end_idx())->_idx,
1037       end_latency,
1038       least_freq);
1039   }
1040 #endif
1041 
1042   int cand_cnt = 0;  // number of candidates tried
1043 
1044   // Walk up the dominator tree from LCA (Lowest common ancestor) to
1045   // the earliest legal location.  Capture the least execution frequency.
1046   while (LCA != early) {
1047     LCA = LCA->_idom;         // Follow up the dominator tree
1048 
1049     if (LCA == NULL) {
1050       // Bailout without retry
1051       C->record_method_not_compilable("late schedule failed: LCA == NULL");
1052       return least;
1053     }
1054 
1055     // Don't hoist machine instructions to the root basic block
1056     if (mach && LCA == root_block)
1057       break;
1058 
1059     uint start_lat = get_latency_for_node(LCA->head());
1060     uint end_idx   = LCA->end_idx();
1061     uint end_lat   = get_latency_for_node(LCA->get_node(end_idx));
1062     double LCA_freq = LCA->_freq;
1063 #ifndef PRODUCT
1064     if (trace_opto_pipelining()) {
1065       tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1066         LCA->_pre_order, LCA->head()->_idx, start_lat, end_idx, end_lat, LCA_freq);
1067     }
1068 #endif
1069     cand_cnt++;
1070     if (LCA_freq < least_freq              || // Better Frequency
1071         (StressGCM && Compile::randomized_select(cand_cnt)) || // Should be randomly accepted in stress mode
1072          (!StressGCM                    &&    // Otherwise, choose with latency
1073           !in_latency                   &&    // Latency is not already covered
1074           LCA_freq < least_freq * delta &&    // No worse frequency
1075           target >= end_lat             &&    // within latency range
1076           !self->is_iteratively_computed() )  // But don't hoist IV increments
1077              // because they may end up above other uses of their phi forcing
1078              // their result register to be different from their input.
1079        ) {
1080       least = LCA;            // Found cheaper block
1081       least_freq = LCA_freq;
1082       start_latency = start_lat;
1083       end_latency = end_lat;
1084       if (target <= start_lat)
1085         in_latency = true;
1086     }
1087   }
1088 
1089 #ifndef PRODUCT
1090   if (trace_opto_pipelining()) {
1091     tty->print_cr("#  Choose block B%d with start latency=%d and freq=%g",
1092       least->_pre_order, start_latency, least_freq);
1093   }
1094 #endif
1095 
1096   // See if the latency needs to be updated
1097   if (target < end_latency) {
1098 #ifndef PRODUCT
1099     if (trace_opto_pipelining()) {
1100       tty->print_cr("#  Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1101     }
1102 #endif
1103     set_latency_for_node(self, end_latency);
1104     partial_latency_of_defs(self);
1105   }
1106 
1107   return least;
1108 }
1109 
1110 
1111 //------------------------------schedule_late-----------------------------------
1112 // Now schedule all codes as LATE as possible.  This is the LCA in the
1113 // dominator tree of all USES of a value.  Pick the block with the least
1114 // loop nesting depth that is lowest in the dominator tree.
1115 extern const char must_clone[];
1116 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1117 #ifndef PRODUCT
1118   if (trace_opto_pipelining())
1119     tty->print("\n#---- schedule_late ----\n");
1120 #endif
1121 
1122   Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
1123   Node *self;
1124 
1125   // Walk over all the nodes from last to first
1126   while ((self = iter.next()) != NULL) {
1127     Block* early = get_block_for_node(self); // Earliest legal placement
1128 
1129     if (self->is_top()) {
1130       // Top node goes in bb #2 with other constants.
1131       // It must be special-cased, because it has no out edges.
1132       early->add_inst(self);
1133       continue;
1134     }
1135 
1136     // No uses, just terminate
1137     if (self->outcnt() == 0) {
1138       assert(self->is_MachProj(), "sanity");
1139       continue;                   // Must be a dead machine projection
1140     }
1141 
1142     // If node is pinned in the block, then no scheduling can be done.
1143     if( self->pinned() )          // Pinned in block?
1144       continue;
1145 
1146     MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1147     if (mach) {
1148       switch (mach->ideal_Opcode()) {
1149       case Op_CreateEx:
1150         // Don't move exception creation
1151         early->add_inst(self);
1152         continue;
1153         break;
1154       case Op_CheckCastPP:
1155         // Don't move CheckCastPP nodes away from their input, if the input
1156         // is a rawptr (5071820).
1157         Node *def = self->in(1);
1158         if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1159           early->add_inst(self);
1160 #ifdef ASSERT
1161           _raw_oops.push(def);
1162 #endif
1163           continue;
1164         }
1165         break;
1166       }
1167     }
1168 
1169     // Gather LCA of all uses
1170     Block *LCA = NULL;
1171     {
1172       for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1173         // For all uses, find LCA
1174         Node* use = self->fast_out(i);
1175         LCA = raise_LCA_above_use(LCA, use, self, this);
1176       }
1177     }  // (Hide defs of imax, i from rest of block.)
1178 
1179     // Place temps in the block of their use.  This isn't a
1180     // requirement for correctness but it reduces useless
1181     // interference between temps and other nodes.
1182     if (mach != NULL && mach->is_MachTemp()) {
1183       map_node_to_block(self, LCA);
1184       LCA->add_inst(self);
1185       continue;
1186     }
1187 
1188     // Check if 'self' could be anti-dependent on memory
1189     if (self->needs_anti_dependence_check()) {
1190       // Hoist LCA above possible-defs and insert anti-dependences to
1191       // defs in new LCA block.
1192       LCA = insert_anti_dependences(LCA, self);
1193     }
1194 
1195     if (early->_dom_depth > LCA->_dom_depth) {
1196       // Somehow the LCA has moved above the earliest legal point.
1197       // (One way this can happen is via memory_early_block.)
1198       if (C->subsume_loads() == true && !C->failing()) {
1199         // Retry with subsume_loads == false
1200         // If this is the first failure, the sentinel string will "stick"
1201         // to the Compile object, and the C2Compiler will see it and retry.
1202         C->record_failure(C2Compiler::retry_no_subsuming_loads());
1203       } else {
1204         // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
1205         C->record_method_not_compilable("late schedule failed: incorrect graph");
1206       }
1207       return;
1208     }
1209 
1210     // If there is no opportunity to hoist, then we're done.
1211     // In stress mode, try to hoist even the single operations.
1212     bool try_to_hoist = StressGCM || (LCA != early);
1213 
1214     // Must clone guys stay next to use; no hoisting allowed.
1215     // Also cannot hoist guys that alter memory or are otherwise not
1216     // allocatable (hoisting can make a value live longer, leading to
1217     // anti and output dependency problems which are normally resolved
1218     // by the register allocator giving everyone a different register).
1219     if (mach != NULL && must_clone[mach->ideal_Opcode()])
1220       try_to_hoist = false;
1221 
1222     Block* late = NULL;
1223     if (try_to_hoist) {
1224       // Now find the block with the least execution frequency.
1225       // Start at the latest schedule and work up to the earliest schedule
1226       // in the dominator tree.  Thus the Node will dominate all its uses.
1227       late = hoist_to_cheaper_block(LCA, early, self);
1228     } else {
1229       // Just use the LCA of the uses.
1230       late = LCA;
1231     }
1232 
1233     // Put the node into target block
1234     schedule_node_into_block(self, late);
1235 
1236 #ifdef ASSERT
1237     if (self->needs_anti_dependence_check()) {
1238       // since precedence edges are only inserted when we're sure they
1239       // are needed make sure that after placement in a block we don't
1240       // need any new precedence edges.
1241       verify_anti_dependences(late, self);
1242     }
1243 #endif
1244   } // Loop until all nodes have been visited
1245 
1246 } // end ScheduleLate
1247 
1248 //------------------------------GlobalCodeMotion-------------------------------
1249 void PhaseCFG::global_code_motion() {
1250   ResourceMark rm;
1251 
1252 #ifndef PRODUCT
1253   if (trace_opto_pipelining()) {
1254     tty->print("\n---- Start GlobalCodeMotion ----\n");
1255   }
1256 #endif
1257 
1258   // Initialize the node to block mapping for things on the proj_list
1259   for (uint i = 0; i < _matcher.number_of_projections(); i++) {
1260     unmap_node_from_block(_matcher.get_projection(i));
1261   }
1262 
1263   // Set the basic block for Nodes pinned into blocks
1264   Arena* arena = Thread::current()->resource_area();
1265   VectorSet visited(arena);
1266   schedule_pinned_nodes(visited);
1267 
1268   // Find the earliest Block any instruction can be placed in.  Some
1269   // instructions are pinned into Blocks.  Unpinned instructions can
1270   // appear in the last block in which all their inputs occur.
1271   visited.Clear();
1272   Node_List stack(arena);
1273   // Pre-grow the list
1274   stack.map((C->unique() >> 1) + 16, NULL);
1275   if (!schedule_early(visited, stack)) {
1276     // Bailout without retry
1277     C->record_method_not_compilable("early schedule failed");
1278     return;
1279   }
1280 
1281   // Build Def-Use edges.
1282   // Compute the latency information (via backwards walk) for all the
1283   // instructions in the graph
1284   _node_latency = new GrowableArray<uint>(); // resource_area allocation
1285 
1286   if (C->do_scheduling()) {
1287     compute_latencies_backwards(visited, stack);
1288   }
1289 
1290   // Now schedule all codes as LATE as possible.  This is the LCA in the
1291   // dominator tree of all USES of a value.  Pick the block with the least
1292   // loop nesting depth that is lowest in the dominator tree.
1293   // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1294   schedule_late(visited, stack);
1295   if (C->failing()) {
1296     // schedule_late fails only when graph is incorrect.
1297     assert(!VerifyGraphEdges, "verification should have failed");
1298     return;
1299   }
1300 
1301 #ifndef PRODUCT
1302   if (trace_opto_pipelining()) {
1303     tty->print("\n---- Detect implicit null checks ----\n");
1304   }
1305 #endif
1306 
1307   // Detect implicit-null-check opportunities.  Basically, find NULL checks
1308   // with suitable memory ops nearby.  Use the memory op to do the NULL check.
1309   // I can generate a memory op if there is not one nearby.
1310   if (C->is_method_compilation()) {
1311     // By reversing the loop direction we get a very minor gain on mpegaudio.
1312     // Feel free to revert to a forward loop for clarity.
1313     // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1314     for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) {
1315       Node* proj = _matcher._null_check_tests[i];
1316       Node* val  = _matcher._null_check_tests[i + 1];
1317       Block* block = get_block_for_node(proj);
1318       implicit_null_check(block, proj, val, C->allowed_deopt_reasons());
1319       // The implicit_null_check will only perform the transformation
1320       // if the null branch is truly uncommon, *and* it leads to an
1321       // uncommon trap.  Combined with the too_many_traps guards
1322       // above, this prevents SEGV storms reported in 6366351,
1323       // by recompiling offending methods without this optimization.
1324     }
1325   }
1326 
1327 #ifndef PRODUCT
1328   if (trace_opto_pipelining()) {
1329     tty->print("\n---- Start Local Scheduling ----\n");
1330   }
1331 #endif
1332 
1333   // Schedule locally.  Right now a simple topological sort.
1334   // Later, do a real latency aware scheduler.
1335   GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1);
1336   visited.Clear();
1337   for (uint i = 0; i < number_of_blocks(); i++) {
1338     Block* block = get_block(i);
1339     if (!schedule_local(block, ready_cnt, visited)) {
1340       if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1341         C->record_method_not_compilable("local schedule failed");
1342       }
1343       return;
1344     }
1345   }
1346 
1347   // If we inserted any instructions between a Call and its CatchNode,
1348   // clone the instructions on all paths below the Catch.
1349   for (uint i = 0; i < number_of_blocks(); i++) {
1350     Block* block = get_block(i);
1351     call_catch_cleanup(block);
1352   }
1353 
1354 #ifndef PRODUCT
1355   if (trace_opto_pipelining()) {
1356     tty->print("\n---- After GlobalCodeMotion ----\n");
1357     for (uint i = 0; i < number_of_blocks(); i++) {
1358       Block* block = get_block(i);
1359       block->dump();
1360     }
1361   }
1362 #endif
1363   // Dead.  Poison the pointer so any stale use of the latency info fails fast.
1364   _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1365 }
1366 
1367 bool PhaseCFG::do_global_code_motion() {
1368 
1369   build_dominator_tree();
1370   if (C->failing()) {
1371     return false;
1372   }
1373 
1374   NOT_PRODUCT( C->verify_graph_edges(); )
1375 
1376   estimate_block_frequency();
1377 
1378   global_code_motion();
1379 
1380   if (C->failing()) {
1381     return false;
1382   }
1383 
1384   return true;
1385 }
1386 
1387 //------------------------------Estimate_Block_Frequency-----------------------
1388 // Estimate block frequencies based on IfNode probabilities.
1389 void PhaseCFG::estimate_block_frequency() {
1390 
1391   // Force conditional branches leading to uncommon traps to be unlikely,
1392   // not because we get to the uncommon_trap with less relative frequency,
1393   // but because an uncommon_trap typically causes a deopt, so we only get
1394   // there once.
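       // The walk below starts from predecessors of the root block that contain
       // uncommon traps and works backwards through single-successor blocks,
       // asking each two-way branch it reaches to mark the trapping path unlikely.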
1395   if (C->do_freq_based_layout()) {
1396     Block_List worklist;
1397     Block* root_blk = get_block(0);
1398     for (uint i = 1; i < root_blk->num_preds(); i++) {
1399       Block *pb = get_block_for_node(root_blk->pred(i));
1400       if (pb->has_uncommon_code()) {
1401         worklist.push(pb);
1402       }
1403     }
1404     while (worklist.size() > 0) {
1405       Block* uct = worklist.pop();
1406       if (uct == get_root_block()) {
1407         continue;
1408       }
1409       for (uint i = 1; i < uct->num_preds(); i++) {
1410         Block *pb = get_block_for_node(uct->pred(i));
1411         if (pb->_num_succs == 1) {
1412           worklist.push(pb);
1413         } else if (pb->num_fall_throughs() == 2) {
1414           pb->update_uncommon_branch(uct);
1415         }
1416       }
1417     }
1418   }
1419 
1420   // Create the loop tree and calculate loop depth.
1421   _root_loop = create_loop_tree();
1422   _root_loop->compute_loop_depth(0);
1423 
1424   // Compute block frequency of each block, relative to a single loop entry.
1425   _root_loop->compute_freq();
1426 
1427   // Adjust all frequencies to be relative to a single method entry
1428   _root_loop->_freq = 1.0;
1429   _root_loop->scale_freq();
1430 
1431   // Save the outermost loop frequency for the LRG frequency threshold
1432   _outer_loop_frequency = _root_loop->outer_loop_freq();
1433 
1434   // force paths ending at uncommon traps to be infrequent
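       // (walk backwards through single-successor predecessors of trapping blocks,
       // clamping each visited block's frequency to PROB_MIN)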
1435   if (!C->do_freq_based_layout()) {
1436     Block_List worklist;
1437     Block* root_blk = get_block(0);
1438     for (uint i = 1; i < root_blk->num_preds(); i++) {
1439       Block *pb = get_block_for_node(root_blk->pred(i));
1440       if (pb->has_uncommon_code()) {
1441         worklist.push(pb);
1442       }
1443     }
1444     while (worklist.size() > 0) {
1445       Block* uct = worklist.pop();
1446       uct->_freq = PROB_MIN;
1447       for (uint i = 1; i < uct->num_preds(); i++) {
1448         Block *pb = get_block_for_node(uct->pred(i));
1449         if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1450           worklist.push(pb);
1451         }
1452       }
1453     }
1454   }
1455 
1456 #ifdef ASSERT
1457   for (uint i = 0; i < number_of_blocks(); i++) {
1458     Block* b = get_block(i);
1459     assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1460   }
1461 #endif
1462 
1463 #ifndef PRODUCT
1464   if (PrintCFGBlockFreq) {
1465     tty->print_cr("CFG Block Frequencies");
1466     _root_loop->dump_tree();
1467     if (Verbose) {
1468       tty->print_cr("PhaseCFG dump");
1469       dump();
1470       tty->print_cr("Node dump");
1471       _root->dump(99999);
1472     }
1473   }
1474 #endif
1475 }
1476 
1477 //----------------------------create_loop_tree--------------------------------
1478 // Create a loop tree from the CFG
1479 CFGLoop* PhaseCFG::create_loop_tree() {
1480 
1481 #ifdef ASSERT
1482   assert(get_block(0) == get_root_block(), "first block should be root block");
1483   for (uint i = 0; i < number_of_blocks(); i++) {
1484     Block* block = get_block(i);
1485     // Check that the _loop fields are clear...we could clear them if not.
1486     assert(block->_loop == NULL, "clear _loop expected");
1487     // Sanity check that the RPO numbering is reflected in the _blocks array.
1488     // It doesn't have to be for the loop tree to be built, but if it is not,
1489     // then the blocks have been reordered since dominator graph building,
1490     // which calls the RPO numbering into question.
1491     assert(block->_rpo == i, "unexpected reverse post order number");
1492   }
1493 #endif
1494 
1495   int idct = 0;
1496   CFGLoop* root_loop = new CFGLoop(idct++);
1497 
1498   Block_List worklist;
1499 
1500   // Assign blocks to loops
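       // Blocks are visited from highest RPO to lowest, so inner loop heads are
       // seen before the heads of their enclosing loops; push_pred() can then
       // treat an already-assigned inner loop as a single unit.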
1501   for(uint i = number_of_blocks() - 1; i > 0; i-- ) { // skip Root block
1502     Block* block = get_block(i);
1503 
1504     if (block->head()->is_Loop()) {
1505       Block* loop_head = block;
1506       assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1507       Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1508       Block* tail = get_block_for_node(tail_n);
1509 
1510       // Defensively filter out Loop nodes for non-single-entry loops.
1511       // For all reasonable loops, the head occurs before the tail in RPO.
1512       if (i <= tail->_rpo) {
1513 
1514         // The tail and (recursive) predecessors of the tail
1515         // are made members of a new loop.
1516 
1517         assert(worklist.size() == 0, "nonempty worklist");
1518         CFGLoop* nloop = new CFGLoop(idct++);
1519         assert(loop_head->_loop == NULL, "just checking");
1520         loop_head->_loop = nloop;
1521         // Add to nloop so push_pred() will skip over inner loops
1522         nloop->add_member(loop_head);
1523         nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);
1524 
1525         while (worklist.size() > 0) {
1526           Block* member = worklist.pop();
1527           if (member != loop_head) {
1528             for (uint j = 1; j < member->num_preds(); j++) {
1529               nloop->push_pred(member, j, worklist, this);
1530             }
1531           }
1532         }
1533       }
1534     }
1535   }
1536 
1537   // Create a member list for each loop consisting
1538   // of both blocks and (immediate child) loops.
1539   for (uint i = 0; i < number_of_blocks(); i++) {
1540     Block* block = get_block(i);
1541     CFGLoop* lp = block->_loop;
1542     if (lp == NULL) {
1543       // Not assigned to a loop. Add it to the method's pseudo loop.
1544       block->_loop = root_loop;
1545       lp = root_loop;
1546     }
1547     if (lp == root_loop || block != lp->head()) { // loop heads are already members
1548       lp->add_member(block);
1549     }
1550     if (lp != root_loop) {
1551       if (lp->parent() == NULL) {
1552         // Not a nested loop. Make it a child of the method's pseudo loop.
1553         root_loop->add_nested_loop(lp);
1554       }
1555       if (block == lp->head()) {
1556         // Add nested loop to member list of parent loop.
1557         lp->parent()->add_member(lp);
1558       }
1559     }
1560   }
1561 
1562   return root_loop;
1563 }
1564 
1565 //------------------------------push_pred--------------------------------------
1566 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {
1567   Node* pred_n = blk->pred(i);
1568   Block* pred = cfg->get_block_for_node(pred_n);
1569   CFGLoop *pred_loop = pred->_loop;
1570   if (pred_loop == NULL) {
1571     // Filter out blocks for non-single-entry loops.
1572     // For all reasonable loops, the head occurs before the tail in RPO.
1573     if (pred->_rpo > head()->_rpo) {
1574       pred->_loop = this;
1575       worklist.push(pred);
1576     }
1577   } else if (pred_loop != this) {
1578     // Nested loop.
1579     while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1580       pred_loop = pred_loop->_parent;
1581     }
1582     // Make pred's loop be a child
1583     if (pred_loop->_parent == NULL) {
1584       add_nested_loop(pred_loop);
1585       // Continue with loop entry predecessor.
1586       Block* pred_head = pred_loop->head();
1587       assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1588       assert(pred_head != head(), "loop head in only one loop");
1589       push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);
1590     } else {
1591       assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1592     }
1593   }
1594 }
1595 
1596 //------------------------------add_nested_loop--------------------------------
1597 // Make cl a child of the current loop in the loop tree.
1598 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1599   assert(_parent == NULL, "no parent yet");
1600   assert(cl != this, "not my own parent");
1601   cl->_parent = this;
1602   CFGLoop* ch = _child;
1603   if (ch == NULL) {
1604     _child = cl;
1605   } else {
1606     while (ch->_sibling != NULL) { ch = ch->_sibling; }
1607     ch->_sibling = cl;
1608   }
1609 }
1610 
1611 //------------------------------compute_loop_depth-----------------------------
1612 // Store the loop depth in each CFGLoop object.
1613 // Recursively walk the children to do the same for them.
1614 void CFGLoop::compute_loop_depth(int depth) {
1615   _depth = depth;
1616   CFGLoop* ch = _child;
1617   while (ch != NULL) {
1618     ch->compute_loop_depth(depth + 1);
1619     ch = ch->_sibling;
1620   }
1621 }
1622 
1623 //------------------------------compute_freq-----------------------------------
1624 // Compute the frequency of each block and loop, relative to a single entry
1625 // into the dominating loop head.
1626 void CFGLoop::compute_freq() {
1627   // Bottom up traversal of loop tree (visit inner loops first.)
1628   // Set loop head frequency to 1.0, then transitively
1629   // compute frequency for all successors in the loop,
1630   // as well as for each exit edge.  Inner loops are
1631   // treated as single blocks with loop exit targets
1632   // as the successor blocks.
1633 
1634   // Nested loops first
1635   CFGLoop* ch = _child;
1636   while (ch != NULL) {
1637     ch->compute_freq();
1638     ch = ch->_sibling;
1639   }
1640   assert (_members.length() > 0, "no empty loops");
1641   Block* hd = head();
1642   hd->_freq = 1.0;
1643   for (int i = 0; i < _members.length(); i++) {
1644     CFGElement* s = _members.at(i);
1645     double freq = s->_freq;
1646     if (s->is_block()) {
1647       Block* b = s->as_Block();
1648       for (uint j = 0; j < b->_num_succs; j++) {
1649         Block* sb = b->_succs[j];
1650         update_succ_freq(sb, freq * b->succ_prob(j));
1651       }
1652     } else {
1653       CFGLoop* lp = s->as_CFGLoop();
1654       assert(lp->_parent == this, "immediate child");
1655       for (int k = 0; k < lp->_exits.length(); k++) {
1656         Block* eb = lp->_exits.at(k).get_target();
1657         double prob = lp->_exits.at(k).get_prob();
1658         update_succ_freq(eb, freq * prob);
1659       }
1660     }
1661   }
1662 
1663   // For all loops other than the outer, "method" loop,
1664   // sum and normalize the exit probability. The "method" loop
1665   // should keep the initial exit probability of 1, so that
1666   // inner blocks do not get erroneously scaled.
1667   if (_depth != 0) {
1668     // Total the exit probabilities for this loop.
1669     double exits_sum = 0.0;
1670     for (int i = 0; i < _exits.length(); i++) {
1671       exits_sum += _exits.at(i).get_prob();
1672     }
1673 
1674     // Normalize the exit probabilities. Until now, the
1675     // probabilities estimate the possibility of exit per
1676     // a single loop iteration; afterward, they estimate
1677     // the probability of exit per loop entry.
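         // For example, per-iteration exit probabilities of 0.05 and 0.15 sum
         // to 0.20 and normalize to 0.25 and 0.75 per loop entry.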
1678     for (int i = 0; i < _exits.length(); i++) {
1679       Block* et = _exits.at(i).get_target();
1680       float new_prob = 0.0f;
1681       if (_exits.at(i).get_prob() > 0.0f) {
1682         new_prob = _exits.at(i).get_prob() / exits_sum;
1683       }
1684       BlockProbPair bpp(et, new_prob);
1685       _exits.at_put(i, bpp);
1686     }
1687 
1688     // Save the total, but guard against unreasonable probability,
1689     // as the value is used to estimate the loop trip count.
1690     // An infinite trip count would blur relative block
1691     // frequencies.
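         // (e.g. an exits_sum of 0.20 corresponds to an estimated trip count of
         // roughly 1/0.20 = 5; the clamping below keeps that estimate finite)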
1692     if (exits_sum > 1.0) exits_sum = 1.0;
1693     if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
1694     _exit_prob = exits_sum;
1695   }
1696 }
1697 
1698 //------------------------------succ_prob-------------------------------------
1699 // Determine the probability of reaching successor 'i' from the receiver block.
1700 float Block::succ_prob(uint i) {
1701   int eidx = end_idx();
1702   Node *n = get_node(eidx);  // Get ending Node
1703 
1704   int op = n->Opcode();
1705   if (n->is_Mach()) {
1706     if (n->is_MachNullCheck()) {
1707       // Can only reach here if called after lcm. The original Op_If is gone,
1708       // so we attempt to infer the probability from one or both of the
1709       // successor blocks.
1710       assert(_num_succs == 2, "expecting 2 successors of a null check");
1711       // If either successor has only one predecessor, then the
1712       // probability estimate can be derived using the
1713       // relative frequency of the successor and this block.
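           // (num_preds() is one more than the number of true CFG predecessors,
           // so a value of 2 means a single predecessor)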
1714       if (_succs[i]->num_preds() == 2) {
1715         return _succs[i]->_freq / _freq;
1716       } else if (_succs[1-i]->num_preds() == 2) {
1717         return 1 - (_succs[1-i]->_freq / _freq);
1718       } else {
1719         // Estimate using both successor frequencies
1720         float freq = _succs[i]->_freq;
1721         return freq / (freq + _succs[1-i]->_freq);
1722       }
1723     }
1724     op = n->as_Mach()->ideal_Opcode();
1725   }
1726 
1727 
1728   // Switch on branch type
1729   switch( op ) {
1730   case Op_CountedLoopEnd:
1731   case Op_If: {
1732     assert (i < 2, "just checking");
1733     // Conditionals pass on only part of their frequency
1734     float prob  = n->as_MachIf()->_prob;
1735     assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
1736     // If succ[i] is the FALSE branch, invert path info
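         // (successor projections are stored in the block immediately after the
         // branch node, so succ[i]'s projection sits at index eidx + 1 + i)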
1737     if( get_node(i + eidx + 1)->Opcode() == Op_IfFalse ) {
1738       return 1.0f - prob; // not taken
1739     } else {
1740       return prob; // taken
1741     }
1742   }
1743 
1744   case Op_Jump:
1745     // Divide the frequency between all successors evenly
1746     return 1.0f/_num_succs;
1747 
1748   case Op_Catch: {
1749     const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1750     if (ci->_con == CatchProjNode::fall_through_index) {
1751       // Fall-thru path gets the lion's share.
1752       return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
1753     } else {
1754       // Presume exceptional paths are equally unlikely
1755       return PROB_UNLIKELY_MAG(5);
1756     }
1757   }
1758 
1759   case Op_Root:
1760   case Op_Goto:
1761     // Pass frequency straight thru to target
1762     return 1.0f;
1763 
1764   case Op_NeverBranch:
1765     return 0.0f;
1766 
1767   case Op_TailCall:
1768   case Op_TailJump:
1769   case Op_Return:
1770   case Op_Halt:
1771   case Op_Rethrow:
1772     // Do not push out freq to root block
1773     return 0.0f;
1774 
1775   default:
1776     ShouldNotReachHere();
1777   }
1778 
1779   return 0.0f;
1780 }
1781 
1782 //------------------------------num_fall_throughs-----------------------------
1783 // Return the number of fall-through candidates for a block
1784 int Block::num_fall_throughs() {
1785   int eidx = end_idx();
1786   Node *n = get_node(eidx);  // Get ending Node
1787 
1788   int op = n->Opcode();
1789   if (n->is_Mach()) {
1790     if (n->is_MachNullCheck()) {
1791       // In theory, either side can fall through; for simplicity's sake,
1792       // assume only the false branch can for now.
1793       return 1;
1794     }
1795     op = n->as_Mach()->ideal_Opcode();
1796   }
1797 
1798   // Switch on branch type
1799   switch( op ) {
1800   case Op_CountedLoopEnd:
1801   case Op_If:
1802     return 2;
1803 
1804   case Op_Root:
1805   case Op_Goto:
1806     return 1;
1807 
1808   case Op_Catch: {
1809     for (uint i = 0; i < _num_succs; i++) {
1810       const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1811       if (ci->_con == CatchProjNode::fall_through_index) {
1812         return 1;
1813       }
1814     }
1815     return 0;
1816   }
1817 
1818   case Op_Jump:
1819   case Op_NeverBranch:
1820   case Op_TailCall:
1821   case Op_TailJump:
1822   case Op_Return:
1823   case Op_Halt:
1824   case Op_Rethrow:
1825     return 0;
1826 
1827   default:
1828     ShouldNotReachHere();
1829   }
1830 
1831   return 0;
1832 }
1833 
1834 //------------------------------succ_fall_through-----------------------------
1835 // Return true if a specific successor could be fall-through target.
1836 bool Block::succ_fall_through(uint i) {
1837   int eidx = end_idx();
1838   Node *n = get_node(eidx);  // Get ending Node
1839 
1840   int op = n->Opcode();
1841   if (n->is_Mach()) {
1842     if (n->is_MachNullCheck()) {
1843       // In theory, either side can fall through; for simplicity's sake,
1844       // assume only the false branch can for now.
1845       return get_node(i + eidx + 1)->Opcode() == Op_IfFalse;
1846     }
1847     op = n->as_Mach()->ideal_Opcode();
1848   }
1849 
1850   // Switch on branch type
1851   switch( op ) {
1852   case Op_CountedLoopEnd:
1853   case Op_If:
1854   case Op_Root:
1855   case Op_Goto:
1856     return true;
1857 
1858   case Op_Catch: {
1859     const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1860     return ci->_con == CatchProjNode::fall_through_index;
1861   }
1862 
1863   case Op_Jump:
1864   case Op_NeverBranch:
1865   case Op_TailCall:
1866   case Op_TailJump:
1867   case Op_Return:
1868   case Op_Halt:
1869   case Op_Rethrow:
1870     return false;
1871 
1872   default:
1873     ShouldNotReachHere();
1874   }
1875 
1876   return false;
1877 }
1878 
1879 //------------------------------update_uncommon_branch------------------------
1880 // Update the probability of a two-branch to be uncommon
1881 void Block::update_uncommon_branch(Block* ub) {
1882   int eidx = end_idx();
1883   Node *n = get_node(eidx);  // Get ending Node
1884 
1885   int op = n->as_Mach()->ideal_Opcode();
1886 
1887   assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If");
1888   assert(num_fall_throughs() == 2, "must be a two way branch block");
1889 
1890   // Which successor is ub?
1891   uint s;
1892   for (s = 0; s <_num_succs; s++) {
1893     if (_succs[s] == ub) break;
1894   }
1895   assert(s < 2, "uncommon successor must be found");
1896 
1897   // If ub is the true path, make the probability small; otherwise
1898   // ub is the false path, so make the probability large.
1899   bool invert = (get_node(s + eidx + 1)->Opcode() == Op_IfFalse);
1900 
1901   // Get existing probability
1902   float p = n->as_MachIf()->_prob;
1903 
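       // Temporarily flip p so it measures the probability of reaching ub, clamp
       // it to PROB_MIN, then flip back to the branch's taken-probability.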
1904   if (invert) p = 1.0 - p;
1905   if (p > PROB_MIN) {
1906     p = PROB_MIN;
1907   }
1908   if (invert) p = 1.0 - p;
1909 
1910   n->as_MachIf()->_prob = p;
1911 }
1912 
1913 //------------------------------update_succ_freq-------------------------------
1914 // Update the appropriate frequency associated with block 'b', a successor of
1915 // a block in this loop.
1916 void CFGLoop::update_succ_freq(Block* b, double freq) {
1917   if (b->_loop == this) {
1918     if (b == head()) {
1919       // Back branch within the loop.
1920       // Do nothing now; the loop-carried frequency will be
1921       // adjusted later in scale_freq().
1922     } else {
1923       // simple branch within the loop
1924       b->_freq += freq;
1925     }
1926   } else if (!in_loop_nest(b)) {
1927     // branch is exit from this loop
1928     BlockProbPair bpp(b, freq);
1929     _exits.append(bpp);
1930   } else {
1931     // branch into nested loop
1932     CFGLoop* ch = b->_loop;
1933     ch->_freq += freq;
1934   }
1935 }
1936 
1937 //------------------------------in_loop_nest-----------------------------------
1938 // Determine if block b is in the receiver's loop nest.
1939 bool CFGLoop::in_loop_nest(Block* b) {
1940   int depth = _depth;
1941   CFGLoop* b_loop = b->_loop;
1942   int b_depth = b_loop->_depth;
1943   if (depth == b_depth) {
1944     return true;
1945   }
1946   while (b_depth > depth) {
1947     b_loop = b_loop->_parent;
1948     b_depth = b_loop->_depth;
1949   }
1950   return b_loop == this;
1951 }
1952 
1953 //------------------------------scale_freq-------------------------------------
1954 // Scale frequency of loops and blocks by trip counts from outer loops
1955 // Do a top down traversal of loop tree (visit outer loops first.)
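     // For example, a block with a per-entry frequency of 0.5 inside a loop whose
     // scaled frequency works out to 10 ends up with a block frequency of 5.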
1956 void CFGLoop::scale_freq() {
1957   double loop_freq = _freq * trip_count();
1958   _freq = loop_freq;
1959   for (int i = 0; i < _members.length(); i++) {
1960     CFGElement* s = _members.at(i);
1961     double block_freq = s->_freq * loop_freq;
1962     if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY)
1963       block_freq = MIN_BLOCK_FREQUENCY;
1964     s->_freq = block_freq;
1965   }
1966   CFGLoop* ch = _child;
1967   while (ch != NULL) {
1968     ch->scale_freq();
1969     ch = ch->_sibling;
1970   }
1971 }
1972 
1973 // Frequency of the outer loop, or of this loop itself if there are no nested loops
1974 double CFGLoop::outer_loop_freq() const {
1975   if (_child != NULL) {
1976     return _child->_freq;
1977   }
1978   return _freq;
1979 }
1980 
1981 #ifndef PRODUCT
1982 //------------------------------dump_tree--------------------------------------
1983 void CFGLoop::dump_tree() const {
1984   dump();
1985   if (_child != NULL)   _child->dump_tree();
1986   if (_sibling != NULL) _sibling->dump_tree();
1987 }
1988 
1989 //------------------------------dump-------------------------------------------
1990 void CFGLoop::dump() const {
1991   for (int i = 0; i < _depth; i++) tty->print("   ");
1992   tty->print("%s: %d  trip_count: %6.0f freq: %6.0f\n",
1993              _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
1994   for (int i = 0; i < _depth; i++) tty->print("   ");
1995   tty->print("         members:");
1996   int k = 0;
1997   for (int i = 0; i < _members.length(); i++) {
1998     if (k++ >= 6) {
1999       tty->print("\n              ");
2000       for (int j = 0; j < _depth+1; j++) tty->print("   ");
2001       k = 0;
2002     }
2003     CFGElement *s = _members.at(i);
2004     if (s->is_block()) {
2005       Block *b = s->as_Block();
2006       tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
2007     } else {
2008       CFGLoop* lp = s->as_CFGLoop();
2009       tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
2010     }
2011   }
2012   tty->print("\n");
2013   for (int i = 0; i < _depth; i++) tty->print("   ");
2014   tty->print("         exits:  ");
2015   k = 0;
2016   for (int i = 0; i < _exits.length(); i++) {
2017     if (k++ >= 7) {
2018       tty->print("\n              ");
2019       for (int j = 0; j < _depth+1; j++) tty->print("   ");
2020       k = 0;
2021     }
2022     Block *blk = _exits.at(i).get_target();
2023     double prob = _exits.at(i).get_prob();
2024     tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
2025   }
2026   tty->print("\n");
2027 }
2028 #endif