rev 1839 : 6961690: load oops from constant table on SPARC
Summary: oops should be loaded from the constant table of an nmethod instead of materializing them with a long code sequence.
Reviewed-by:
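
The intent of the change, shown as a minimal standalone sketch (not HotSpot code; every name and value below is invented for illustration): on a fixed-width ISA such as SPARC a 64-bit oop constant otherwise has to be pieced together from several small immediates, whereas a per-nmethod constant table lets generated code fetch it with a single base-plus-offset load, which is why the diff below also teaches PhaseCFG about pinned MachConstantBase nodes.

    // Illustrative sketch only -- not HotSpot code. It contrasts the two
    // strategies named in the summary: rebuilding a 64-bit constant from
    // small immediates (the "long code sequence") versus one indexed load
    // from a per-method constant table.
    #include <cstdint>
    #include <cstdio>

    // Stands in for a sethi/or/shift-style sequence: the value is assembled
    // 16 bits at a time because no single instruction can carry a full
    // 64-bit immediate.
    static uint64_t materialize_with_long_sequence() {
      uint64_t v = 0;
      v |= (uint64_t)0x1234 << 48;
      v |= (uint64_t)0x5678 << 32;
      v |= (uint64_t)0x9abc << 16;
      v |= (uint64_t)0xdef0;
      return v;
    }

    // Stands in for "load [constant_table_base + offset]": one load relative
    // to a base register that points at the method's constant table.
    static uint64_t load_from_constant_table(const uint64_t* table_base, int offset) {
      return table_base[offset];
    }

    int main() {
      const uint64_t constant_table[] = { 0x123456789abcdef0ULL };
      std::printf("long sequence: %#llx\n",
                  (unsigned long long)materialize_with_long_sequence());
      std::printf("table load:    %#llx\n",
                  (unsigned long long)load_from_constant_table(constant_table, 0));
      return 0;
    }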
--- old/src/share/vm/opto/gcm.cpp
+++ new/src/share/vm/opto/gcm.cpp
1 1 /*
2 - * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
2 + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 4 *
5 5 * This code is free software; you can redistribute it and/or modify it
6 6 * under the terms of the GNU General Public License version 2 only, as
7 7 * published by the Free Software Foundation.
8 8 *
9 9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 12 * version 2 for more details (a copy is included in the LICENSE file that
13 13 * accompanied this code).
14 14 *
15 15 * You should have received a copy of the GNU General Public License version
16 16 * 2 along with this work; if not, write to the Free Software Foundation,
17 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 18 *
19 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 20 * or visit www.oracle.com if you need additional information or have any
21 21 * questions.
22 22 *
23 23 */
24 24
25 25 // Portions of code courtesy of Clifford Click
26 26
27 27 // Optimization - Graph Style
28 28
29 29 #include "incls/_precompiled.incl"
30 30 #include "incls/_gcm.cpp.incl"
31 31
32 32 // To avoid float value underflow
33 33 #define MIN_BLOCK_FREQUENCY 1.e-35f
34 34
35 35 //----------------------------schedule_node_into_block-------------------------
36 36 // Insert node n into block b. Look for projections of n and make sure they
37 37 // are in b also.
38 38 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
39 39 // Set basic block of n, Add n to b,
40 40 _bbs.map(n->_idx, b);
41 41 b->add_inst(n);
42 42
43 43 // After Matching, nearly any old Node may have projections trailing it.
44 44 // These are usually machine-dependent flags. In any case, they might
45 45 // float to another block below this one. Move them up.
46 46 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
47 47 Node* use = n->fast_out(i);
48 48 if (use->is_Proj()) {
49 49 Block* buse = _bbs[use->_idx];
50 50 if (buse != b) { // In wrong block?
51 51 if (buse != NULL)
52 52 buse->find_remove(use); // Remove from wrong block
53 53 _bbs.map(use->_idx, b); // Re-insert in this block
54 54 b->add_inst(use);
55 55 }
56 56 }
57 57 }
58 58 }
59 59
60 60 //----------------------------replace_block_proj_ctrl-------------------------
61 61 // Nodes that have is_block_proj() nodes as their control need to use
62 62 // the appropriate Region for their actual block as their control since
63 63 // the projection will be in a predecessor block.
64 64 void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
65 65 const Node *in0 = n->in(0);
66 66 assert(in0 != NULL, "Only control-dependent");
67 67 const Node *p = in0->is_block_proj();
68 68 if (p != NULL && p != n) { // Control from a block projection?
69 - assert(!n->pinned() || n->is_SafePointScalarObject(), "only SafePointScalarObject pinned node is expected here");
69 + assert(!n->pinned() || n->is_MachConstantBase() || n->is_SafePointScalarObject(), "only pinned MachConstantBase or SafePointScalarObject node is expected here");
70 70 // Find trailing Region
71 71 Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block
72 72 uint j = 0;
73 73 if (pb->_num_succs != 1) { // More than 1 successor?
74 74 // Search for successor
75 75 uint max = pb->_nodes.size();
76 76 assert( max > 1, "" );
77 77 uint start = max - pb->_num_succs;
78 78 // Find which output path belongs to projection
79 79 for (j = start; j < max; j++) {
80 80 if( pb->_nodes[j] == in0 )
81 81 break;
82 82 }
83 83 assert( j < max, "must find" );
84 84 // Change control to match head of successor basic block
85 85 j -= start;
86 86 }
87 87 n->set_req(0, pb->_succs[j]->head());
88 88 }
89 89 }
90 90
91 91
92 92 //------------------------------schedule_pinned_nodes--------------------------
93 93 // Set the basic block for Nodes pinned into blocks
94 94 void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
95 95 // Allocate node stack of size C->unique()+8 to avoid frequent realloc
96 96 GrowableArray <Node *> spstack(C->unique()+8);
97 97 spstack.push(_root);
98 98 while ( spstack.is_nonempty() ) {
99 99 Node *n = spstack.pop();
100 100 if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited
101 101 if( n->pinned() && !_bbs.lookup(n->_idx) ) { // Pinned? Nail it down!
102 102 assert( n->in(0), "pinned Node must have Control" );
103 103 // Before setting block replace block_proj control edge
104 104 replace_block_proj_ctrl(n);
105 105 Node *input = n->in(0);
106 106 while( !input->is_block_start() )
107 107 input = input->in(0);
108 108 Block *b = _bbs[input->_idx]; // Basic block of controlling input
109 109 schedule_node_into_block(n, b);
110 110 }
111 111 for( int i = n->req() - 1; i >= 0; --i ) { // For all inputs
112 112 if( n->in(i) != NULL )
113 113 spstack.push(n->in(i));
114 114 }
115 115 }
116 116 }
117 117 }
118 118
119 119 #ifdef ASSERT
120 120 // Assert that new input b2 is dominated by all previous inputs.
121 121 // Check this by seeing that it is dominated by b1, the deepest
122 122 // input observed until b2.
123 123 static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) {
124 124 if (b1 == NULL) return;
125 125 assert(b1->_dom_depth < b2->_dom_depth, "sanity");
126 126 Block* tmp = b2;
127 127 while (tmp != b1 && tmp != NULL) {
128 128 tmp = tmp->_idom;
129 129 }
130 130 if (tmp != b1) {
131 131 // Detected an unschedulable graph. Print some nice stuff and die.
132 132 tty->print_cr("!!! Unschedulable graph !!!");
133 133 for (uint j=0; j<n->len(); j++) { // For all inputs
134 134 Node* inn = n->in(j); // Get input
135 135 if (inn == NULL) continue; // Ignore NULL, missing inputs
136 136 Block* inb = bbs[inn->_idx];
137 137 tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
138 138 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
139 139 inn->dump();
140 140 }
141 141 tty->print("Failing node: ");
142 142 n->dump();
143 143 assert(false, "unscheduable graph");
144 144 }
145 145 }
146 146 #endif
147 147
148 148 static Block* find_deepest_input(Node* n, Block_Array &bbs) {
149 149 // Find the last input dominated by all other inputs.
150 150 Block* deepb = NULL; // Deepest block so far
151 151 int deepb_dom_depth = 0;
152 152 for (uint k = 0; k < n->len(); k++) { // For all inputs
153 153 Node* inn = n->in(k); // Get input
154 154 if (inn == NULL) continue; // Ignore NULL, missing inputs
155 155 Block* inb = bbs[inn->_idx];
156 156 assert(inb != NULL, "must already have scheduled this input");
157 157 if (deepb_dom_depth < (int) inb->_dom_depth) {
158 158 // The new inb must be dominated by the previous deepb.
159 159 // The various inputs must be linearly ordered in the dom
160 160 // tree, or else there will not be a unique deepest block.
161 161 DEBUG_ONLY(assert_dom(deepb, inb, n, bbs));
162 162 deepb = inb; // Save deepest block
163 163 deepb_dom_depth = deepb->_dom_depth;
164 164 }
165 165 }
166 166 assert(deepb != NULL, "must be at least one input to n");
167 167 return deepb;
168 168 }
169 169
170 170
171 171 //------------------------------schedule_early---------------------------------
172 172 // Find the earliest Block any instruction can be placed in. Some instructions
173 173 // are pinned into Blocks. Unpinned instructions can appear in the last block in
174 174 // which all their inputs occur.
175 175 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
176 176 // Allocate stack with enough space to avoid frequent realloc
177 177 Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats
178 178 // roots.push(_root); _root will be processed among C->top() inputs
179 179 roots.push(C->top());
180 180 visited.set(C->top()->_idx);
181 181
182 182 while (roots.size() != 0) {
183 183 // Use local variables nstack_top_n & nstack_top_i to cache values
184 184 // on stack's top.
185 185 Node *nstack_top_n = roots.pop();
186 186 uint nstack_top_i = 0;
187 187 //while_nstack_nonempty:
188 188 while (true) {
189 189 // Get parent node and next input's index from stack's top.
190 190 Node *n = nstack_top_n;
191 191 uint i = nstack_top_i;
192 192
193 193 if (i == 0) {
194 194 // Fixup some control. Constants without control get attached
195 195 // to root and nodes that use is_block_proj() nodes should be attached
196 196 // to the region that starts their block.
197 197 const Node *in0 = n->in(0);
198 198 if (in0 != NULL) { // Control-dependent?
199 199 replace_block_proj_ctrl(n);
200 200 } else { // n->in(0) == NULL
201 201 if (n->req() == 1) { // This guy is a constant with NO inputs?
202 202 n->set_req(0, _root);
203 203 }
204 204 }
205 205 }
206 206
207 207 // First, visit all inputs and force them to get a block. If an
208 208 // input is already in a block we quit following inputs (to avoid
209 209 // cycles). Instead we put that Node on a worklist to be handled
210 210 // later (since ITS inputs may not have a block yet).
211 211 bool done = true; // Assume all n's inputs will be processed
212 212 while (i < n->len()) { // For all inputs
213 213 Node *in = n->in(i); // Get input
214 214 ++i;
215 215 if (in == NULL) continue; // Ignore NULL, missing inputs
216 216 int is_visited = visited.test_set(in->_idx);
217 217 if (!_bbs.lookup(in->_idx)) { // Missing block selection?
218 218 if (is_visited) {
219 219 // assert( !visited.test(in->_idx), "did not schedule early" );
220 220 return false;
221 221 }
222 222 nstack.push(n, i); // Save parent node and next input's index.
223 223 nstack_top_n = in; // Process current input now.
224 224 nstack_top_i = 0;
225 225 done = false; // Not all n's inputs processed.
226 226 break; // continue while_nstack_nonempty;
227 227 } else if (!is_visited) { // Input not yet visited?
228 228 roots.push(in); // Visit this guy later, using worklist
229 229 }
230 230 }
231 231 if (done) {
232 232 // All of n's inputs have been processed, complete post-processing.
233 233
234 234 // Some instructions are pinned into a block. These include Region,
235 235 // Phi, Start, Return, and other control-dependent instructions and
236 236 // any projections which depend on them.
237 237 if (!n->pinned()) {
238 238 // Set earliest legal block.
239 239 _bbs.map(n->_idx, find_deepest_input(n, _bbs));
240 240 } else {
241 241 assert(_bbs[n->_idx] == _bbs[n->in(0)->_idx], "Pinned Node should be at the same block as its control edge");
242 242 }
243 243
244 244 if (nstack.is_empty()) {
245 245 // Finished all nodes on stack.
246 246 // Process next node on the worklist 'roots'.
247 247 break;
248 248 }
249 249 // Get saved parent node and next input's index.
250 250 nstack_top_n = nstack.node();
251 251 nstack_top_i = nstack.index();
252 252 nstack.pop();
253 253 } // if (done)
254 254 } // while (true)
255 255 } // while (roots.size() != 0)
256 256 return true;
257 257 }
258 258
259 259 //------------------------------dom_lca----------------------------------------
260 260 // Find least common ancestor in dominator tree
261 261 // LCA is a current notion of LCA, to be raised above 'this'.
262 262 // As a convenient boundary condition, return 'this' if LCA is NULL.
263 263 // Find the LCA of those two nodes.
264 264 Block* Block::dom_lca(Block* LCA) {
265 265 if (LCA == NULL || LCA == this) return this;
266 266
267 267 Block* anc = this;
268 268 while (anc->_dom_depth > LCA->_dom_depth)
269 269 anc = anc->_idom; // Walk up till anc is as high as LCA
270 270
271 271 while (LCA->_dom_depth > anc->_dom_depth)
272 272 LCA = LCA->_idom; // Walk up till LCA is as high as anc
273 273
274 274 while (LCA != anc) { // Walk both up till they are the same
275 275 LCA = LCA->_idom;
276 276 anc = anc->_idom;
277 277 }
278 278
279 279 return LCA;
280 280 }
281 281
282 282 //--------------------------raise_LCA_above_use--------------------------------
283 283 // We are placing a definition, and have been given a def->use edge.
284 284 // The definition must dominate the use, so move the LCA upward in the
285 285 // dominator tree to dominate the use. If the use is a phi, adjust
286 286 // the LCA only with the phi input paths which actually use this def.
287 287 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) {
288 288 Block* buse = bbs[use->_idx];
289 289 if (buse == NULL) return LCA; // Unused killing Projs have no use block
290 290 if (!use->is_Phi()) return buse->dom_lca(LCA);
291 291 uint pmax = use->req(); // Number of Phi inputs
292 292 // Why does not this loop just break after finding the matching input to
293 293 // the Phi? Well...it's like this. I do not have true def-use/use-def
294 294 // chains. Means I cannot distinguish, from the def-use direction, which
295 295 // of many use-defs lead from the same use to the same def. That is, this
296 296 // Phi might have several uses of the same def. Each use appears in a
297 297 // different predecessor block. But when I enter here, I cannot distinguish
298 298 // which use-def edge I should find the predecessor block for. So I find
299 299 // them all. Means I do a little extra work if a Phi uses the same value
300 300 // more than once.
301 301 for (uint j=1; j<pmax; j++) { // For all inputs
302 302 if (use->in(j) == def) { // Found matching input?
303 303 Block* pred = bbs[buse->pred(j)->_idx];
304 304 LCA = pred->dom_lca(LCA);
305 305 }
306 306 }
307 307 return LCA;
308 308 }
309 309
310 310 //----------------------------raise_LCA_above_marks----------------------------
311 311 // Return a new LCA that dominates LCA and any of its marked predecessors.
312 312 // Search all my parents up to 'early' (exclusive), looking for predecessors
313 313 // which are marked with the given index. Return the LCA (in the dom tree)
314 314 // of all marked blocks. If there are none marked, return the original
315 315 // LCA.
316 316 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark,
317 317 Block* early, Block_Array &bbs) {
318 318 Block_List worklist;
319 319 worklist.push(LCA);
320 320 while (worklist.size() > 0) {
321 321 Block* mid = worklist.pop();
322 322 if (mid == early) continue; // stop searching here
323 323
324 324 // Test and set the visited bit.
325 325 if (mid->raise_LCA_visited() == mark) continue; // already visited
326 326
327 327 // Don't process the current LCA, otherwise the search may terminate early
328 328 if (mid != LCA && mid->raise_LCA_mark() == mark) {
329 329 // Raise the LCA.
330 330 LCA = mid->dom_lca(LCA);
331 331 if (LCA == early) break; // stop searching everywhere
332 332 assert(early->dominates(LCA), "early is high enough");
333 333 // Resume searching at that point, skipping intermediate levels.
334 334 worklist.push(LCA);
335 335 if (LCA == mid)
336 336 continue; // Don't mark as visited to avoid early termination.
337 337 } else {
338 338 // Keep searching through this block's predecessors.
339 339 for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
340 340 Block* mid_parent = bbs[ mid->pred(j)->_idx ];
341 341 worklist.push(mid_parent);
342 342 }
343 343 }
344 344 mid->set_raise_LCA_visited(mark);
345 345 }
346 346 return LCA;
347 347 }
348 348
349 349 //--------------------------memory_early_block--------------------------------
350 350 // This is a variation of find_deepest_input, the heart of schedule_early.
351 351 // Find the "early" block for a load, if we considered only memory and
352 352 // address inputs, that is, if other data inputs were ignored.
353 353 //
354 354 // Because a subset of edges are considered, the resulting block will
355 355 // be earlier (at a shallower dom_depth) than the true schedule_early
356 356 // point of the node. We compute this earlier block as a more permissive
357 357 // site for anti-dependency insertion, but only if subsume_loads is enabled.
358 358 static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) {
359 359 Node* base;
360 360 Node* index;
361 361 Node* store = load->in(MemNode::Memory);
362 362 load->as_Mach()->memory_inputs(base, index);
363 363
364 364 assert(base != NodeSentinel && index != NodeSentinel,
365 365 "unexpected base/index inputs");
366 366
367 367 Node* mem_inputs[4];
368 368 int mem_inputs_length = 0;
369 369 if (base != NULL) mem_inputs[mem_inputs_length++] = base;
370 370 if (index != NULL) mem_inputs[mem_inputs_length++] = index;
371 371 if (store != NULL) mem_inputs[mem_inputs_length++] = store;
372 372
373 373 // In the comparison below, add one to account for the control input,
374 374 // which may be null, but always takes up a spot in the in array.
375 375 if (mem_inputs_length + 1 < (int) load->req()) {
376 376 // This "load" has more inputs than just the memory, base and index inputs.
377 377 // For purposes of checking anti-dependences, we need to start
378 378 // from the early block of only the address portion of the instruction,
379 379 // and ignore other blocks that may have factored into the wider
380 380 // schedule_early calculation.
381 381 if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
382 382
383 383 Block* deepb = NULL; // Deepest block so far
384 384 int deepb_dom_depth = 0;
385 385 for (int i = 0; i < mem_inputs_length; i++) {
386 386 Block* inb = bbs[mem_inputs[i]->_idx];
387 387 if (deepb_dom_depth < (int) inb->_dom_depth) {
388 388 // The new inb must be dominated by the previous deepb.
389 389 // The various inputs must be linearly ordered in the dom
390 390 // tree, or else there will not be a unique deepest block.
391 391 DEBUG_ONLY(assert_dom(deepb, inb, load, bbs));
392 392 deepb = inb; // Save deepest block
393 393 deepb_dom_depth = deepb->_dom_depth;
394 394 }
395 395 }
396 396 early = deepb;
397 397 }
398 398
399 399 return early;
400 400 }
401 401
402 402 //--------------------------insert_anti_dependences---------------------------
403 403 // A load may need to witness memory that nearby stores can overwrite.
404 404 // For each nearby store, either insert an "anti-dependence" edge
405 405 // from the load to the store, or else move LCA upward to force the
406 406 // load to (eventually) be scheduled in a block above the store.
407 407 //
408 408 // Do not add edges to stores on distinct control-flow paths;
409 409 // only add edges to stores which might interfere.
410 410 //
411 411 // Return the (updated) LCA. There will not be any possibly interfering
412 412 // store between the load's "early block" and the updated LCA.
413 413 // Any stores in the updated LCA will have new precedence edges
414 414 // back to the load. The caller is expected to schedule the load
415 415 // in the LCA, in which case the precedence edges will make LCM
416 416 // preserve anti-dependences. The caller may also hoist the load
417 417 // above the LCA, if it is not the early block.
418 418 Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
419 419 assert(load->needs_anti_dependence_check(), "must be a load of some sort");
420 420 assert(LCA != NULL, "");
421 421 DEBUG_ONLY(Block* LCA_orig = LCA);
422 422
423 423 // Compute the alias index. Loads and stores with different alias indices
424 424 // do not need anti-dependence edges.
425 425 uint load_alias_idx = C->get_alias_index(load->adr_type());
426 426 #ifdef ASSERT
427 427 if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
428 428 (PrintOpto || VerifyAliases ||
429 429 PrintMiscellaneous && (WizardMode || Verbose))) {
430 430 // Load nodes should not consume all of memory.
431 431 // Reporting a bottom type indicates a bug in adlc.
432 432 // If some particular type of node validly consumes all of memory,
433 433 // sharpen the preceding "if" to exclude it, so we can catch bugs here.
434 434 tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory.");
435 435 load->dump(2);
436 436 if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "");
437 437 }
438 438 #endif
439 439 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
440 440 "String compare is only known 'load' that does not conflict with any stores");
441 441 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
442 442 "String equals is a 'load' that does not conflict with any stores");
443 443 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
444 444 "String indexOf is a 'load' that does not conflict with any stores");
445 445 assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
446 446 "Arrays equals is a 'load' that do not conflict with any stores");
447 447
448 448 if (!C->alias_type(load_alias_idx)->is_rewritable()) {
449 449 // It is impossible to spoil this load by putting stores before it,
450 450 // because we know that the stores will never update the value
451 451 // which 'load' must witness.
452 452 return LCA;
453 453 }
454 454
455 455 node_idx_t load_index = load->_idx;
456 456
457 457 // Note the earliest legal placement of 'load', as determined by
458 458 // by the unique point in the dom tree where all memory effects
459 459 // and other inputs are first available. (Computed by schedule_early.)
460 460 // For normal loads, 'early' is the shallowest place (dom graph wise)
461 461 // to look for anti-deps between this load and any store.
462 462 Block* early = _bbs[load_index];
463 463
464 464 // If we are subsuming loads, compute an "early" block that only considers
465 465 // memory or address inputs. This block may be different than the
466 466 // schedule_early block in that it could be at an even shallower depth in the
467 467 // dominator tree, and allow for a broader discovery of anti-dependences.
468 468 if (C->subsume_loads()) {
469 469 early = memory_early_block(load, early, _bbs);
470 470 }
471 471
472 472 ResourceArea *area = Thread::current()->resource_area();
473 473 Node_List worklist_mem(area); // prior memory state to store
474 474 Node_List worklist_store(area); // possible-def to explore
475 475 Node_List worklist_visited(area); // visited mergemem nodes
476 476 Node_List non_early_stores(area); // all relevant stores outside of early
477 477 bool must_raise_LCA = false;
478 478
479 479 #ifdef TRACK_PHI_INPUTS
480 480 // %%% This extra checking fails because MergeMem nodes are not GVNed.
481 481 // Provide "phi_inputs" to check if every input to a PhiNode is from the
482 482 // original memory state. This indicates a PhiNode which should not
483 483 // prevent the load from sinking. For such a block, set_raise_LCA_mark
484 484 // may be overly conservative.
485 485 // Mechanism: count inputs seen for each Phi encountered in worklist_store.
486 486 DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
487 487 #endif
488 488
489 489 // 'load' uses some memory state; look for users of the same state.
490 490 // Recurse through MergeMem nodes to the stores that use them.
491 491
492 492 // Each of these stores is a possible definition of memory
493 493 // that 'load' needs to use. We need to force 'load'
494 494 // to occur before each such store. When the store is in
495 495 // the same block as 'load', we insert an anti-dependence
496 496 // edge load->store.
497 497
498 498 // The relevant stores "nearby" the load consist of a tree rooted
499 499 // at initial_mem, with internal nodes of type MergeMem.
500 500 // Therefore, the branches visited by the worklist are of this form:
501 501 // initial_mem -> (MergeMem ->)* store
502 502 // The anti-dependence constraints apply only to the fringe of this tree.
503 503
504 504 Node* initial_mem = load->in(MemNode::Memory);
505 505 worklist_store.push(initial_mem);
506 506 worklist_visited.push(initial_mem);
507 507 worklist_mem.push(NULL);
508 508 while (worklist_store.size() > 0) {
509 509 // Examine a nearby store to see if it might interfere with our load.
510 510 Node* mem = worklist_mem.pop();
511 511 Node* store = worklist_store.pop();
512 512 uint op = store->Opcode();
513 513
514 514 // MergeMems do not directly have anti-deps.
515 515 // Treat them as internal nodes in a forward tree of memory states,
516 516 // the leaves of which are each a 'possible-def'.
517 517 if (store == initial_mem // root (exclusive) of tree we are searching
518 518 || op == Op_MergeMem // internal node of tree we are searching
519 519 ) {
520 520 mem = store; // It's not a possibly interfering store.
521 521 if (store == initial_mem)
522 522 initial_mem = NULL; // only process initial memory once
523 523
524 524 for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
525 525 store = mem->fast_out(i);
526 526 if (store->is_MergeMem()) {
527 527 // Be sure we don't get into combinatorial problems.
528 528 // (Allow phis to be repeated; they can merge two relevant states.)
529 529 uint j = worklist_visited.size();
530 530 for (; j > 0; j--) {
531 531 if (worklist_visited.at(j-1) == store) break;
532 532 }
533 533 if (j > 0) continue; // already on work list; do not repeat
534 534 worklist_visited.push(store);
535 535 }
536 536 worklist_mem.push(mem);
537 537 worklist_store.push(store);
538 538 }
539 539 continue;
540 540 }
541 541
542 542 if (op == Op_MachProj || op == Op_Catch) continue;
543 543 if (store->needs_anti_dependence_check()) continue; // not really a store
544 544
545 545 // Compute the alias index. Loads and stores with different alias
546 546 // indices do not need anti-dependence edges. Wide MemBar's are
547 547 // anti-dependent on everything (except immutable memories).
548 548 const TypePtr* adr_type = store->adr_type();
549 549 if (!C->can_alias(adr_type, load_alias_idx)) continue;
550 550
551 551 // Most slow-path runtime calls do NOT modify Java memory, but
552 552 // they can block and so write Raw memory.
553 553 if (store->is_Mach()) {
554 554 MachNode* mstore = store->as_Mach();
555 555 if (load_alias_idx != Compile::AliasIdxRaw) {
556 556 // Check for call into the runtime using the Java calling
557 557 // convention (and from there into a wrapper); it has no
558 558 // _method. Can't do this optimization for Native calls because
559 559 // they CAN write to Java memory.
560 560 if (mstore->ideal_Opcode() == Op_CallStaticJava) {
561 561 assert(mstore->is_MachSafePoint(), "");
562 562 MachSafePointNode* ms = (MachSafePointNode*) mstore;
563 563 assert(ms->is_MachCallJava(), "");
564 564 MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
565 565 if (mcj->_method == NULL) {
566 566 // These runtime calls do not write to Java visible memory
567 567 // (other than Raw) and so do not require anti-dependence edges.
568 568 continue;
569 569 }
570 570 }
571 571 // Same for SafePoints: they read/write Raw but only read otherwise.
572 572 // This is basically a workaround for SafePoints only defining control
573 573 // instead of control + memory.
574 574 if (mstore->ideal_Opcode() == Op_SafePoint)
575 575 continue;
576 576 } else {
577 577 // Some raw memory, such as the load of "top" at an allocation,
578 578 // can be control dependent on the previous safepoint. See
579 579 // comments in GraphKit::allocate_heap() about control input.
580 580 // Inserting an anti-dep between such a safepoint and a use
581 581 // creates a cycle, and will cause a subsequent failure in
582 582 // local scheduling. (BugId 4919904)
583 583 // (%%% How can a control input be a safepoint and not a projection??)
584 584 if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
585 585 continue;
586 586 }
587 587 }
588 588
589 589 // Identify a block that the current load must be above,
590 590 // or else observe that 'store' is all the way up in the
591 591 // earliest legal block for 'load'. In the latter case,
592 592 // immediately insert an anti-dependence edge.
593 593 Block* store_block = _bbs[store->_idx];
594 594 assert(store_block != NULL, "unused killing projections skipped above");
595 595
596 596 if (store->is_Phi()) {
597 597 // 'load' uses memory which is one (or more) of the Phi's inputs.
598 598 // It must be scheduled not before the Phi, but rather before
599 599 // each of the relevant Phi inputs.
600 600 //
601 601 // Instead of finding the LCA of all inputs to a Phi that match 'mem',
602 602 // we mark each corresponding predecessor block and do a combined
603 603 // hoisting operation later (raise_LCA_above_marks).
604 604 //
605 605 // Do not assert(store_block != early, "Phi merging memory after access")
606 606 // PhiNode may be at start of block 'early' with backedge to 'early'
607 607 DEBUG_ONLY(bool found_match = false);
608 608 for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
609 609 if (store->in(j) == mem) { // Found matching input?
610 610 DEBUG_ONLY(found_match = true);
611 611 Block* pred_block = _bbs[store_block->pred(j)->_idx];
612 612 if (pred_block != early) {
613 613 // If any predecessor of the Phi matches the load's "early block",
614 614 // we do not need a precedence edge between the Phi and 'load'
615 615 // since the load will be forced into a block preceding the Phi.
616 616 pred_block->set_raise_LCA_mark(load_index);
617 617 assert(!LCA_orig->dominates(pred_block) ||
618 618 early->dominates(pred_block), "early is high enough");
619 619 must_raise_LCA = true;
620 620 } else {
621 621 // anti-dependent upon PHI pinned below 'early', no edge needed
622 622 LCA = early; // but can not schedule below 'early'
623 623 }
624 624 }
625 625 }
626 626 assert(found_match, "no worklist bug");
627 627 #ifdef TRACK_PHI_INPUTS
628 628 #ifdef ASSERT
629 629 // This assert asks about correct handling of PhiNodes, which may not
630 630 // have all input edges directly from 'mem'. See BugId 4621264
631 631 int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
632 632 // Increment by exactly one even if there are multiple copies of 'mem'
633 633 // coming into the phi, because we will run this block several times
634 634 // if there are several copies of 'mem'. (That's how DU iterators work.)
635 635 phi_inputs.at_put(store->_idx, num_mem_inputs);
636 636 assert(PhiNode::Input + num_mem_inputs < store->req(),
637 637 "Expect at least one phi input will not be from original memory state");
638 638 #endif //ASSERT
639 639 #endif //TRACK_PHI_INPUTS
640 640 } else if (store_block != early) {
641 641 // 'store' is between the current LCA and earliest possible block.
642 642 // Label its block, and decide later on how to raise the LCA
643 643 // to include the effect on LCA of this store.
644 644 // If this store's block gets chosen as the raised LCA, we
645 645 // will find him on the non_early_stores list and stick him
646 646 // with a precedence edge.
647 647 // (But, don't bother if LCA is already raised all the way.)
648 648 if (LCA != early) {
649 649 store_block->set_raise_LCA_mark(load_index);
650 650 must_raise_LCA = true;
651 651 non_early_stores.push(store);
652 652 }
653 653 } else {
654 654 // Found a possibly-interfering store in the load's 'early' block.
655 655 // This means 'load' cannot sink at all in the dominator tree.
656 656 // Add an anti-dep edge, and squeeze 'load' into the highest block.
657 657 assert(store != load->in(0), "dependence cycle found");
658 658 if (verify) {
659 659 assert(store->find_edge(load) != -1, "missing precedence edge");
660 660 } else {
661 661 store->add_prec(load);
662 662 }
663 663 LCA = early;
664 664 // This turns off the process of gathering non_early_stores.
665 665 }
666 666 }
667 667 // (Worklist is now empty; all nearby stores have been visited.)
668 668
669 669 // Finished if 'load' must be scheduled in its 'early' block.
670 670 // If we found any stores there, they have already been given
671 671 // precedence edges.
672 672 if (LCA == early) return LCA;
673 673
674 674 // We get here only if there are no possibly-interfering stores
675 675 // in the load's 'early' block. Move LCA up above all predecessors
676 676 // which contain stores we have noted.
677 677 //
678 678 // The raised LCA block can be a home to such interfering stores,
679 679 // but its predecessors must not contain any such stores.
680 680 //
681 681 // The raised LCA will be a lower bound for placing the load,
682 682 // preventing the load from sinking past any block containing
683 683 // a store that may invalidate the memory state required by 'load'.
684 684 if (must_raise_LCA)
685 685 LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs);
686 686 if (LCA == early) return LCA;
687 687
688 688 // Insert anti-dependence edges from 'load' to each store
689 689 // in the non-early LCA block.
690 690 // Mine the non_early_stores list for such stores.
691 691 if (LCA->raise_LCA_mark() == load_index) {
692 692 while (non_early_stores.size() > 0) {
693 693 Node* store = non_early_stores.pop();
694 694 Block* store_block = _bbs[store->_idx];
695 695 if (store_block == LCA) {
696 696 // add anti_dependence from store to load in its own block
697 697 assert(store != load->in(0), "dependence cycle found");
698 698 if (verify) {
699 699 assert(store->find_edge(load) != -1, "missing precedence edge");
700 700 } else {
701 701 store->add_prec(load);
702 702 }
703 703 } else {
704 704 assert(store_block->raise_LCA_mark() == load_index, "block was marked");
705 705 // Any other stores we found must be either inside the new LCA
706 706 // or else outside the original LCA. In the latter case, they
707 707 // did not interfere with any use of 'load'.
708 708 assert(LCA->dominates(store_block)
709 709 || !LCA_orig->dominates(store_block), "no stray stores");
710 710 }
711 711 }
712 712 }
713 713
714 714 // Return the highest block containing stores; any stores
715 715 // within that block have been given anti-dependence edges.
716 716 return LCA;
717 717 }
718 718
719 719 // This class is used to iterate backwards over the nodes in the graph.
720 720
721 721 class Node_Backward_Iterator {
722 722
723 723 private:
724 724 Node_Backward_Iterator();
725 725
726 726 public:
727 727 // Constructor for the iterator
728 728 Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs);
729 729
730 730 // Postincrement operator to iterate over the nodes
731 731 Node *next();
732 732
733 733 private:
734 734 VectorSet &_visited;
735 735 Node_List &_stack;
736 736 Block_Array &_bbs;
737 737 };
738 738
739 739 // Constructor for the Node_Backward_Iterator
740 740 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs )
741 741 : _visited(visited), _stack(stack), _bbs(bbs) {
742 742 // The stack should contain exactly the root
743 743 stack.clear();
744 744 stack.push(root);
745 745
746 746 // Clear the visited bits
747 747 visited.Clear();
748 748 }
749 749
750 750 // Iterator for the Node_Backward_Iterator
751 751 Node *Node_Backward_Iterator::next() {
752 752
753 753 // If the _stack is empty, then just return NULL: finished.
754 754 if ( !_stack.size() )
755 755 return NULL;
756 756
757 757 // '_stack' is emulating a real _stack. The 'visit-all-users' loop has been
758 758 // made stateless, so I do not need to record the index 'i' on my _stack.
759 759 // Instead I visit all users each time, scanning for unvisited users.
760 760 // I visit unvisited not-anti-dependence users first, then anti-dependent
761 761 // children next.
762 762 Node *self = _stack.pop();
763 763
764 764 // I cycle here when I am entering a deeper level of recursion.
765 765 // The key variable 'self' was set prior to jumping here.
766 766 while( 1 ) {
767 767
768 768 _visited.set(self->_idx);
769 769
770 770 // Now schedule all uses as late as possible.
771 771 uint src = self->is_Proj() ? self->in(0)->_idx : self->_idx;
772 772 uint src_rpo = _bbs[src]->_rpo;
773 773
774 774 // Schedule all nodes in a post-order visit
775 775 Node *unvisited = NULL; // Unvisited anti-dependent Node, if any
776 776
777 777 // Scan for unvisited nodes
778 778 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
779 779 // For all uses, schedule late
780 780 Node* n = self->fast_out(i); // Use
781 781
782 782 // Skip already visited children
783 783 if ( _visited.test(n->_idx) )
784 784 continue;
785 785
786 786 // do not traverse backward control edges
787 787 Node *use = n->is_Proj() ? n->in(0) : n;
788 788 uint use_rpo = _bbs[use->_idx]->_rpo;
789 789
790 790 if ( use_rpo < src_rpo )
791 791 continue;
792 792
793 793 // Phi nodes always precede uses in a basic block
794 794 if ( use_rpo == src_rpo && use->is_Phi() )
795 795 continue;
796 796
797 797 unvisited = n; // Found unvisited
798 798
799 799 // Check for possible-anti-dependent
800 800 if( !n->needs_anti_dependence_check() )
801 801 break; // Not visited, not anti-dep; schedule it NOW
802 802 }
803 803
804 804 // Did I find an unvisited not-anti-dependent Node?
805 805 if ( !unvisited )
806 806 break; // All done with children; post-visit 'self'
807 807
808 808 // Visit the unvisited Node. Contains the obvious push to
809 809 // indicate I'm entering a deeper level of recursion. I push the
810 810 // old state onto the _stack and set a new state and loop (recurse).
811 811 _stack.push(self);
812 812 self = unvisited;
813 813 } // End recursion loop
814 814
815 815 return self;
816 816 }
817 817
818 818 //------------------------------ComputeLatenciesBackwards----------------------
819 819 // Compute the latency of all the instructions.
820 820 void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
821 821 #ifndef PRODUCT
822 822 if (trace_opto_pipelining())
823 823 tty->print("\n#---- ComputeLatenciesBackwards ----\n");
824 824 #endif
825 825
826 826 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
827 827 Node *n;
828 828
829 829 // Walk over all the nodes from last to first
830 830 while (n = iter.next()) {
831 831 // Set the latency for the definitions of this instruction
832 832 partial_latency_of_defs(n);
833 833 }
834 834 } // end ComputeLatenciesBackwards
835 835
836 836 //------------------------------partial_latency_of_defs------------------------
837 837 // Compute the latency impact of this node on all defs. This computes
838 838 // a number that increases as we approach the beginning of the routine.
839 839 void PhaseCFG::partial_latency_of_defs(Node *n) {
840 840 // Set the latency for this instruction
841 841 #ifndef PRODUCT
842 842 if (trace_opto_pipelining()) {
843 843 tty->print("# latency_to_inputs: node_latency[%d] = %d for node",
844 844 n->_idx, _node_latency->at_grow(n->_idx));
845 845 dump();
846 846 }
847 847 #endif
848 848
849 849 if (n->is_Proj())
850 850 n = n->in(0);
851 851
852 852 if (n->is_Root())
853 853 return;
854 854
855 855 uint nlen = n->len();
856 856 uint use_latency = _node_latency->at_grow(n->_idx);
857 857 uint use_pre_order = _bbs[n->_idx]->_pre_order;
858 858
859 859 for ( uint j=0; j<nlen; j++ ) {
860 860 Node *def = n->in(j);
861 861
862 862 if (!def || def == n)
863 863 continue;
864 864
865 865 // Walk backwards thru projections
866 866 if (def->is_Proj())
867 867 def = def->in(0);
868 868
869 869 #ifndef PRODUCT
870 870 if (trace_opto_pipelining()) {
871 871 tty->print("# in(%2d): ", j);
872 872 def->dump();
873 873 }
874 874 #endif
875 875
876 876 // If the defining block is not known, assume it is ok
877 877 Block *def_block = _bbs[def->_idx];
878 878 uint def_pre_order = def_block ? def_block->_pre_order : 0;
879 879
880 880 if ( (use_pre_order < def_pre_order) ||
881 881 (use_pre_order == def_pre_order && n->is_Phi()) )
882 882 continue;
883 883
884 884 uint delta_latency = n->latency(j);
885 885 uint current_latency = delta_latency + use_latency;
886 886
887 887 if (_node_latency->at_grow(def->_idx) < current_latency) {
888 888 _node_latency->at_put_grow(def->_idx, current_latency);
889 889 }
890 890
891 891 #ifndef PRODUCT
892 892 if (trace_opto_pipelining()) {
893 893 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
894 894 use_latency, j, delta_latency, current_latency, def->_idx,
895 895 _node_latency->at_grow(def->_idx));
896 896 }
897 897 #endif
898 898 }
899 899 }
900 900
901 901 //------------------------------latency_from_use-------------------------------
902 902 // Compute the latency of a specific use
903 903 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
904 904 // If self-reference, return no latency
905 905 if (use == n || use->is_Root())
906 906 return 0;
907 907
908 908 uint def_pre_order = _bbs[def->_idx]->_pre_order;
909 909 uint latency = 0;
910 910
911 911 // If the use is not a projection, then it is simple...
912 912 if (!use->is_Proj()) {
913 913 #ifndef PRODUCT
914 914 if (trace_opto_pipelining()) {
915 915 tty->print("# out(): ");
916 916 use->dump();
917 917 }
918 918 #endif
919 919
920 920 uint use_pre_order = _bbs[use->_idx]->_pre_order;
921 921
922 922 if (use_pre_order < def_pre_order)
923 923 return 0;
924 924
925 925 if (use_pre_order == def_pre_order && use->is_Phi())
926 926 return 0;
927 927
928 928 uint nlen = use->len();
929 929 uint nl = _node_latency->at_grow(use->_idx);
930 930
931 931 for ( uint j=0; j<nlen; j++ ) {
932 932 if (use->in(j) == n) {
933 933 // Change this if we want local latencies
934 934 uint ul = use->latency(j);
935 935 uint l = ul + nl;
936 936 if (latency < l) latency = l;
937 937 #ifndef PRODUCT
938 938 if (trace_opto_pipelining()) {
939 939 tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d",
940 940 nl, j, ul, l, latency);
941 941 }
942 942 #endif
943 943 }
944 944 }
945 945 } else {
946 946 // This is a projection, just grab the latency of the use(s)
947 947 for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
948 948 uint l = latency_from_use(use, def, use->fast_out(j));
949 949 if (latency < l) latency = l;
950 950 }
951 951 }
952 952
953 953 return latency;
954 954 }
955 955
956 956 //------------------------------latency_from_uses------------------------------
957 957 // Compute the latency of this instruction relative to all of its uses.
958 958 // This computes a number that increases as we approach the beginning of the
959 959 // routine.
960 960 void PhaseCFG::latency_from_uses(Node *n) {
961 961 // Set the latency for this instruction
962 962 #ifndef PRODUCT
963 963 if (trace_opto_pipelining()) {
964 964 tty->print("# latency_from_outputs: node_latency[%d] = %d for node",
965 965 n->_idx, _node_latency->at_grow(n->_idx));
966 966 dump();
967 967 }
968 968 #endif
969 969 uint latency=0;
970 970 const Node *def = n->is_Proj() ? n->in(0): n;
971 971
972 972 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
973 973 uint l = latency_from_use(n, def, n->fast_out(i));
974 974
975 975 if (latency < l) latency = l;
976 976 }
977 977
978 978 _node_latency->at_put_grow(n->_idx, latency);
979 979 }
980 980
981 981 //------------------------------hoist_to_cheaper_block-------------------------
982 982 // Pick a block for node self, between early and LCA, that is a cheaper
983 983 // alternative to LCA.
984 984 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
985 985 const double delta = 1+PROB_UNLIKELY_MAG(4);
986 986 Block* least = LCA;
987 987 double least_freq = least->_freq;
988 988 uint target = _node_latency->at_grow(self->_idx);
989 989 uint start_latency = _node_latency->at_grow(LCA->_nodes[0]->_idx);
990 990 uint end_latency = _node_latency->at_grow(LCA->_nodes[LCA->end_idx()]->_idx);
991 991 bool in_latency = (target <= start_latency);
992 992 const Block* root_block = _bbs[_root->_idx];
993 993
994 994 // Turn off latency scheduling if scheduling is just plain off
995 995 if (!C->do_scheduling())
996 996 in_latency = true;
997 997
998 998 // Do not hoist (to cover latency) instructions which target a
999 999 // single register. Hoisting stretches the live range of the
1000 1000 // single register and may force spilling.
1001 1001 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1002 1002 if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1003 1003 in_latency = true;
1004 1004
1005 1005 #ifndef PRODUCT
1006 1006 if (trace_opto_pipelining()) {
1007 1007 tty->print("# Find cheaper block for latency %d: ",
1008 1008 _node_latency->at_grow(self->_idx));
1009 1009 self->dump();
1010 1010 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1011 1011 LCA->_pre_order,
1012 1012 LCA->_nodes[0]->_idx,
1013 1013 start_latency,
1014 1014 LCA->_nodes[LCA->end_idx()]->_idx,
1015 1015 end_latency,
1016 1016 least_freq);
1017 1017 }
1018 1018 #endif
1019 1019
1020 1020 // Walk up the dominator tree from LCA (Lowest common ancestor) to
1021 1021 // the earliest legal location. Capture the least execution frequency.
1022 1022 while (LCA != early) {
1023 1023 LCA = LCA->_idom; // Follow up the dominator tree
1024 1024
1025 1025 if (LCA == NULL) {
1026 1026 // Bailout without retry
1027 1027 C->record_method_not_compilable("late schedule failed: LCA == NULL");
1028 1028 return least;
1029 1029 }
1030 1030
1031 1031 // Don't hoist machine instructions to the root basic block
1032 1032 if (mach && LCA == root_block)
1033 1033 break;
1034 1034
1035 1035 uint start_lat = _node_latency->at_grow(LCA->_nodes[0]->_idx);
1036 1036 uint end_idx = LCA->end_idx();
1037 1037 uint end_lat = _node_latency->at_grow(LCA->_nodes[end_idx]->_idx);
1038 1038 double LCA_freq = LCA->_freq;
1039 1039 #ifndef PRODUCT
1040 1040 if (trace_opto_pipelining()) {
1041 1041 tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1042 1042 LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq);
1043 1043 }
1044 1044 #endif
1045 1045 if (LCA_freq < least_freq || // Better Frequency
1046 1046 ( !in_latency && // No block containing latency
1047 1047 LCA_freq < least_freq * delta && // No worse frequency
1048 1048 target >= end_lat && // within latency range
1049 1049 !self->is_iteratively_computed() ) // But don't hoist IV increments
1050 1050 // because they may end up above other uses of their phi forcing
1051 1051 // their result register to be different from their input.
1052 1052 ) {
1053 1053 least = LCA; // Found cheaper block
1054 1054 least_freq = LCA_freq;
1055 1055 start_latency = start_lat;
1056 1056 end_latency = end_lat;
1057 1057 if (target <= start_lat)
1058 1058 in_latency = true;
1059 1059 }
1060 1060 }
1061 1061
1062 1062 #ifndef PRODUCT
1063 1063 if (trace_opto_pipelining()) {
1064 1064 tty->print_cr("# Choose block B%d with start latency=%d and freq=%g",
1065 1065 least->_pre_order, start_latency, least_freq);
1066 1066 }
1067 1067 #endif
1068 1068
1069 1069 // See if the latency needs to be updated
1070 1070 if (target < end_latency) {
1071 1071 #ifndef PRODUCT
1072 1072 if (trace_opto_pipelining()) {
1073 1073 tty->print_cr("# Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1074 1074 }
1075 1075 #endif
1076 1076 _node_latency->at_put_grow(self->_idx, end_latency);
1077 1077 partial_latency_of_defs(self);
1078 1078 }
1079 1079
1080 1080 return least;
1081 1081 }
1082 1082
1083 1083
1084 1084 //------------------------------schedule_late-----------------------------------
1085 1085 // Now schedule all codes as LATE as possible. This is the LCA in the
1086 1086 // dominator tree of all USES of a value. Pick the block with the least
1087 1087 // loop nesting depth that is lowest in the dominator tree.
1088 1088 extern const char must_clone[];
1089 1089 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1090 1090 #ifndef PRODUCT
1091 1091 if (trace_opto_pipelining())
1092 1092 tty->print("\n#---- schedule_late ----\n");
1093 1093 #endif
1094 1094
1095 1095 Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
1096 1096 Node *self;
1097 1097
1098 1098 // Walk over all the nodes from last to first
1099 1099 while (self = iter.next()) {
1100 1100 Block* early = _bbs[self->_idx]; // Earliest legal placement
1101 1101
1102 1102 if (self->is_top()) {
1103 1103 // Top node goes in bb #2 with other constants.
1104 1104 // It must be special-cased, because it has no out edges.
1105 1105 early->add_inst(self);
1106 1106 continue;
1107 1107 }
1108 1108
1109 1109 // No uses, just terminate
1110 1110 if (self->outcnt() == 0) {
1111 1111 assert(self->Opcode() == Op_MachProj, "sanity");
1112 1112 continue; // Must be a dead machine projection
1113 1113 }
1114 1114
1115 1115 // If node is pinned in the block, then no scheduling can be done.
1116 1116 if( self->pinned() ) // Pinned in block?
1117 1117 continue;
1118 1118
1119 1119 MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1120 1120 if (mach) {
1121 1121 switch (mach->ideal_Opcode()) {
1122 1122 case Op_CreateEx:
1123 1123 // Don't move exception creation
1124 1124 early->add_inst(self);
1125 1125 continue;
1126 1126 break;
1127 1127 case Op_CheckCastPP:
1128 1128 // Don't move CheckCastPP nodes away from their input, if the input
1129 1129 // is a rawptr (5071820).
1130 1130 Node *def = self->in(1);
1131 1131 if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1132 1132 early->add_inst(self);
1133 1133 #ifdef ASSERT
1134 1134 _raw_oops.push(def);
1135 1135 #endif
1136 1136 continue;
1137 1137 }
1138 1138 break;
1139 1139 }
1140 1140 }
1141 1141
1142 1142 // Gather LCA of all uses
1143 1143 Block *LCA = NULL;
1144 1144 {
1145 1145 for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1146 1146 // For all uses, find LCA
1147 1147 Node* use = self->fast_out(i);
1148 1148 LCA = raise_LCA_above_use(LCA, use, self, _bbs);
1149 1149 }
1150 1150 } // (Hide defs of imax, i from rest of block.)
1151 1151
1152 1152 // Place temps in the block of their use. This isn't a
1153 1153 // requirement for correctness but it reduces useless
1154 1154 // interference between temps and other nodes.
1155 1155 if (mach != NULL && mach->is_MachTemp()) {
1156 1156 _bbs.map(self->_idx, LCA);
1157 1157 LCA->add_inst(self);
1158 1158 continue;
1159 1159 }
1160 1160
1161 1161 // Check if 'self' could be anti-dependent on memory
1162 1162 if (self->needs_anti_dependence_check()) {
1163 1163 // Hoist LCA above possible-defs and insert anti-dependences to
1164 1164 // defs in new LCA block.
1165 1165 LCA = insert_anti_dependences(LCA, self);
1166 1166 }
1167 1167
1168 1168 if (early->_dom_depth > LCA->_dom_depth) {
1169 1169 // Somehow the LCA has moved above the earliest legal point.
1170 1170 // (One way this can happen is via memory_early_block.)
1171 1171 if (C->subsume_loads() == true && !C->failing()) {
1172 1172 // Retry with subsume_loads == false
1173 1173 // If this is the first failure, the sentinel string will "stick"
1174 1174 // to the Compile object, and the C2Compiler will see it and retry.
1175 1175 C->record_failure(C2Compiler::retry_no_subsuming_loads());
1176 1176 } else {
1177 1177 // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
1178 1178 C->record_method_not_compilable("late schedule failed: incorrect graph");
1179 1179 }
1180 1180 return;
1181 1181 }
1182 1182
1183 1183 // If there is no opportunity to hoist, then we're done.
1184 1184 bool try_to_hoist = (LCA != early);
1185 1185
1186 1186 // Must clone guys stay next to use; no hoisting allowed.
1187 1187 // Also cannot hoist guys that alter memory or are otherwise not
1188 1188 // allocatable (hoisting can make a value live longer, leading to
1189 1189 // anti and output dependency problems which are normally resolved
1190 1190 // by the register allocator giving everyone a different register).
1191 1191 if (mach != NULL && must_clone[mach->ideal_Opcode()])
1192 1192 try_to_hoist = false;
1193 1193
1194 1194 Block* late = NULL;
1195 1195 if (try_to_hoist) {
1196 1196 // Now find the block with the least execution frequency.
1197 1197 // Start at the latest schedule and work up to the earliest schedule
1198 1198 // in the dominator tree. Thus the Node will dominate all its uses.
1199 1199 late = hoist_to_cheaper_block(LCA, early, self);
1200 1200 } else {
1201 1201 // Just use the LCA of the uses.
1202 1202 late = LCA;
1203 1203 }
1204 1204
1205 1205 // Put the node into target block
1206 1206 schedule_node_into_block(self, late);
1207 1207
1208 1208 #ifdef ASSERT
1209 1209 if (self->needs_anti_dependence_check()) {
1210 1210 // since precedence edges are only inserted when we're sure they
1211 1211 // are needed make sure that after placement in a block we don't
1212 1212 // need any new precedence edges.
1213 1213 verify_anti_dependences(late, self);
1214 1214 }
1215 1215 #endif
1216 1216 } // Loop until all nodes have been visited
1217 1217
1218 1218 } // end ScheduleLate
1219 1219
1220 1220 //------------------------------GlobalCodeMotion-------------------------------
1221 1221 void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
1222 1222 ResourceMark rm;
1223 1223
1224 1224 #ifndef PRODUCT
1225 1225 if (trace_opto_pipelining()) {
1226 1226 tty->print("\n---- Start GlobalCodeMotion ----\n");
1227 1227 }
1228 1228 #endif
1229 1229
1230 1230 // Initialize the bbs.map for things on the proj_list
1231 1231 uint i;
1232 1232 for( i=0; i < proj_list.size(); i++ )
1233 1233 _bbs.map(proj_list[i]->_idx, NULL);
1234 1234
1235 1235 // Set the basic block for Nodes pinned into blocks
1236 1236 Arena *a = Thread::current()->resource_area();
1237 1237 VectorSet visited(a);
1238 1238 schedule_pinned_nodes( visited );
1239 1239
1240 1240 // Find the earliest Block any instruction can be placed in. Some
1241 1241 // instructions are pinned into Blocks. Unpinned instructions can
1242 1242 // appear in the last block in which all their inputs occur.
1243 1243 visited.Clear();
1244 1244 Node_List stack(a);
1245 1245 stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
1246 1246 if (!schedule_early(visited, stack)) {
1247 1247 // Bailout without retry
1248 1248 C->record_method_not_compilable("early schedule failed");
1249 1249 return;
1250 1250 }
1251 1251
1252 1252 // Build Def-Use edges.
1253 1253 proj_list.push(_root); // Add real root as another root
1254 1254 proj_list.pop();
1255 1255
1256 1256 // Compute the latency information (via backwards walk) for all the
1257 1257 // instructions in the graph
1258 1258 _node_latency = new GrowableArray<uint>(); // resource_area allocation
1259 1259
1260 1260 if( C->do_scheduling() )
1261 1261 ComputeLatenciesBackwards(visited, stack);
1262 1262
1263 1263 // Now schedule all codes as LATE as possible. This is the LCA in the
1264 1264 // dominator tree of all USES of a value. Pick the block with the least
1265 1265 // loop nesting depth that is lowest in the dominator tree.
1266 1266 // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1267 1267 schedule_late(visited, stack);
1268 1268 if( C->failing() ) {
1269 1269 // schedule_late fails only when graph is incorrect.
1270 1270 assert(!VerifyGraphEdges, "verification should have failed");
1271 1271 return;
1272 1272 }
1273 1273
1274 1274 unique = C->unique();
1275 1275
1276 1276 #ifndef PRODUCT
1277 1277 if (trace_opto_pipelining()) {
1278 1278 tty->print("\n---- Detect implicit null checks ----\n");
1279 1279 }
1280 1280 #endif
1281 1281
1282 1282 // Detect implicit-null-check opportunities. Basically, find NULL checks
1283 1283 // with suitable memory ops nearby. Use the memory op to do the NULL check.
1284 1284 // I can generate a memory op if there is not one nearby.
1285 1285 if (C->is_method_compilation()) {
1286 1286 // Don't do it for natives, adapters, or runtime stubs
1287 1287 int allowed_reasons = 0;
1288 1288 // ...and don't do it when there have been too many traps, globally.
1289 1289 for (int reason = (int)Deoptimization::Reason_none+1;
1290 1290 reason < Compile::trapHistLength; reason++) {
1291 1291 assert(reason < BitsPerInt, "recode bit map");
1292 1292 if (!C->too_many_traps((Deoptimization::DeoptReason) reason))
1293 1293 allowed_reasons |= nth_bit(reason);
1294 1294 }
1295 1295 // By reversing the loop direction we get a very minor gain on mpegaudio.
1296 1296 // Feel free to revert to a forward loop for clarity.
1297 1297 // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1298 1298 for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) {
1299 1299 Node *proj = matcher._null_check_tests[i ];
1300 1300 Node *val = matcher._null_check_tests[i+1];
1301 1301 _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons);
1302 1302 // The implicit_null_check will only perform the transformation
1303 1303 // if the null branch is truly uncommon, *and* it leads to an
1304 1304 // uncommon trap. Combined with the too_many_traps guards
1305 1305 // above, this prevents SEGV storms reported in 6366351,
1306 1306 // by recompiling offending methods without this optimization.
1307 1307 }
1308 1308 }
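// Reviewer note -- illustrative sketch only, not part of this change:
// the implicit null check transformation invoked above replaces an
// explicit test-and-branch with a nearby memory op whose faulting
// access performs the check, roughly:
//
//   before:                                after:
//     if (p == NULL) goto uncommon_trap;     v = p->field;  // fault on NULL
//     v = p->field;                          ...            // is routed to the trap
//
// The allowed_reasons mask built from too_many_traps() keeps the
// transformation from being re-applied to methods that have already
// deoptimized here repeatedly (see 6366351 above).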
1309 1309
1310 1310 #ifndef PRODUCT
1311 1311 if (trace_opto_pipelining()) {
1312 1312 tty->print("\n---- Start Local Scheduling ----\n");
1313 1313 }
1314 1314 #endif
1315 1315
1316 1316 // Schedule locally. Right now a simple topological sort.
1317 1317 // Later, do a real latency-aware scheduler.
1318 1318 int *ready_cnt = NEW_RESOURCE_ARRAY(int,C->unique());
1319 1319 memset( ready_cnt, -1, C->unique() * sizeof(int) );
1320 1320 visited.Clear();
1321 1321 for (i = 0; i < _num_blocks; i++) {
1322 1322 if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) {
1323 1323 if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1324 1324 C->record_method_not_compilable("local schedule failed");
1325 1325 }
1326 1326 return;
1327 1327 }
1328 1328 }
1329 1329
1330 1330 // If we inserted any instructions between a Call and its CatchNode,
1331 1331 // clone the instructions on all paths below the Catch.
1332 1332 for( i=0; i < _num_blocks; i++ )
1333 1333 _blocks[i]->call_catch_cleanup(_bbs);
1334 1334
1335 1335 #ifndef PRODUCT
1336 1336 if (trace_opto_pipelining()) {
1337 1337 tty->print("\n---- After GlobalCodeMotion ----\n");
1338 1338 for (uint i = 0; i < _num_blocks; i++) {
1339 1339 _blocks[i]->dump();
1340 1340 }
1341 1341 }
1342 1342 #endif
1343 1343 // Dead.
1344 1344 _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1345 1345 }
1346 1346
1347 1347
1348 1348 //------------------------------Estimate_Block_Frequency-----------------------
1349 1349 // Estimate block frequencies based on IfNode probabilities.
1350 1350 void PhaseCFG::Estimate_Block_Frequency() {
1351 1351
1352 1352 // Force conditional branches leading to uncommon traps to be unlikely,
1353 1353 // not because we get to the uncommon_trap with less relative frequency,
1354 1354 // but because an uncommon_trap typically causes a deopt, so we only get
1355 1355 // there once.
1356 1356 if (C->do_freq_based_layout()) {
1357 1357 Block_List worklist;
1358 1358 Block* root_blk = _blocks[0];
1359 1359 for (uint i = 1; i < root_blk->num_preds(); i++) {
1360 1360 Block *pb = _bbs[root_blk->pred(i)->_idx];
1361 1361 if (pb->has_uncommon_code()) {
1362 1362 worklist.push(pb);
1363 1363 }
1364 1364 }
1365 1365 while (worklist.size() > 0) {
1366 1366 Block* uct = worklist.pop();
1367 1367 if (uct == _broot) continue;
1368 1368 for (uint i = 1; i < uct->num_preds(); i++) {
1369 1369 Block *pb = _bbs[uct->pred(i)->_idx];
1370 1370 if (pb->_num_succs == 1) {
1371 1371 worklist.push(pb);
1372 1372 } else if (pb->num_fall_throughs() == 2) {
1373 1373 pb->update_uncommon_branch(uct);
1374 1374 }
1375 1375 }
1376 1376 }
1377 1377 }
1378 1378
1379 1379 // Create the loop tree and calculate loop depth.
1380 1380 _root_loop = create_loop_tree();
1381 1381 _root_loop->compute_loop_depth(0);
1382 1382
1383 1383 // Compute the frequency of each block, relative to a single loop entry.
1384 1384 _root_loop->compute_freq();
1385 1385
1386 1386 // Adjust all frequencies to be relative to a single method entry
1387 1387 _root_loop->_freq = 1.0;
1388 1388 _root_loop->scale_freq();
1389 1389
1390 1390 // Save outermost loop frequency for LRG frequency threshold
1391 1391 _outer_loop_freq = _root_loop->outer_loop_freq();
1392 1392
1393 1393 // Force paths ending at uncommon traps to be infrequent
1394 1394 if (!C->do_freq_based_layout()) {
1395 1395 Block_List worklist;
1396 1396 Block* root_blk = _blocks[0];
1397 1397 for (uint i = 1; i < root_blk->num_preds(); i++) {
1398 1398 Block *pb = _bbs[root_blk->pred(i)->_idx];
1399 1399 if (pb->has_uncommon_code()) {
1400 1400 worklist.push(pb);
1401 1401 }
1402 1402 }
1403 1403 while (worklist.size() > 0) {
1404 1404 Block* uct = worklist.pop();
1405 1405 uct->_freq = PROB_MIN;
1406 1406 for (uint i = 1; i < uct->num_preds(); i++) {
1407 1407 Block *pb = _bbs[uct->pred(i)->_idx];
1408 1408 if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1409 1409 worklist.push(pb);
1410 1410 }
1411 1411 }
1412 1412 }
1413 1413 }
1414 1414
1415 1415 #ifdef ASSERT
1416 1416 for (uint i = 0; i < _num_blocks; i++ ) {
1417 1417 Block *b = _blocks[i];
1418 1418 assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1419 1419 }
1420 1420 #endif
1421 1421
1422 1422 #ifndef PRODUCT
1423 1423 if (PrintCFGBlockFreq) {
1424 1424 tty->print_cr("CFG Block Frequencies");
1425 1425 _root_loop->dump_tree();
1426 1426 if (Verbose) {
1427 1427 tty->print_cr("PhaseCFG dump");
1428 1428 dump();
1429 1429 tty->print_cr("Node dump");
1430 1430 _root->dump(99999);
1431 1431 }
1432 1432 }
1433 1433 #endif
1434 1434 }
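// Reviewer note -- illustrative sketch only, not part of this change:
// the frequency pipeline above is create_loop_tree() ->
// compute_loop_depth() -> compute_freq() (frequencies relative to each
// loop head) -> scale_freq() (multiply through by estimated trip
// counts). For example, if a loop's exit probabilities sum to 0.1 per
// iteration, its estimated trip count is about 1/0.1 = 10, so a body
// block with relative frequency 1.0 ends up near 10.0 after scaling.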
1435 1435
1436 1436 //----------------------------create_loop_tree--------------------------------
1437 1437 // Create a loop tree from the CFG
1438 1438 CFGLoop* PhaseCFG::create_loop_tree() {
1439 1439
1440 1440 #ifdef ASSERT
1441 1441 assert( _blocks[0] == _broot, "" );
1442 1442 for (uint i = 0; i < _num_blocks; i++ ) {
1443 1443 Block *b = _blocks[i];
1444 1444 // Check that the _loop fields are clear...we could clear them if not.
1445 1445 assert(b->_loop == NULL, "clear _loop expected");
1446 1446 // Sanity check that the RPO numbering is reflected in the _blocks array.
1447 1447 // It doesn't have to be for the loop tree to be built, but if it is not,
1448 1448 // then the blocks have been reordered since dom graph building...which
1449 1449 // calls the RPO numbering into question.
1450 1450 assert(b->_rpo == i, "unexpected reverse post order number");
1451 1451 }
1452 1452 #endif
1453 1453
1454 1454 int idct = 0;
1455 1455 CFGLoop* root_loop = new CFGLoop(idct++);
1456 1456
1457 1457 Block_List worklist;
1458 1458
1459 1459 // Assign blocks to loops
1460 1460 for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
1461 1461 Block *b = _blocks[i];
1462 1462
1463 1463 if (b->head()->is_Loop()) {
1464 1464 Block* loop_head = b;
1465 1465 assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1466 1466 Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1467 1467 Block* tail = _bbs[tail_n->_idx];
1468 1468
1469 1469 // Defensively filter out Loop nodes for non-single-entry loops.
1470 1470 // For all reasonable loops, the head occurs before the tail in RPO.
1471 1471 if (i <= tail->_rpo) {
1472 1472
1473 1473 // The tail and (recursive) predecessors of the tail
1474 1474 // are made members of a new loop.
1475 1475
1476 1476 assert(worklist.size() == 0, "nonempty worklist");
1477 1477 CFGLoop* nloop = new CFGLoop(idct++);
1478 1478 assert(loop_head->_loop == NULL, "just checking");
1479 1479 loop_head->_loop = nloop;
1480 1480 // Add to nloop so push_pred() will skip over inner loops
1481 1481 nloop->add_member(loop_head);
1482 1482 nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs);
1483 1483
1484 1484 while (worklist.size() > 0) {
1485 1485 Block* member = worklist.pop();
1486 1486 if (member != loop_head) {
1487 1487 for (uint j = 1; j < member->num_preds(); j++) {
1488 1488 nloop->push_pred(member, j, worklist, _bbs);
1489 1489 }
1490 1490 }
1491 1491 }
1492 1492 }
1493 1493 }
1494 1494 }
1495 1495
1496 1496 // Create a member list for each loop consisting
1497 1497 // of both blocks and (immediate child) loops.
1498 1498 for (uint i = 0; i < _num_blocks; i++) {
1499 1499 Block *b = _blocks[i];
1500 1500 CFGLoop* lp = b->_loop;
1501 1501 if (lp == NULL) {
1502 1502 // Not assigned to a loop. Add it to the method's pseudo loop.
1503 1503 b->_loop = root_loop;
1504 1504 lp = root_loop;
1505 1505 }
1506 1506 if (lp == root_loop || b != lp->head()) { // loop heads are already members
1507 1507 lp->add_member(b);
1508 1508 }
1509 1509 if (lp != root_loop) {
1510 1510 if (lp->parent() == NULL) {
1511 1511 // Not a nested loop. Make it a child of the method's pseudo loop.
1512 1512 root_loop->add_nested_loop(lp);
1513 1513 }
1514 1514 if (b == lp->head()) {
1515 1515 // Add nested loop to member list of parent loop.
1516 1516 lp->parent()->add_member(lp);
1517 1517 }
1518 1518 }
1519 1519 }
1520 1520
1521 1521 return root_loop;
1522 1522 }
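// Reviewer note -- illustrative only, not part of this change: for a
// method containing an outer loop L1 with an inner loop L2, the tree
// built above looks like
//
//   root_loop (the method's pseudo loop, depth 0)
//     +-- L1 (depth 1)
//           +-- L2 (depth 2)
//
// Each loop's member list holds its blocks plus its immediate child
// loops; blocks outside any loop are members of root_loop.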
1523 1523
1524 1524 //------------------------------push_pred--------------------------------------
1525 1525 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) {
1526 1526 Node* pred_n = blk->pred(i);
1527 1527 Block* pred = node_to_blk[pred_n->_idx];
1528 1528 CFGLoop *pred_loop = pred->_loop;
1529 1529 if (pred_loop == NULL) {
1530 1530 // Filter out blocks for non-single-entry loops.
1531 1531 // For all reasonable loops, the head occurs before the tail in RPO.
1532 1532 if (pred->_rpo > head()->_rpo) {
1533 1533 pred->_loop = this;
1534 1534 worklist.push(pred);
1535 1535 }
1536 1536 } else if (pred_loop != this) {
1537 1537 // Nested loop.
1538 1538 while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1539 1539 pred_loop = pred_loop->_parent;
1540 1540 }
1541 1541 // Make pred's loop be a child
1542 1542 if (pred_loop->_parent == NULL) {
1543 1543 add_nested_loop(pred_loop);
1544 1544 // Continue with loop entry predecessor.
1545 1545 Block* pred_head = pred_loop->head();
1546 1546 assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1547 1547 assert(pred_head != head(), "loop head in only one loop");
1548 1548 push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk);
1549 1549 } else {
1550 1550 assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1551 1551 }
1552 1552 }
1553 1553 }
1554 1554
1555 1555 //------------------------------add_nested_loop--------------------------------
1556 1556 // Make cl a child of the current loop in the loop tree.
1557 1557 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1558 1558 assert(_parent == NULL, "no parent yet");
1559 1559 assert(cl != this, "not my own parent");
1560 1560 cl->_parent = this;
1561 1561 CFGLoop* ch = _child;
1562 1562 if (ch == NULL) {
1563 1563 _child = cl;
1564 1564 } else {
1565 1565 while (ch->_sibling != NULL) { ch = ch->_sibling; }
1566 1566 ch->_sibling = cl;
1567 1567 }
1568 1568 }
1569 1569
1570 1570 //------------------------------compute_loop_depth-----------------------------
1571 1571 // Store the loop depth in each CFGLoop object.
1572 1572 // Recursively walk the children to do the same for them.
1573 1573 void CFGLoop::compute_loop_depth(int depth) {
1574 1574 _depth = depth;
1575 1575 CFGLoop* ch = _child;
1576 1576 while (ch != NULL) {
1577 1577 ch->compute_loop_depth(depth + 1);
1578 1578 ch = ch->_sibling;
1579 1579 }
1580 1580 }
1581 1581
1582 1582 //------------------------------compute_freq-----------------------------------
1583 1583 // Compute the frequency of each block and loop, relative to a single entry
1584 1584 // into the dominating loop head.
1585 1585 void CFGLoop::compute_freq() {
1586 1586 // Bottom up traversal of loop tree (visit inner loops first.)
1587 1587 // Set loop head frequency to 1.0, then transitively
1588 1588 // compute frequency for all successors in the loop,
1589 1589 // as well as for each exit edge. Inner loops are
1590 1590 // treated as single blocks with loop exit targets
1591 1591 // as the successor blocks.
1592 1592
1593 1593 // Nested loops first
1594 1594 CFGLoop* ch = _child;
1595 1595 while (ch != NULL) {
1596 1596 ch->compute_freq();
1597 1597 ch = ch->_sibling;
1598 1598 }
1599 1599 assert (_members.length() > 0, "no empty loops");
1600 1600 Block* hd = head();
1601 1601 hd->_freq = 1.0f;
1602 1602 for (int i = 0; i < _members.length(); i++) {
1603 1603 CFGElement* s = _members.at(i);
1604 1604 float freq = s->_freq;
1605 1605 if (s->is_block()) {
1606 1606 Block* b = s->as_Block();
1607 1607 for (uint j = 0; j < b->_num_succs; j++) {
1608 1608 Block* sb = b->_succs[j];
1609 1609 update_succ_freq(sb, freq * b->succ_prob(j));
1610 1610 }
1611 1611 } else {
1612 1612 CFGLoop* lp = s->as_CFGLoop();
1613 1613 assert(lp->_parent == this, "immediate child");
1614 1614 for (int k = 0; k < lp->_exits.length(); k++) {
1615 1615 Block* eb = lp->_exits.at(k).get_target();
1616 1616 float prob = lp->_exits.at(k).get_prob();
1617 1617 update_succ_freq(eb, freq * prob);
1618 1618 }
1619 1619 }
1620 1620 }
1621 1621
1622 1622 // For all loops other than the outer, "method" loop,
1623 1623 // sum and normalize the exit probability. The "method" loop
1624 1624 // should keep the initial exit probability of 1, so that
1625 1625 // inner blocks do not get erroneously scaled.
1626 1626 if (_depth != 0) {
1627 1627 // Total the exit probabilities for this loop.
1628 1628 float exits_sum = 0.0f;
1629 1629 for (int i = 0; i < _exits.length(); i++) {
1630 1630 exits_sum += _exits.at(i).get_prob();
1631 1631 }
1632 1632
1633 1633 // Normalize the exit probabilities. Until now, the
1634 1634 // probabilities estimate the likelihood of exit per
1635 1635 // single loop iteration; afterward, they estimate
1636 1636 // the probability of exit per loop entry.
1637 1637 for (int i = 0; i < _exits.length(); i++) {
1638 1638 Block* et = _exits.at(i).get_target();
1639 1639 float new_prob = 0.0f;
1640 1640 if (_exits.at(i).get_prob() > 0.0f) {
1641 1641 new_prob = _exits.at(i).get_prob() / exits_sum;
1642 1642 }
1643 1643 BlockProbPair bpp(et, new_prob);
1644 1644 _exits.at_put(i, bpp);
1645 1645 }
1646 1646
1647 1647 // Save the total, but guard against unreasonable probability,
1648 1648 // as the value is used to estimate the loop trip count.
1649 1649 // An infinite trip count would blur relative block
1650 1650 // frequencies.
1651 1651 if (exits_sum > 1.0f) exits_sum = 1.0;
1652 1652 if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
1653 1653 _exit_prob = exits_sum;
1654 1654 }
1655 1655 }
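// Reviewer note -- worked example, not part of this change: suppose a
// loop has two exits with per-iteration probabilities 0.05 and 0.15.
// Then exits_sum = 0.20, and the normalization above rewrites them as
// 0.25 and 0.75 -- the probability of leaving through each exit per
// loop entry rather than per iteration -- while _exit_prob keeps the
// 0.20 total for the trip count estimate.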
1656 1656
1657 1657 //------------------------------succ_prob-------------------------------------
1658 1658 // Determine the probability of reaching successor 'i' from the receiver block.
1659 1659 float Block::succ_prob(uint i) {
1660 1660 int eidx = end_idx();
1661 1661 Node *n = _nodes[eidx]; // Get ending Node
1662 1662
1663 1663 int op = n->Opcode();
1664 1664 if (n->is_Mach()) {
1665 1665 if (n->is_MachNullCheck()) {
1666 1666 // Can only reach here if called after lcm. The original Op_If is gone,
1667 1667 // so we attempt to infer the probability from one or both of the
1668 1668 // successor blocks.
1669 1669 assert(_num_succs == 2, "expecting 2 successors of a null check");
1670 1670 // If either successor has only one predecessor, then the
1671 1671 // probability estimate can be derived using the
1672 1672 // relative frequency of the successor and this block.
1673 1673 if (_succs[i]->num_preds() == 2) {
1674 1674 return _succs[i]->_freq / _freq;
1675 1675 } else if (_succs[1-i]->num_preds() == 2) {
1676 1676 return 1 - (_succs[1-i]->_freq / _freq);
1677 1677 } else {
1678 1678 // Estimate using both successor frequencies
1679 1679 float freq = _succs[i]->_freq;
1680 1680 return freq / (freq + _succs[1-i]->_freq);
1681 1681 }
1682 1682 }
1683 1683 op = n->as_Mach()->ideal_Opcode();
1684 1684 }
1685 1685
1686 1686
1687 1687 // Switch on branch type
1688 1688 switch( op ) {
1689 1689 case Op_CountedLoopEnd:
1690 1690 case Op_If: {
1691 1691 assert (i < 2, "just checking");
1692 1692 // Conditionals pass on only part of their frequency
1693 1693 float prob = n->as_MachIf()->_prob;
1694 1694 assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
1695 1695 // If succ[i] is the FALSE branch, invert path info
1696 1696 if( _nodes[i + eidx + 1]->Opcode() == Op_IfFalse ) {
1697 1697 return 1.0f - prob; // not taken
1698 1698 } else {
1699 1699 return prob; // taken
1700 1700 }
1701 1701 }
1702 1702
1703 1703 case Op_Jump:
1704 1704 // Divide the frequency between all successors evenly
1705 1705 return 1.0f/_num_succs;
1706 1706
1707 1707 case Op_Catch: {
1708 1708 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1709 1709 if (ci->_con == CatchProjNode::fall_through_index) {
1710 1710 // Fall-thru path gets the lion's share.
1711 1711 return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
1712 1712 } else {
1713 1713 // Presume exceptional paths are equally unlikely
1714 1714 return PROB_UNLIKELY_MAG(5);
1715 1715 }
1716 1716 }
1717 1717
1718 1718 case Op_Root:
1719 1719 case Op_Goto:
1720 1720 // Pass frequency straight thru to target
1721 1721 return 1.0f;
1722 1722
1723 1723 case Op_NeverBranch:
1724 1724 return 0.0f;
1725 1725
1726 1726 case Op_TailCall:
1727 1727 case Op_TailJump:
1728 1728 case Op_Return:
1729 1729 case Op_Halt:
1730 1730 case Op_Rethrow:
1731 1731 // Do not push out freq to root block
1732 1732 return 0.0f;
1733 1733
1734 1734 default:
1735 1735 ShouldNotReachHere();
1736 1736 }
1737 1737
1738 1738 return 0.0f;
1739 1739 }
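// Reviewer note -- worked example, not part of this change: for a
// MachNullCheck block with _freq = 1.0 whose not-null successor has a
// single predecessor and _freq = 0.999, succ_prob() returns
// 0.999 / 1.0 = 0.999 for that successor and 1 - 0.999 = 0.001 for the
// uncommon null path, recovering roughly the probability the original
// If carried before it was replaced during local code motion.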
1740 1740
1741 1741 //------------------------------num_fall_throughs-----------------------------
1742 1742 // Return the number of fall-through candidates for a block
1743 1743 int Block::num_fall_throughs() {
1744 1744 int eidx = end_idx();
1745 1745 Node *n = _nodes[eidx]; // Get ending Node
1746 1746
1747 1747 int op = n->Opcode();
1748 1748 if (n->is_Mach()) {
1749 1749 if (n->is_MachNullCheck()) {
1750 1750 // In theory, either side can fall through; for simplicity's sake,
1751 1751 // let's say only the false branch can for now.
1752 1752 return 1;
1753 1753 }
1754 1754 op = n->as_Mach()->ideal_Opcode();
1755 1755 }
1756 1756
1757 1757 // Switch on branch type
1758 1758 switch( op ) {
1759 1759 case Op_CountedLoopEnd:
1760 1760 case Op_If:
1761 1761 return 2;
1762 1762
1763 1763 case Op_Root:
1764 1764 case Op_Goto:
1765 1765 return 1;
1766 1766
1767 1767 case Op_Catch: {
1768 1768 for (uint i = 0; i < _num_succs; i++) {
1769 1769 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1770 1770 if (ci->_con == CatchProjNode::fall_through_index) {
1771 1771 return 1;
1772 1772 }
1773 1773 }
1774 1774 return 0;
1775 1775 }
1776 1776
1777 1777 case Op_Jump:
1778 1778 case Op_NeverBranch:
1779 1779 case Op_TailCall:
1780 1780 case Op_TailJump:
1781 1781 case Op_Return:
1782 1782 case Op_Halt:
1783 1783 case Op_Rethrow:
1784 1784 return 0;
1785 1785
1786 1786 default:
1787 1787 ShouldNotReachHere();
1788 1788 }
1789 1789
1790 1790 return 0;
1791 1791 }
1792 1792
1793 1793 //------------------------------succ_fall_through-----------------------------
1794 1794 // Return true if a specific successor could be fall-through target.
1795 1795 bool Block::succ_fall_through(uint i) {
1796 1796 int eidx = end_idx();
1797 1797 Node *n = _nodes[eidx]; // Get ending Node
1798 1798
1799 1799 int op = n->Opcode();
1800 1800 if (n->is_Mach()) {
1801 1801 if (n->is_MachNullCheck()) {
1802 1802 // In theory, either side can fall through; for simplicity's sake,
1803 1803 // let's say only the false branch can for now.
1804 1804 return _nodes[i + eidx + 1]->Opcode() == Op_IfFalse;
1805 1805 }
1806 1806 op = n->as_Mach()->ideal_Opcode();
1807 1807 }
1808 1808
1809 1809 // Switch on branch type
1810 1810 switch( op ) {
1811 1811 case Op_CountedLoopEnd:
1812 1812 case Op_If:
1813 1813 case Op_Root:
1814 1814 case Op_Goto:
1815 1815 return true;
1816 1816
1817 1817 case Op_Catch: {
1818 1818 const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1819 1819 return ci->_con == CatchProjNode::fall_through_index;
1820 1820 }
1821 1821
1822 1822 case Op_Jump:
1823 1823 case Op_NeverBranch:
1824 1824 case Op_TailCall:
1825 1825 case Op_TailJump:
1826 1826 case Op_Return:
1827 1827 case Op_Halt:
1828 1828 case Op_Rethrow:
1829 1829 return false;
1830 1830
1831 1831 default:
1832 1832 ShouldNotReachHere();
1833 1833 }
1834 1834
1835 1835 return false;
1836 1836 }
1837 1837
1838 1838 //------------------------------update_uncommon_branch------------------------
1839 1839 // Update the probability of a two-way branch so the given successor is uncommon
1840 1840 void Block::update_uncommon_branch(Block* ub) {
1841 1841 int eidx = end_idx();
1842 1842 Node *n = _nodes[eidx]; // Get ending Node
1843 1843
1844 1844 int op = n->as_Mach()->ideal_Opcode();
1845 1845
1846 1846 assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If");
1847 1847 assert(num_fall_throughs() == 2, "must be a two way branch block");
1848 1848
1849 1849 // Which successor is ub?
1850 1850 uint s;
1851 1851 for (s = 0; s <_num_succs; s++) {
1852 1852 if (_succs[s] == ub) break;
1853 1853 }
1854 1854 assert(s < 2, "uncommon successor must be found");
1855 1855
1856 1856 // If ub is the true path, make the probability small; otherwise
1857 1857 // ub is the false path, so make the probability large.
1858 1858 bool invert = (_nodes[s + eidx + 1]->Opcode() == Op_IfFalse);
1859 1859
1860 1860 // Get existing probability
1861 1861 float p = n->as_MachIf()->_prob;
1862 1862
1863 1863 if (invert) p = 1.0 - p;
1864 1864 if (p > PROB_MIN) {
1865 1865 p = PROB_MIN;
1866 1866 }
1867 1867 if (invert) p = 1.0 - p;
1868 1868
1869 1869 n->as_MachIf()->_prob = p;
1870 1870 }
1871 1871
1872 1872 //------------------------------update_succ_freq-------------------------------
1873 1873 // Update the appropriate frequency associated with block 'b', a successor of
1874 1874 // a block in this loop.
1875 1875 void CFGLoop::update_succ_freq(Block* b, float freq) {
1876 1876 if (b->_loop == this) {
1877 1877 if (b == head()) {
1878 1878 // back branch within the loop
1879 1879 // Do nothing now; the loop-carried frequency will be
1880 1880 // adjusted later in scale_freq().
1881 1881 } else {
1882 1882 // simple branch within the loop
1883 1883 b->_freq += freq;
1884 1884 }
1885 1885 } else if (!in_loop_nest(b)) {
1886 1886 // branch is exit from this loop
1887 1887 BlockProbPair bpp(b, freq);
1888 1888 _exits.append(bpp);
1889 1889 } else {
1890 1890 // branch into nested loop
1891 1891 CFGLoop* ch = b->_loop;
1892 1892 ch->_freq += freq;
1893 1893 }
1894 1894 }
1895 1895
1896 1896 //------------------------------in_loop_nest-----------------------------------
1897 1897 // Determine if block b is in the receiver's loop nest.
1898 1898 bool CFGLoop::in_loop_nest(Block* b) {
1899 1899 int depth = _depth;
1900 1900 CFGLoop* b_loop = b->_loop;
1901 1901 int b_depth = b_loop->_depth;
1902 1902 if (depth == b_depth) {
1903 1903 return true;
1904 1904 }
1905 1905 while (b_depth > depth) {
1906 1906 b_loop = b_loop->_parent;
1907 1907 b_depth = b_loop->_depth;
1908 1908 }
1909 1909 return b_loop == this;
1910 1910 }
1911 1911
1912 1912 //------------------------------scale_freq-------------------------------------
1913 1913 // Scale frequency of loops and blocks by trip counts from outer loops
1914 1914 // Do a top down traversal of loop tree (visit outer loops first.)
1915 1915 void CFGLoop::scale_freq() {
1916 1916 float loop_freq = _freq * trip_count();
1917 1917 _freq = loop_freq;
1918 1918 for (int i = 0; i < _members.length(); i++) {
1919 1919 CFGElement* s = _members.at(i);
1920 1920 float block_freq = s->_freq * loop_freq;
1921 1921 if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY)
1922 1922 block_freq = MIN_BLOCK_FREQUENCY;
1923 1923 s->_freq = block_freq;
1924 1924 }
1925 1925 CFGLoop* ch = _child;
1926 1926 while (ch != NULL) {
1927 1927 ch->scale_freq();
1928 1928 ch = ch->_sibling;
1929 1929 }
1930 1930 }
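// Reviewer note -- worked example, not part of this change: scale_freq()
// runs top down over the loop tree. If an inner loop was entered with
// (already scaled) frequency 0.5 and its trip_count() is 4, its _freq
// becomes 0.5 * 4 = 2.0, and each member's frequency is then multiplied
// by 2.0, clamped below at MIN_BLOCK_FREQUENCY.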
1931 1931
1932 1932 // Frequency of outer loop
1933 1933 float CFGLoop::outer_loop_freq() const {
1934 1934 if (_child != NULL) {
1935 1935 return _child->_freq;
1936 1936 }
1937 1937 return _freq;
1938 1938 }
1939 1939
1940 1940 #ifndef PRODUCT
1941 1941 //------------------------------dump_tree--------------------------------------
1942 1942 void CFGLoop::dump_tree() const {
1943 1943 dump();
1944 1944 if (_child != NULL) _child->dump_tree();
1945 1945 if (_sibling != NULL) _sibling->dump_tree();
1946 1946 }
1947 1947
1948 1948 //------------------------------dump-------------------------------------------
1949 1949 void CFGLoop::dump() const {
1950 1950 for (int i = 0; i < _depth; i++) tty->print(" ");
1951 1951 tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n",
1952 1952 _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
1953 1953 for (int i = 0; i < _depth; i++) tty->print(" ");
1954 1954 tty->print(" members:");
1955 1955 int k = 0;
1956 1956 for (int i = 0; i < _members.length(); i++) {
1957 1957 if (k++ >= 6) {
1958 1958 tty->print("\n ");
1959 1959 for (int j = 0; j < _depth+1; j++) tty->print(" ");
1960 1960 k = 0;
1961 1961 }
1962 1962 CFGElement *s = _members.at(i);
1963 1963 if (s->is_block()) {
1964 1964 Block *b = s->as_Block();
1965 1965 tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
1966 1966 } else {
1967 1967 CFGLoop* lp = s->as_CFGLoop();
1968 1968 tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
1969 1969 }
1970 1970 }
1971 1971 tty->print("\n");
1972 1972 for (int i = 0; i < _depth; i++) tty->print(" ");
1973 1973 tty->print(" exits: ");
1974 1974 k = 0;
1975 1975 for (int i = 0; i < _exits.length(); i++) {
1976 1976 if (k++ >= 7) {
1977 1977 tty->print("\n ");
1978 1978 for (int j = 0; j < _depth+1; j++) tty->print(" ");
1979 1979 k = 0;
1980 1980 }
1981 1981 Block *blk = _exits.at(i).get_target();
1982 1982 float prob = _exits.at(i).get_prob();
1983 1983 tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
1984 1984 }
1985 1985 tty->print("\n");
1986 1986 }
1987 1987 #endif