--- old/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	2015-09-03 15:31:30.604338200 -0700
+++ new/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	2015-09-03 15:31:30.337338200 -0700
@@ -72,6 +72,7 @@
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- old/src/cpu/ppc/vm/c2_globals_ppc.hpp	2015-09-03 15:31:32.067338200 -0700
+++ new/src/cpu/ppc/vm/c2_globals_ppc.hpp	2015-09-03 15:31:31.829338200 -0700
@@ -60,6 +60,7 @@
 define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);
 // GL:
 // Detected a problem with unscaled compressed oops and
 // narrow_oop_use_complex_address() == false.
--- old/src/cpu/sparc/vm/c2_globals_sparc.hpp	2015-09-03 15:31:33.496338200 -0700
+++ new/src/cpu/sparc/vm/c2_globals_sparc.hpp	2015-09-03 15:31:33.250338200 -0700
@@ -64,6 +64,7 @@
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               true);
+define_pd_global(bool, OptoRegScheduling,            false);
 
 #ifdef _LP64
 // We need to make sure that all generated code is within
--- old/src/cpu/x86/vm/c2_globals_x86.hpp	2015-09-03 15:31:34.909338200 -0700
+++ new/src/cpu/x86/vm/c2_globals_x86.hpp	2015-09-03 15:31:34.662338200 -0700
@@ -48,11 +48,11 @@
 
 define_pd_global(intx, OnStackReplacePercentage,     140);
 define_pd_global(intx, ConditionalMoveLimit,         3);
-define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
+define_pd_global(intx, FLOATPRESSURE,                14);
 define_pd_global(intx, InteriorEntryAlignment,       16);
 define_pd_global(size_t, NewSizeThreadIncrease,      ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
@@ -64,6 +64,7 @@
 define_pd_global(uint64_t, MaxRAM,                   128ULL*G);
 #else
 define_pd_global(intx, INTPRESSURE,                  6);
+define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, InteriorEntryAlignment,       4);
 define_pd_global(size_t, NewSizeThreadIncrease,      4*K);
 define_pd_global(intx, LoopUnrollLimit,              50);     // Design center runs on 1.3.1
@@ -82,6 +83,7 @@
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            true);
 
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- old/src/share/vm/opto/block.cpp	2015-09-03 15:31:36.374338200 -0700
+++ new/src/share/vm/opto/block.cpp	2015-09-03 15:31:36.157338200 -0700
@@ -358,6 +358,8 @@
 PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher)
 : Phase(CFG)
 , _block_arena(arena)
+, _regalloc(NULL)
+, _scheduling_for_pressure(false)
 , _root(root)
 , _matcher(matcher)
 , _node_to_block_mapping(arena)
--- old/src/share/vm/opto/block.hpp	2015-09-03 15:31:37.720338200 -0700
+++ new/src/share/vm/opto/block.hpp	2015-09-03 15:31:37.488338200 -0700
@@ -37,6 +37,7 @@
 class Matcher;
 class RootNode;
 class VectorSet;
+class PhaseChaitin;
 struct Tarjan;
 
 //------------------------------Block_Array------------------------------------
@@ -383,6 +384,12 @@
   // Arena for the blocks to be stored in
   Arena* _block_arena;
 
+  // Info used for scheduling
+  PhaseChaitin* _regalloc;
+
+  // Register pressure heuristic used?
+  bool _scheduling_for_pressure;
+
   // The matcher for this compilation
   Matcher& _matcher;
 
@@ -433,12 +440,14 @@
   // to late. Helper for schedule_late.
   Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self);
 
-  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call);
+  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recacl_pressure_nodes);
   void set_next_call(Block* block, Node* n, VectorSet& next_call);
   void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call);
 
   // Perform basic-block local scheduling
-  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot);
+  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot,
+               intptr_t* recacl_pressure_nodes);
+  void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode);
 
   // Schedule a call next in the block
   uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call);
--- old/src/share/vm/opto/c2_globals.hpp	2015-09-03 15:31:39.406338200 -0700
+++ new/src/share/vm/opto/c2_globals.hpp	2015-09-03 15:31:39.195338200 -0700
@@ -306,6 +306,9 @@
   product_pd(bool, OptoScheduling,                                          \
           "Instruction Scheduling after register allocation")               \
                                                                             \
+  product_pd(bool, OptoRegScheduling,                                       \
+          "Instruction Scheduling before register allocation for pressure") \
+                                                                            \
   product(bool, PartialPeelLoop, true,                                      \
           "Partial peel (rotate) loops")                                    \
                                                                             \
--- old/src/share/vm/opto/chaitin.cpp	2015-09-03 15:31:40.702338200 -0700
+++ new/src/share/vm/opto/chaitin.cpp	2015-09-03 15:31:40.496338200 -0700
@@ -191,7 +191,7 @@
   return next;
 }
 
-PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
+PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated)
   : PhaseRegAlloc(unique, cfg, matcher,
 #ifndef PRODUCT
        print_chaitin_statistics
@@ -205,6 +205,11 @@
   , _spilled_twice(Thread::current()->resource_area())
   , _lo_degree(0), _lo_stk_degree(0), _hi_degree(0), _simplified(0)
   , _oldphi(unique)
+  , _scheduling_info_generated(scheduling_info_generated)
+  , _sched_int_pressure(0, INTPRESSURE)
+  , _sched_float_pressure(0, FLOATPRESSURE)
+  , _scratch_int_pressure(0, INTPRESSURE)
+  , _scratch_float_pressure(0, FLOATPRESSURE)
 #ifndef PRODUCT
   , _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling"))
 #endif
@@ -350,7 +355,7 @@
   // all copy-related live ranges low and then using the max copy-related
   // live range as a cut-off for LIVE and the IFG.  In other words, I can
   // build a subset of LIVE and IFG just for copies.
-  PhaseLive live(_cfg, _lrg_map.names(), &live_arena);
+  PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false);
 
   // Need IFG for coalescing and coloring
   PhaseIFG ifg(&live_arena);
@@ -690,6 +695,29 @@
   _lrg_map.reset_uf_map(lr_counter);
 }
 
+void PhaseChaitin::mark_ssa() {
+  // Use ssa names to populate the live range maps or if no mask
+  // is available, use the 0 entry.
+  uint max_idx = 0;
+  for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) {
+    Block* block = _cfg.get_block(i);
+    uint cnt = block->number_of_nodes();
+
+    // Handle all the normal Nodes in the block
+    for ( uint j = 0; j < cnt; j++ ) {
+      Node *n = block->get_node(j);
+      // Pre-color to the zero live range, or pick virtual register
+      const RegMask &rm = n->out_RegMask();
+      _lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0);
+      max_idx = (n->_idx > max_idx) ? n->_idx : max_idx;
+    }
+  }
+  _lrg_map.set_max_lrg_id(max_idx+1);
+
+  // Reset the Union-Find mapping to be identity
+  _lrg_map.reset_uf_map(max_idx+1);
+}
+
 
 // Gather LiveRanGe information, including register masks.  Modification of
 // cisc spillable in_RegMasks should not be done before AggressiveCoalesce.
@@ -707,7 +735,9 @@
     for (uint j = 1; j < block->number_of_nodes(); j++) {
       Node* n = block->get_node(j);
       uint input_edge_start =1; // Skip control most nodes
+      bool is_machine_node = false;
       if (n->is_Mach()) {
+        is_machine_node = true;
         input_edge_start = n->as_Mach()->oper_input_base();
       }
       uint idx = n->is_Copy();
@@ -929,6 +959,7 @@
           // Convert operand number to edge index number
           inp = n->as_Mach()->operand_index(inp);
       }
+
       // Prepare register mask for each input
       for( uint k = input_edge_start; k < cnt; k++ ) {
         uint vreg = _lrg_map.live_range_id(n->in(k));
@@ -948,6 +979,12 @@
           n->as_Mach()->use_cisc_RegMask();
         }
 
+        if (is_machine_node && _scheduling_info_generated) {
+          MachNode* cur_node = n->as_Mach();
+          // this is cleaned up by register allocation
+          if (k >= cur_node->num_opnds()) continue;
+        }
+
         LRG &lrg = lrgs(vreg);
         // // Testing for floating point code shape
         // Node *test = n->in(k);
@@ -989,7 +1026,7 @@
         // double can interfere with TWO aligned pairs, or effectively
         // FOUR registers!
 #ifdef ASSERT
-        if (is_vect) {
+        if (is_vect && !_scheduling_info_generated) {
           assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
           assert(!lrg._fat_proj, "sanity");
           assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
--- old/src/share/vm/opto/chaitin.hpp	2015-09-03 15:31:42.250338200 -0700
+++ new/src/share/vm/opto/chaitin.hpp	2015-09-03 15:31:41.997338200 -0700
@@ -399,7 +399,6 @@
   int _trip_cnt;
   int _alternate;
 
-  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
   PhaseLive *_live;             // Liveness, used in the interference graph
   PhaseIFG *_ifg;               // Interference graph (for original chunk)
   Node_List **_lrg_nodes;       // Array of node; lists for lrgs which spill
@@ -464,16 +463,28 @@
 #endif
 
 public:
-  PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher );
+  PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure);
   ~PhaseChaitin() {}
 
   LiveRangeMap _lrg_map;
 
+  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
+
   // Do all the real work of allocate
   void Register_Allocate();
 
   float high_frequency_lrg() const { return _high_frequency_lrg; }
 
+  // Used when scheduling info generated, not in general register allocation
+  bool _scheduling_info_generated;
+
+  void set_ifg(PhaseIFG &ifg) { _ifg = &ifg;  }
+  void set_live(PhaseLive &live) { _live = &live; }
+  PhaseLive* get_live() { return _live; }
+
+  // Populate the live range maps with ssa info for scheduling
+  void mark_ssa();
+
 #ifndef PRODUCT
   bool trace_spilling() const { return _trace_spilling; }
 #endif
@@ -516,7 +527,11 @@
       uint _final_pressure;
 
       // number of live ranges that constitute high register pressure
-      const uint _high_pressure_limit;
+      uint _high_pressure_limit;
+
+      // initial pressure observed
+      uint _start_pressure;
+
     public:
 
       // lower the register pressure and look for a low to high pressure
@@ -537,6 +552,14 @@
         }
       }
 
+      void init(int limit) {
+        _current_pressure = 0;
+        _high_pressure_index = 0;
+        _final_pressure = 0;
+        _high_pressure_limit = limit;
+        _start_pressure = 0;
+      }
+
       uint high_pressure_index() const {
         return _high_pressure_index;
       }
@@ -545,6 +568,10 @@
         return _final_pressure;
       }
 
+      uint start_pressure() const {
+        return _start_pressure;
+      }
+
       uint current_pressure() const {
         return _current_pressure;
       }
@@ -561,6 +588,15 @@
         _high_pressure_index = 0;
       }
 
+      void set_start_pressure(int value) {
+        _start_pressure = value;
+        _final_pressure = value;
+      }
+
+      void set_current_pressure(int value) {
+        _current_pressure = value;
+      }
+
       void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) {
         // this pressure is only valid at this instruction, i.e. we don't need to lower
         // the register pressure since the fat proj was never live before (going backwards)
@@ -577,14 +613,13 @@
       }
 
       Pressure(uint high_pressure_index, uint high_pressure_limit)
-      : _current_pressure(0)
-      , _high_pressure_index(high_pressure_index)
-      , _high_pressure_limit(high_pressure_limit)
-      , _final_pressure(0) {}
+        : _current_pressure(0)
+        , _high_pressure_index(high_pressure_index)
+        , _final_pressure(0)
+        , _high_pressure_limit(high_pressure_limit)
+        , _start_pressure(0) {}
   };
 
-  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
-  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
   void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype);
   void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure);
   void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost);
@@ -600,10 +635,25 @@
   // acceptable register sets do not overlap, then they do not interfere.
   uint build_ifg_physical( ResourceArea *a );
 
+public:
   // Gather LiveRanGe information, including register masks and base pointer/
   // derived pointer relationships.
   void gather_lrg_masks( bool mod_cisc_masks );
 
+  // user visible pressure variables for scheduling
+  Pressure _sched_int_pressure;
+  Pressure _sched_float_pressure;
+  Pressure _scratch_int_pressure;
+  Pressure _scratch_float_pressure;
+
+  // Pressure functions for user context
+  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
+  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
+  void compute_entry_block_pressure(Block* b);
+  void compute_exit_block_pressure(Block* b);
+  void print_pressure_info(Pressure& pressure, const char *str);
+
+private:
   // Force the bases of derived pointers to be alive at GC points.
   bool stretch_base_pointer_live_ranges( ResourceArea *a );
   // Helper to stretch above; recursively discover the base Node for
--- old/src/share/vm/opto/compile.cpp	2015-09-03 15:31:43.825338200 -0700
+++ new/src/share/vm/opto/compile.cpp	2015-09-03 15:31:43.575338200 -0700
@@ -2336,7 +2336,7 @@
     debug_only( cfg.verify(); )
   }
 
-  PhaseChaitin regalloc(unique(), cfg, matcher);
+  PhaseChaitin regalloc(unique(), cfg, matcher, false);
   _regalloc = &regalloc;
   {
     TracePhase tp("regalloc", &timers[_t_registerAllocation]);
--- old/src/share/vm/opto/gcm.cpp	2015-09-03 15:31:45.486338200 -0700
+++ new/src/share/vm/opto/gcm.cpp	2015-09-03 15:31:45.211338200 -0700
@@ -34,6 +34,7 @@
 #include "opto/phaseX.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/deoptimization.hpp"
 
 // Portions of code courtesy of Clifford Click
@@ -1363,6 +1364,33 @@
     }
   }
 
+  PhaseChaitin regalloc(C->unique(), *this, _matcher, true);
+  ResourceArea live_arena;      // Arena for liveness
+  ResourceMark rm_live(&live_arena);
+  PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true);
+  PhaseIFG ifg(&live_arena);
+  intptr_t *recalc_pressure_nodes = NULL;
+
+  if (OptoRegScheduling) {
+    regalloc.mark_ssa();
+    Compile::TracePhase tp("computeLive", &timers[_t_computeLive]);
+    rm_live.reset_to_mark();           // Reclaim working storage
+    IndexSet::reset_memory(C, &live_arena);
+    uint node_size = regalloc._lrg_map.max_lrg_id();
+    ifg.init(node_size); // Empty IFG
+    regalloc.set_ifg(ifg);
+    regalloc.set_live(live);
+    regalloc.gather_lrg_masks(false);    // Collect LRG masks
+    live.compute(node_size); // Compute liveness
+
+    recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size);
+    for (uint i = 0; i < node_size; i++) {
+      recalc_pressure_nodes[i] = 0;
+    }
+  }
+
+  _regalloc = &regalloc;
+
 #ifndef PRODUCT
   if (trace_opto_pipelining()) {
     tty->print("\n---- Start Local Scheduling ----\n");
@@ -1375,13 +1403,15 @@
   visited.Clear();
   for (uint i = 0; i < number_of_blocks(); i++) {
     Block* block = get_block(i);
-    if (!schedule_local(block, ready_cnt, visited)) {
+    if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) {
       if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
         C->record_method_not_compilable("local schedule failed");
       }
+      _regalloc = NULL;
       return;
     }
   }
+  _regalloc = NULL;
 
   // If we inserted any instructions between a Call and his CatchNode,
   // clone the instructions on all paths below the Catch.
--- old/src/share/vm/opto/ifg.cpp	2015-09-03 15:31:46.936338200 -0700
+++ new/src/share/vm/opto/ifg.cpp	2015-09-03 15:31:46.692338200 -0700
@@ -439,8 +439,10 @@
       }
     }
   }
-  assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
-  assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
+  if (_scheduling_info_generated == false) {
+    assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
+    assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
+  }
 }
 
 /* Go to the first non-phi index in a block */
@@ -518,6 +520,58 @@
 }
 
 /*
+* Computes the entry register pressure of a block, looking at all live
+* ranges in the livein. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_entry_block_pressure(Block* b) {
+  IndexSet* livein = _live->livein(b);
+  IndexSetIterator elements(livein);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+  // Now check phis for locally defined inputs
+  for (uint j = 0; j < b->number_of_nodes(); j++) {
+    Node* n = b->get_node(j);
+    if (n->is_Phi()) {
+      for (uint k = 1; k < n->req(); k++) {
+        Node* phi_in = n->in(k);
+        // Because we are talking about phis, raise register pressure once for each
+        // instance of a phi to account for a single value
+        if (_cfg.get_block_for_node(phi_in) == b) {
+          LRG& lrg = lrgs(phi_in->_idx);
+          raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+          break;
+        }
+      }
+    }
+  }
+  _sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure());
+  _sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure());
+}
+
+/*
+* Computes the exit register pressure of a block, looking at all live
+* ranges in the liveout. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_exit_block_pressure(Block* b) {
+  IndexSet* livein = _live->live(b);
+  IndexSetIterator elements(livein);
+  _sched_int_pressure.set_current_pressure(0);
+  _sched_float_pressure.set_current_pressure(0);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+}
+
+/*
  * Remove dead node if it's not used.
  * We only remove projection nodes if the node "defining" the projection is
  * dead, for example on x86, if we have a dead Add node we remove its
@@ -737,6 +791,16 @@
   block_hrp_index = i;
 }
 
+void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) {
+  if (str != NULL) {
+    tty->print_cr("#  *** %s ***", str);
+  }
+  tty->print_cr("#     start pressure is = %d", pressure.start_pressure());
+  tty->print_cr("#     max pressure is = %d", pressure.final_pressure());
+  tty->print_cr("#     end pressure is = %d", pressure.current_pressure());
+  tty->print_cr("#");
+}
+
 /* Build an interference graph:
  *   That is, if 2 live ranges are simultaneously alive but in their acceptable
  *   register sets do not overlap, then they do not interfere. The IFG is built
--- old/src/share/vm/opto/lcm.cpp	2015-09-03 15:31:48.295338200 -0700
+++ new/src/share/vm/opto/lcm.cpp	2015-09-03 15:31:48.076338200 -0700
@@ -31,6 +31,7 @@
 #include "opto/cfgnode.hpp"
 #include "opto/machnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/sharedRuntime.hpp"
 
 // Optimization - Graph Style
@@ -443,7 +444,13 @@
 // remaining cases (most), choose the instruction with the greatest latency
 // (that is, the most number of pseudo-cycles required to the end of the
 // routine). If there is a tie, choose the instruction with the most inputs.
-Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) {
+Node* PhaseCFG::select(
+  Block* block,
+  Node_List &worklist,
+  GrowableArray<int> &ready_cnt,
+  VectorSet &next_call,
+  uint sched_slot,
+  intptr_t* recalc_pressure_nodes) {
 
   // If only a single entry on the stack, use it
   uint cnt = worklist.size();
@@ -537,7 +544,45 @@
     }
 
     uint n_latency = get_latency_for_node(n);
-    uint n_score   = n->req();   // Many inputs get high score to break ties
+    uint n_score = n->req();   // Many inputs get high score to break ties
+
+    if (OptoRegScheduling) {
+      if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) {
+        _regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit());
+        _regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit());
+        // simulate the notion that we just picked this node to schedule
+        n->add_flag(Node::Flag_is_scheduled);
+        // now caculate its effect upon the graph if we did
+        adjust_register_pressure(n, block, recalc_pressure_nodes, false);
+        // return its state for finalize in case somebody else wins
+        n->remove_flag(Node::Flag_is_scheduled);
+        // now save the two final pressure components of register pressure, limiting pressure calcs to short size
+        short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure();
+        short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure();
+        recalc_pressure_nodes[n->_idx] = int_pressure;
+        recalc_pressure_nodes[n->_idx] |= (float_pressure << 16);
+      }
+
+      if (_scheduling_for_pressure) {
+        latency = n_latency;
+        if (n_choice != 3) {
+          // Now evaluate each register pressure component based on threshold in the score.
+          // In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks
+          // on a single instruction, but we might see it shrink on both banks.
+          if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+            short int_pressure = (short)recalc_pressure_nodes[n->_idx];
+            n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score;
+          }
+          if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+            short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16);
+            n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score;
+          }
+        } else {
+          // make sure we choose these candidates
+          score = 0;
+        }
+      }
+    }
 
     // Keep best latency found
     cand_cnt++;
@@ -562,6 +607,100 @@
   return n;
 }
 
+//-------------------------adjust_register_pressure----------------------------
+void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) {
+  PhaseLive* liveinfo = _regalloc->get_live();
+  IndexSet* liveout = liveinfo->live(block);
+  // first adjust the register pressure for the sources
+  for (uint i = 1; i < n->req(); i++) {
+    bool lrg_ends = false;
+    Node *src_n = n->in(i);
+    if (src_n == NULL) continue;
+    if (!src_n->is_Mach()) continue;
+    uint src = _regalloc->_lrg_map.find(src_n);
+    if (src == 0) continue;
+    LRG& lrg_src = _regalloc->lrgs(src);
+    // detect if the live range ends or not
+    if (liveout->member(src) == false) {
+      lrg_ends = true;
+      for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) {
+        Node* m = src_n->fast_out(j); // Get user
+        if (m == n) continue;
+        if (!m->is_Mach()) continue;
+        MachNode *mach = m->as_Mach();
+        bool src_matches = false;
+        int iop = mach->ideal_Opcode();
+
+        switch (iop) {
+        case Op_StoreB:
+        case Op_StoreC:
+        case Op_StoreCM:
+        case Op_StoreD:
+        case Op_StoreF:
+        case Op_StoreI:
+        case Op_StoreL:
+        case Op_StoreP:
+        case Op_StoreN:
+        case Op_StoreVector:
+        case Op_StoreNKlass:
+          for (uint k = 1; k < m->req(); k++) {
+            Node *in = m->in(k);
+            if (in == src_n) {
+              src_matches = true;
+              break;
+            }
+          }
+          break;
+
+        default:
+          src_matches = true;
+          break;
+        }
+
+        // If we have a store as our use, ignore the non source operands
+        if (src_matches == false) continue;
+
+        // Mark every unscheduled use which is not n with a recalculation
+        if ((get_block_for_node(m) == block) && (!m->is_scheduled())) {
+          if (finalize_mode && !m->is_Phi()) {
+            recalc_pressure_nodes[m->_idx] = 0x7fff7fff;
+          }
+          lrg_ends = false;
+        }
+      }
+    }
+    // if none, this live range ends and we can adjust register pressure
+    if (lrg_ends) {
+      if (finalize_mode) {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      } else {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+      }
+    }
+  }
+
+  // now add the register pressure from the dest and evaluate which heuristic we should use:
+  // 1.) The default, latency scheduling
+  // 2.) Register pressure scheduling based on the high pressure limit threshold for int or float register stacks
+  uint dst = _regalloc->_lrg_map.find(n);
+  if (dst != 0) {
+    LRG& lrg_dst = _regalloc->lrgs(dst);
+    if (finalize_mode) {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      // check to see if we fall over the register pressure cliff here
+      if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else {
+        // restore latency scheduling mode
+        _scheduling_for_pressure = false;
+      }
+    } else {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+    }
+  }
+}
 
 //------------------------------set_next_call----------------------------------
 void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) {
@@ -644,7 +783,7 @@
         continue;
       }
       if( m->is_Phi() ) continue;
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
       ready_cnt.at_put(m->_idx, m_cnt);
       if( m_cnt == 0 )
         worklist.push(m);
@@ -711,7 +850,7 @@
 
 //------------------------------schedule_local---------------------------------
 // Topological sort within a block.  Someday become a real scheduler.
-bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) {
+bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) {
   // Already "sorted" are the block start Node (as the first entry), and
   // the block-ending Node and any trailing control projections.  We leave
   // these alone.  PhiNodes and ParmNodes are made to follow the block start
@@ -733,10 +872,22 @@
     return true;
   }
 
+  // We track the uses of local definitions as input dependences so that
+  // we know when a given instruction is avialable to be scheduled.
+  uint i;
+  if (OptoRegScheduling) {
+    for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc
+      Node *n = block->get_node(i);
+      n->remove_flag(Node::Flag_is_scheduled);
+      if (!n->is_Phi()) {
+        recalc_pressure_nodes[n->_idx] = 0x7fff7fff;
+      }
+    }
+  }
+
   // Move PhiNodes and ParmNodes from 1 to cnt up to the start
   uint node_cnt = block->end_idx();
   uint phi_cnt = 1;
-  uint i;
   for( i = 1; i<node_cnt; i++ ) { // Scan for Phi
     Node *n = block->get_node(i);
     if( n->is_Phi() ||          // Found a PhiNode or ParmNode
@@ -744,6 +895,10 @@
       // Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt
       block->map_node(block->get_node(phi_cnt), i);
       block->map_node(n, phi_cnt++);  // swap Phi/Parm up front
+      if (OptoRegScheduling) {
+        // mark n as scheduled
+        n->add_flag(Node::Flag_is_scheduled);
+      }
     } else {                    // All others
       // Count block-local inputs to 'n'
       uint cnt = n->len();      // Input count
@@ -791,12 +946,18 @@
 
   // All the prescheduled guys do not hold back internal nodes
   uint i3;
-  for(i3 = 0; i3<phi_cnt; i3++ ) {  // For all pre-scheduled
+  for (i3 = 0; i3 < phi_cnt; i3++) {  // For all pre-scheduled
     Node *n = block->get_node(i3);       // Get pre-scheduled
     for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) {
       Node* m = n->fast_out(j);
       if (get_block_for_node(m) == block) { // Local-block user
         int m_cnt = ready_cnt.at(m->_idx)-1;
+        if (OptoRegScheduling) {
+          // mark m as scheduled
+          if (m_cnt < 0) {
+            m->add_flag(Node::Flag_is_scheduled);
+          }
+        }
         ready_cnt.at_put(m->_idx, m_cnt);   // Fix ready count
       }
     }
@@ -827,6 +988,23 @@
     worklist.push(d);
   }
 
+  if (OptoRegScheduling) {
+    // To stage register pressure calculations we need to examine the live set variables
+    // breaking them up by register class to compartmentalize the calculations.
+    uint float_pressure = FLOATPRESSURE;
+#ifdef _LP64
+    if (UseAVX > 2) {
+      float_pressure *= 2;
+    }
+#endif
+    _regalloc->_sched_int_pressure.init(INTPRESSURE);
+    _regalloc->_sched_float_pressure.init(float_pressure);
+    _regalloc->_scratch_int_pressure.init(INTPRESSURE);
+    _regalloc->_scratch_float_pressure.init(float_pressure);
+
+    _regalloc->compute_entry_block_pressure(block);
+  }
+
   // Warm up the 'next_call' heuristic bits
   needed_for_next_call(block, block->head(), next_call);
 
@@ -858,9 +1036,18 @@
 #endif
 
     // Select and pop a ready guy from worklist
-    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt);
+    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes);
     block->map_node(n, phi_cnt++);    // Schedule him next
 
+    if (OptoRegScheduling) {
+      n->add_flag(Node::Flag_is_scheduled);
+
+      // Now adjust the resister pressure with the node we selected
+      if (!n->is_Phi()) {
+        adjust_register_pressure(n, block, recalc_pressure_nodes, true);
+      }
+    }
+
 #ifndef PRODUCT
     if (trace_opto_pipelining()) {
       tty->print("#    select %d: %s", n->_idx, n->Name());
@@ -906,7 +1093,7 @@
         assert(m->is_MachProj() && n->is_Mach() && n->as_Mach()->has_call(), "unexpected node types");
         continue;
       }
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
       ready_cnt.at_put(m->_idx, m_cnt);
       if( m_cnt == 0 )
         worklist.push(m);
@@ -925,6 +1112,12 @@
     return false;
   }
 
+  if (OptoRegScheduling) {
+    _regalloc->compute_exit_block_pressure(block);
+    block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure();
+    block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure();
+  }
+
 #ifndef PRODUCT
   if (trace_opto_pipelining()) {
     tty->print_cr("#");
@@ -933,11 +1126,17 @@
       tty->print("# ");
       block->get_node(i)->fast_dump();
     }
+    tty->print_cr("# ");
+
+    if (OptoRegScheduling) {
+      tty->print_cr("# pressure info : %d", block->_pre_order);
+      _regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info");
+      _regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info");
+    }
     tty->cr();
   }
 #endif
 
-
   return true;
 }
 
--- old/src/share/vm/opto/live.cpp	2015-09-03 15:31:49.698338200 -0700
+++ new/src/share/vm/opto/live.cpp	2015-09-03 15:31:49.478338200 -0700
@@ -41,7 +41,14 @@
 // block is put on the worklist.
 //   The locally live-in stuff is computed once and added to predecessor
 // live-out sets.  This separate compilation is done in the outer loop below.
-PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) {
+PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas)
+  : Phase(LIVE),
+  _cfg(cfg),
+  _names(names),
+  _arena(arena),
+  _live(0),
+  _livein(0),
+  _keep_deltas(keep_deltas) {
 }
 
 void PhaseLive::compute(uint maxlrg) {
@@ -56,6 +63,13 @@
     _live[i].initialize(_maxlrg);
   }
 
+  if (_keep_deltas) {
+    _livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks());
+    for (i = 0; i < _cfg.number_of_blocks(); i++) {
+      _livein[i].initialize(_maxlrg);
+    }
+  }
+
   // Init the sparse arrays for delta-sets.
   ResourceMark rm;              // Nuke temp storage on exit
 
@@ -124,7 +138,10 @@
 
       // PhiNode uses go in the live-out set of prior blocks.
       for (uint k = i; k > 0; k--) {
-        add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass);
+        Node *phi = block->get_node(k - 1);
+        if (l < phi->req()) {
+          add_liveout(p, _names.at(phi->in(l)->_idx), first_pass);
+        }
       }
     }
     freeset(block);
@@ -200,8 +217,11 @@
 }
 
 // Free an IndexSet from a block.
-void PhaseLive::freeset( const Block *p ) {
+void PhaseLive::freeset( Block *p ) {
   IndexSet *f = _deltas[p->_pre_order-1];
+  if ( _keep_deltas ) {
+    add_livein(p, f);
+  }
   f->set_next(_free_IndexSet);
   _free_IndexSet = f;           // Drop onto free list
   _deltas[p->_pre_order-1] = NULL;
@@ -249,10 +269,23 @@
   }
 }
 
+// Add a vector of live-in values to a given blocks live-in set.
+void PhaseLive::add_livein(Block *p, IndexSet *lo) {
+  IndexSet *livein = &_livein[p->_pre_order-1];
+  IndexSetIterator elements(lo);
+  uint r;
+  while ((r = elements.next()) != 0) {
+    livein->insert(r);         // Then add to live-in set
+  }
+}
+
 #ifndef PRODUCT
 // Dump the live-out set for a block
 void PhaseLive::dump( const Block *b ) const {
   tty->print("Block %d: ",b->_pre_order);
+  if ( _keep_deltas ) {
+    tty->print("LiveIn: ");  _livein[b->_pre_order-1].dump();
+  }
   tty->print("LiveOut: ");  _live[b->_pre_order-1].dump();
   uint cnt = b->number_of_nodes();
   for( uint i=0; i<cnt; i++ ) {
--- old/src/share/vm/opto/live.hpp	2015-09-03 15:31:51.040338200 -0700
+++ new/src/share/vm/opto/live.hpp	2015-09-03 15:31:50.819338200 -0700
@@ -46,7 +46,8 @@
 class PhaseLive : public Phase {
   // Array of Sets of values live at the start of a block.
   // Indexed by block pre-order number.
-  IndexSet *_live;
+  IndexSet *_live; // live out
+  IndexSet *_livein; // live in
 
   // Array of Sets of values defined locally in the block
   // Indexed by block pre-order number.
@@ -62,15 +63,17 @@
   const LRG_List &_names;       // Mapping from Nodes to live ranges
   uint _maxlrg;                 // Largest live-range number
   Arena *_arena;
+  bool _keep_deltas;            // Retain live in information
 
   IndexSet *getset( Block *p );
   IndexSet *getfreeset( );
-  void freeset( const Block *p );
+  void freeset( Block *p );
   void add_liveout( Block *p, uint r, VectorSet &first_pass );
   void add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass );
+  void add_livein( Block *p, IndexSet *lo );
 
 public:
-  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena);
+  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas);
   ~PhaseLive() {}
   // Compute liveness info
   void compute(uint maxlrg);
@@ -79,6 +82,7 @@
 
   // Return the live-out set for this block
   IndexSet *live( const Block * b ) { return &_live[b->_pre_order-1]; }
+  IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; }
 
 #ifndef PRODUCT
   void dump( const Block *b ) const;
--- old/src/share/vm/opto/node.hpp	2015-09-03 15:31:52.313338200 -0700
+++ new/src/share/vm/opto/node.hpp	2015-09-03 15:31:52.089338200 -0700
@@ -674,7 +674,8 @@
     Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
     Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
     Flag_is_reduction                = Flag_has_call << 1,
-    Flag_is_expensive                = Flag_is_reduction << 1,
+    Flag_is_scheduled                = Flag_is_reduction << 1,
+    Flag_is_expensive                = Flag_is_scheduled << 1,
     _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
   };
 
@@ -861,6 +862,9 @@
   // It must have the loop's phi as input and provide a def to the phi.
   bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
 
+  // Used in lcm to mark nodes that have scheduled
+  bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
+
 //----------------- Optimization
 
   // Get the worst-case Type output for this Node.