--- old/src/cpu/aarch64/vm/c2_globals_aarch64.hpp 2015-09-03 15:31:30.604338200 -0700 +++ new/src/cpu/aarch64/vm/c2_globals_aarch64.hpp 2015-09-03 15:31:30.337338200 -0700 @@ -72,6 +72,7 @@ define_pd_global(bool, UseCISCSpill, true); define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoBundling, false); +define_pd_global(bool, OptoRegScheduling, false); define_pd_global(intx, ReservedCodeCacheSize, 48*M); define_pd_global(intx, NonProfiledCodeHeapSize, 21*M); --- old/src/cpu/ppc/vm/c2_globals_ppc.hpp 2015-09-03 15:31:32.067338200 -0700 +++ new/src/cpu/ppc/vm/c2_globals_ppc.hpp 2015-09-03 15:31:31.829338200 -0700 @@ -60,6 +60,7 @@ define_pd_global(bool, OptoPeephole, false); define_pd_global(bool, UseCISCSpill, false); define_pd_global(bool, OptoBundling, false); +define_pd_global(bool, OptoRegScheduling, false); // GL: // Detected a problem with unscaled compressed oops and // narrow_oop_use_complex_address() == false. --- old/src/cpu/sparc/vm/c2_globals_sparc.hpp 2015-09-03 15:31:33.496338200 -0700 +++ new/src/cpu/sparc/vm/c2_globals_sparc.hpp 2015-09-03 15:31:33.250338200 -0700 @@ -64,6 +64,7 @@ define_pd_global(bool, UseCISCSpill, false); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoScheduling, true); +define_pd_global(bool, OptoRegScheduling, false); #ifdef _LP64 // We need to make sure that all generated code is within --- old/src/cpu/x86/vm/c2_globals_x86.hpp 2015-09-03 15:31:34.909338200 -0700 +++ new/src/cpu/x86/vm/c2_globals_x86.hpp 2015-09-03 15:31:34.662338200 -0700 @@ -48,11 +48,11 @@ define_pd_global(intx, OnStackReplacePercentage, 140); define_pd_global(intx, ConditionalMoveLimit, 3); -define_pd_global(intx, FLOATPRESSURE, 6); define_pd_global(intx, FreqInlineSize, 325); define_pd_global(intx, MinJumpTableSize, 10); #ifdef AMD64 define_pd_global(intx, INTPRESSURE, 13); +define_pd_global(intx, FLOATPRESSURE, 14); define_pd_global(intx, InteriorEntryAlignment, 16); define_pd_global(size_t, 
NewSizeThreadIncrease, ScaleForWordSize(4*K)); define_pd_global(intx, LoopUnrollLimit, 60); @@ -64,6 +64,7 @@ define_pd_global(uint64_t, MaxRAM, 128ULL*G); #else define_pd_global(intx, INTPRESSURE, 6); +define_pd_global(intx, FLOATPRESSURE, 6); define_pd_global(intx, InteriorEntryAlignment, 4); define_pd_global(size_t, NewSizeThreadIncrease, 4*K); define_pd_global(intx, LoopUnrollLimit, 50); // Design center runs on 1.3.1 @@ -82,6 +83,7 @@ define_pd_global(bool, UseCISCSpill, true); define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoBundling, false); +define_pd_global(bool, OptoRegScheduling, true); define_pd_global(intx, ReservedCodeCacheSize, 48*M); define_pd_global(intx, NonProfiledCodeHeapSize, 21*M); --- old/src/share/vm/opto/block.cpp 2015-09-03 15:31:36.374338200 -0700 +++ new/src/share/vm/opto/block.cpp 2015-09-03 15:31:36.157338200 -0700 @@ -358,6 +358,8 @@ PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher) : Phase(CFG) , _block_arena(arena) +, _regalloc(NULL) +, _scheduling_for_pressure(false) , _root(root) , _matcher(matcher) , _node_to_block_mapping(arena) --- old/src/share/vm/opto/block.hpp 2015-09-03 15:31:37.720338200 -0700 +++ new/src/share/vm/opto/block.hpp 2015-09-03 15:31:37.488338200 -0700 @@ -37,6 +37,7 @@ class Matcher; class RootNode; class VectorSet; +class PhaseChaitin; struct Tarjan; //------------------------------Block_Array------------------------------------ @@ -383,6 +384,12 @@ // Arena for the blocks to be stored in Arena* _block_arena; + // Info used for scheduling + PhaseChaitin* _regalloc; + + // Register pressure heuristic used? + bool _scheduling_for_pressure; + // The matcher for this compilation Matcher& _matcher; @@ -433,12 +440,14 @@ // to late. Helper for schedule_late. 
Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self); - bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call); + bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recalc_pressure_nodes); void set_next_call(Block* block, Node* n, VectorSet& next_call); void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call); // Perform basic-block local scheduling - Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot); + Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot, + intptr_t* recalc_pressure_nodes); + void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode); // Schedule a call next in the block uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call); --- old/src/share/vm/opto/c2_globals.hpp 2015-09-03 15:31:39.406338200 -0700 +++ new/src/share/vm/opto/c2_globals.hpp 2015-09-03 15:31:39.195338200 -0700 @@ -306,6 +306,9 @@ product_pd(bool, OptoScheduling, \ "Instruction Scheduling after register allocation") \ \ + product_pd(bool, OptoRegScheduling, \ + "Instruction Scheduling before register allocation for pressure") \ + \ product(bool, PartialPeelLoop, true, \ "Partial peel (rotate) loops") \ \ --- old/src/share/vm/opto/chaitin.cpp 2015-09-03 15:31:40.702338200 -0700 +++ new/src/share/vm/opto/chaitin.cpp 2015-09-03 15:31:40.496338200 -0700 @@ -191,7 +191,7 @@ return next; } -PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher) +PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated) : PhaseRegAlloc(unique, cfg, matcher, #ifndef PRODUCT print_chaitin_statistics @@ -205,6 +205,11 @@ , _spilled_twice(Thread::current()->resource_area()) , _lo_degree(0), 
_lo_stk_degree(0), _hi_degree(0), _simplified(0) , _oldphi(unique) + , _scheduling_info_generated(scheduling_info_generated) + , _sched_int_pressure(0, INTPRESSURE) + , _sched_float_pressure(0, FLOATPRESSURE) + , _scratch_int_pressure(0, INTPRESSURE) + , _scratch_float_pressure(0, FLOATPRESSURE) #ifndef PRODUCT , _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling")) #endif @@ -350,7 +355,7 @@ // all copy-related live ranges low and then using the max copy-related // live range as a cut-off for LIVE and the IFG. In other words, I can // build a subset of LIVE and IFG just for copies. - PhaseLive live(_cfg, _lrg_map.names(), &live_arena); + PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false); // Need IFG for coalescing and coloring PhaseIFG ifg(&live_arena); @@ -690,6 +695,29 @@ _lrg_map.reset_uf_map(lr_counter); } +void PhaseChaitin::mark_ssa() { + // Use ssa names to populate the live range maps or if no mask + // is available, use the 0 entry. + uint max_idx = 0; + for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) { + Block* block = _cfg.get_block(i); + uint cnt = block->number_of_nodes(); + + // Handle all the normal Nodes in the block + for ( uint j = 0; j < cnt; j++ ) { + Node *n = block->get_node(j); + // Pre-color to the zero live range, or pick virtual register + const RegMask &rm = n->out_RegMask(); + _lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0); + max_idx = (n->_idx > max_idx) ? n->_idx : max_idx; + } + } + _lrg_map.set_max_lrg_id(max_idx+1); + + // Reset the Union-Find mapping to be identity + _lrg_map.reset_uf_map(max_idx+1); +} + // Gather LiveRanGe information, including register masks. Modification of // cisc spillable in_RegMasks should not be done before AggressiveCoalesce. 
@@ -707,7 +735,9 @@ for (uint j = 1; j < block->number_of_nodes(); j++) { Node* n = block->get_node(j); uint input_edge_start =1; // Skip control most nodes + bool is_machine_node = false; if (n->is_Mach()) { + is_machine_node = true; input_edge_start = n->as_Mach()->oper_input_base(); } uint idx = n->is_Copy(); @@ -929,6 +959,7 @@ // Convert operand number to edge index number inp = n->as_Mach()->operand_index(inp); } + // Prepare register mask for each input for( uint k = input_edge_start; k < cnt; k++ ) { uint vreg = _lrg_map.live_range_id(n->in(k)); @@ -948,6 +979,12 @@ n->as_Mach()->use_cisc_RegMask(); } + if (is_machine_node && _scheduling_info_generated) { + MachNode* cur_node = n->as_Mach(); + // this is cleaned up by register allocation + if (k >= cur_node->num_opnds()) continue; + } + LRG &lrg = lrgs(vreg); // // Testing for floating point code shape // Node *test = n->in(k); @@ -989,7 +1026,7 @@ // double can interfere with TWO aligned pairs, or effectively // FOUR registers! 
#ifdef ASSERT - if (is_vect) { + if (is_vect && !_scheduling_info_generated) { assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned"); assert(!lrg._fat_proj, "sanity"); assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity"); --- old/src/share/vm/opto/chaitin.hpp 2015-09-03 15:31:42.250338200 -0700 +++ new/src/share/vm/opto/chaitin.hpp 2015-09-03 15:31:41.997338200 -0700 @@ -399,7 +399,6 @@ int _trip_cnt; int _alternate; - LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); } PhaseLive *_live; // Liveness, used in the interference graph PhaseIFG *_ifg; // Interference graph (for original chunk) Node_List **_lrg_nodes; // Array of node; lists for lrgs which spill @@ -464,16 +463,28 @@ #endif public: - PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher ); + PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure); ~PhaseChaitin() {} LiveRangeMap _lrg_map; + LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); } + // Do all the real work of allocate void Register_Allocate(); float high_frequency_lrg() const { return _high_frequency_lrg; } + // Used when scheduling info generated, not in general register allocation + bool _scheduling_info_generated; + + void set_ifg(PhaseIFG &ifg) { _ifg = &ifg; } + void set_live(PhaseLive &live) { _live = &live; } + PhaseLive* get_live() { return _live; } + + // Populate the live range maps with ssa info for scheduling + void mark_ssa(); + #ifndef PRODUCT bool trace_spilling() const { return _trace_spilling; } #endif @@ -516,7 +527,11 @@ uint _final_pressure; // number of live ranges that constitute high register pressure - const uint _high_pressure_limit; + uint _high_pressure_limit; + + // initial pressure observed + uint _start_pressure; + public: // lower the register pressure and look for a low to high pressure @@ -537,6 +552,14 @@ } } + void init(int limit) { + _current_pressure = 0; + _high_pressure_index = 0; + _final_pressure = 0; + _high_pressure_limit = 
limit; + _start_pressure = 0; + } + uint high_pressure_index() const { return _high_pressure_index; } @@ -545,6 +568,10 @@ return _final_pressure; } + uint start_pressure() const { + return _start_pressure; + } + uint current_pressure() const { return _current_pressure; } @@ -561,6 +588,15 @@ _high_pressure_index = 0; } + void set_start_pressure(int value) { + _start_pressure = value; + _final_pressure = value; + } + + void set_current_pressure(int value) { + _current_pressure = value; + } + void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) { // this pressure is only valid at this instruction, i.e. we don't need to lower // the register pressure since the fat proj was never live before (going backwards) @@ -577,14 +613,13 @@ } Pressure(uint high_pressure_index, uint high_pressure_limit) - : _current_pressure(0) - , _high_pressure_index(high_pressure_index) - , _high_pressure_limit(high_pressure_limit) - , _final_pressure(0) {} + : _current_pressure(0) + , _high_pressure_index(high_pressure_index) + , _final_pressure(0) + , _high_pressure_limit(high_pressure_limit) + , _start_pressure(0) {} }; - void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure); - void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure); void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype); void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure); void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost); @@ -600,10 +635,25 @@ // acceptable register sets do not overlap, then they do not interfere. 
uint build_ifg_physical( ResourceArea *a ); +public: // Gather LiveRanGe information, including register masks and base pointer/ // derived pointer relationships. void gather_lrg_masks( bool mod_cisc_masks ); + // user visible pressure variables for scheduling + Pressure _sched_int_pressure; + Pressure _sched_float_pressure; + Pressure _scratch_int_pressure; + Pressure _scratch_float_pressure; + + // Pressure functions for user context + void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure); + void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure); + void compute_entry_block_pressure(Block* b); + void compute_exit_block_pressure(Block* b); + void print_pressure_info(Pressure& pressure, const char *str); + +private: // Force the bases of derived pointers to be alive at GC points. bool stretch_base_pointer_live_ranges( ResourceArea *a ); // Helper to stretch above; recursively discover the base Node for --- old/src/share/vm/opto/compile.cpp 2015-09-03 15:31:43.825338200 -0700 +++ new/src/share/vm/opto/compile.cpp 2015-09-03 15:31:43.575338200 -0700 @@ -2336,7 +2336,7 @@ debug_only( cfg.verify(); ) } - PhaseChaitin regalloc(unique(), cfg, matcher); + PhaseChaitin regalloc(unique(), cfg, matcher, false); _regalloc = &regalloc; { TracePhase tp("regalloc", &timers[_t_registerAllocation]); --- old/src/share/vm/opto/gcm.cpp 2015-09-03 15:31:45.486338200 -0700 +++ new/src/share/vm/opto/gcm.cpp 2015-09-03 15:31:45.211338200 -0700 @@ -34,6 +34,7 @@ #include "opto/phaseX.hpp" #include "opto/rootnode.hpp" #include "opto/runtime.hpp" +#include "opto/chaitin.hpp" #include "runtime/deoptimization.hpp" // Portions of code courtesy of Clifford Click @@ -1363,6 +1364,33 @@ } } + PhaseChaitin regalloc(C->unique(), *this, _matcher, true); + ResourceArea live_arena; // Arena for liveness + ResourceMark rm_live(&live_arena); + PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, 
true); + PhaseIFG ifg(&live_arena); + intptr_t *recalc_pressure_nodes = NULL; + + if (OptoRegScheduling) { + regalloc.mark_ssa(); + Compile::TracePhase tp("computeLive", &timers[_t_computeLive]); + rm_live.reset_to_mark(); // Reclaim working storage + IndexSet::reset_memory(C, &live_arena); + uint node_size = regalloc._lrg_map.max_lrg_id(); + ifg.init(node_size); // Empty IFG + regalloc.set_ifg(ifg); + regalloc.set_live(live); + regalloc.gather_lrg_masks(false); // Collect LRG masks + live.compute(node_size); // Compute liveness + + recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size); + for (uint i = 0; i < node_size; i++) { + recalc_pressure_nodes[i] = 0; + } + } + + _regalloc = &regalloc; + #ifndef PRODUCT if (trace_opto_pipelining()) { tty->print("\n---- Start Local Scheduling ----\n"); @@ -1375,13 +1403,15 @@ visited.Clear(); for (uint i = 0; i < number_of_blocks(); i++) { Block* block = get_block(i); - if (!schedule_local(block, ready_cnt, visited)) { + if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) { if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) { C->record_method_not_compilable("local schedule failed"); } + _regalloc = NULL; return; } } + _regalloc = NULL; // If we inserted any instructions between a Call and his CatchNode, // clone the instructions on all paths below the Catch. 
--- old/src/share/vm/opto/ifg.cpp 2015-09-03 15:31:46.936338200 -0700 +++ new/src/share/vm/opto/ifg.cpp 2015-09-03 15:31:46.692338200 -0700 @@ -439,8 +439,10 @@ } } } - assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect"); - assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect"); + if (_scheduling_info_generated == false) { + assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect"); + assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect"); + } } /* Go to the first non-phi index in a block */ @@ -518,6 +520,58 @@ } /* +* Computes the entry register pressure of a block, looking at all live +* ranges in the livein. The register pressure is computed for both float +* and int/pointer registers. +*/ +void PhaseChaitin::compute_entry_block_pressure(Block* b) { + IndexSet* livein = _live->livein(b); + IndexSetIterator elements(livein); + uint lid = elements.next(); + while (lid != 0) { + LRG& lrg = lrgs(lid); + raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure); + lid = elements.next(); + } + // Now check phis for locally defined inputs + for (uint j = 0; j < b->number_of_nodes(); j++) { + Node* n = b->get_node(j); + if (n->is_Phi()) { + for (uint k = 1; k < n->req(); k++) { + Node* phi_in = n->in(k); + // Because we are talking about phis, raise register pressure once for each + // instance of a phi to account for a single value + if (_cfg.get_block_for_node(phi_in) == b) { + LRG& lrg = lrgs(phi_in->_idx); + raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure); + break; + } + } + } + } + _sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure()); + _sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure()); +} + +/* +* Computes the exit register pressure of a block, looking at all live +* ranges 
in the liveout. The register pressure is computed for both float +* and int/pointer registers. +*/ +void PhaseChaitin::compute_exit_block_pressure(Block* b) { + IndexSet* livein = _live->live(b); + IndexSetIterator elements(livein); + _sched_int_pressure.set_current_pressure(0); + _sched_float_pressure.set_current_pressure(0); + uint lid = elements.next(); + while (lid != 0) { + LRG& lrg = lrgs(lid); + raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure); + lid = elements.next(); + } +} + +/* * Remove dead node if it's not used. * We only remove projection nodes if the node "defining" the projection is * dead, for example on x86, if we have a dead Add node we remove its @@ -737,6 +791,16 @@ block_hrp_index = i; } +void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) { + if (str != NULL) { + tty->print_cr("# *** %s ***", str); + } + tty->print_cr("# start pressure is = %d", pressure.start_pressure()); + tty->print_cr("# max pressure is = %d", pressure.final_pressure()); + tty->print_cr("# end pressure is = %d", pressure.current_pressure()); + tty->print_cr("#"); +} + /* Build an interference graph: * That is, if 2 live ranges are simultaneously alive but in their acceptable * register sets do not overlap, then they do not interfere. The IFG is built --- old/src/share/vm/opto/lcm.cpp 2015-09-03 15:31:48.295338200 -0700 +++ new/src/share/vm/opto/lcm.cpp 2015-09-03 15:31:48.076338200 -0700 @@ -31,6 +31,7 @@ #include "opto/cfgnode.hpp" #include "opto/machnode.hpp" #include "opto/runtime.hpp" +#include "opto/chaitin.hpp" #include "runtime/sharedRuntime.hpp" // Optimization - Graph Style @@ -443,7 +444,13 @@ // remaining cases (most), choose the instruction with the greatest latency // (that is, the most number of pseudo-cycles required to the end of the // routine). If there is a tie, choose the instruction with the most inputs. 
-Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) { +Node* PhaseCFG::select( + Block* block, + Node_List &worklist, + GrowableArray<int> &ready_cnt, + VectorSet &next_call, + uint sched_slot, + intptr_t* recalc_pressure_nodes) { // If only a single entry on the stack, use it uint cnt = worklist.size(); @@ -537,7 +544,45 @@ } uint n_latency = get_latency_for_node(n); - uint n_score = n->req(); // Many inputs get high score to break ties + uint n_score = n->req(); // Many inputs get high score to break ties + + if (OptoRegScheduling) { + if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) { + _regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit()); + _regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit()); + // simulate the notion that we just picked this node to schedule + n->add_flag(Node::Flag_is_scheduled); + // now calculate its effect upon the graph if we did + adjust_register_pressure(n, block, recalc_pressure_nodes, false); + // return its state for finalize in case somebody else wins + n->remove_flag(Node::Flag_is_scheduled); + // now save the two final pressure components of register pressure, limiting pressure calcs to short size; mask each half so a negative int value cannot sign-extend into the float half + short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure(); + short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure(); + recalc_pressure_nodes[n->_idx] = (intptr_t)(int_pressure & 0xffff); + recalc_pressure_nodes[n->_idx] |= (intptr_t)((uintptr_t)(float_pressure & 0xffff) << 16); + } + + if (_scheduling_for_pressure) { + latency = n_latency; + if (n_choice != 3) { + // Now evaluate each register pressure component based on threshold in the score. + // In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks + // on a single instruction, but we might see it shrink on both banks. 
+ if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) { + short int_pressure = (short)recalc_pressure_nodes[n->_idx]; + n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score; + } + if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) { + short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16); + n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score; + } + } else { + // make sure we choose these candidates + score = 0; + } + } + } // Keep best latency found cand_cnt++; @@ -562,6 +607,100 @@ return n; } +//-------------------------adjust_register_pressure---------------------------- +void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) { + PhaseLive* liveinfo = _regalloc->get_live(); + IndexSet* liveout = liveinfo->live(block); + // first adjust the register pressure for the sources + for (uint i = 1; i < n->req(); i++) { + bool lrg_ends = false; + Node *src_n = n->in(i); + if (src_n == NULL) continue; + if (!src_n->is_Mach()) continue; + uint src = _regalloc->_lrg_map.find(src_n); + if (src == 0) continue; + LRG& lrg_src = _regalloc->lrgs(src); + // detect if the live range ends or not + if (liveout->member(src) == false) { + lrg_ends = true; + for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) { + Node* m = src_n->fast_out(j); // Get user + if (m == n) continue; + if (!m->is_Mach()) continue; + MachNode *mach = m->as_Mach(); + bool src_matches = false; + int iop = mach->ideal_Opcode(); + + switch (iop) { + case Op_StoreB: + case Op_StoreC: + case Op_StoreCM: + case Op_StoreD: + case Op_StoreF: + case Op_StoreI: + case Op_StoreL: + case Op_StoreP: + case Op_StoreN: + case Op_StoreVector: + case Op_StoreNKlass: + for (uint k = 1; k < m->req(); k++) { + Node 
*in = m->in(k); + if (in == src_n) { + src_matches = true; + break; + } + } + break; + + default: + src_matches = true; + break; + } + + // If we have a store as our use, ignore the non source operands + if (src_matches == false) continue; + + // Mark every unscheduled use which is not n with a recalculation + if ((get_block_for_node(m) == block) && (!m->is_scheduled())) { + if (finalize_mode && !m->is_Phi()) { + recalc_pressure_nodes[m->_idx] = 0x7fff7fff; + } + lrg_ends = false; + } + } + } + // if none, this live range ends and we can adjust register pressure + if (lrg_ends) { + if (finalize_mode) { + _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure); + } else { + _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure); + } + } + } + + // now add the register pressure from the dest and evaluate which heuristic we should use: + // 1.) The default, latency scheduling + // 2.) 
Register pressure scheduling based on the high pressure limit threshold for int or float register stacks + uint dst = _regalloc->_lrg_map.find(n); + if (dst != 0) { + LRG& lrg_dst = _regalloc->lrgs(dst); + if (finalize_mode) { + _regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure); + // check to see if we fall over the register pressure cliff here + if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) { + _scheduling_for_pressure = true; + } else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) { + _scheduling_for_pressure = true; + } else { + // restore latency scheduling mode + _scheduling_for_pressure = false; + } + } else { + _regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure); + } + } +} //------------------------------set_next_call---------------------------------- void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) { @@ -644,7 +783,7 @@ continue; } if( m->is_Phi() ) continue; - int m_cnt = ready_cnt.at(m->_idx)-1; + int m_cnt = ready_cnt.at(m->_idx) - 1; ready_cnt.at_put(m->_idx, m_cnt); if( m_cnt == 0 ) worklist.push(m); @@ -711,7 +850,7 @@ //------------------------------schedule_local--------------------------------- // Topological sort within a block. Someday become a real scheduler. -bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) { +bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) { // Already "sorted" are the block start Node (as the first entry), and // the block-ending Node and any trailing control projections. We leave // these alone. 
PhiNodes and ParmNodes are made to follow the block start @@ -733,10 +872,22 @@ return true; } + // We track the uses of local definitions as input dependences so that + // we know when a given instruction is available to be scheduled. + uint i; + if (OptoRegScheduling) { + for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc + Node *n = block->get_node(i); + n->remove_flag(Node::Flag_is_scheduled); + if (!n->is_Phi()) { + recalc_pressure_nodes[n->_idx] = 0x7fff7fff; + } + } + } + // Move PhiNodes and ParmNodes from 1 to cnt up to the start uint node_cnt = block->end_idx(); uint phi_cnt = 1; - uint i; for( i = 1; i<node_cnt; i++ ) { // Scan for Phi Node *n = block->get_node(i); if( n->is_Phi() || // Found a PhiNode or ParmNode @@ -744,6 +895,10 @@ // Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt block->map_node(block->get_node(phi_cnt), i); block->map_node(n, phi_cnt++); // swap Phi/Parm up front + if (OptoRegScheduling) { + // mark n as scheduled + n->add_flag(Node::Flag_is_scheduled); + } } else { // All others // Count block-local inputs to 'n' uint cnt = n->len(); // Input count @@ -791,12 +946,18 @@ // All the prescheduled guys do not hold back internal nodes uint i3; - for(i3 = 0; i3<phi_cnt; i3++ ) { // For all pre-scheduled + for (i3 = 0; i3 < phi_cnt; i3++) { // For all pre-scheduled Node *n = block->get_node(i3); // Get pre-scheduled for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) { Node* m = n->fast_out(j); if (get_block_for_node(m) == block) { // Local-block user int m_cnt = ready_cnt.at(m->_idx)-1; + if (OptoRegScheduling) { + // mark m as scheduled + if (m_cnt < 0) { + m->add_flag(Node::Flag_is_scheduled); + } + } ready_cnt.at_put(m->_idx, m_cnt); // Fix ready count } } @@ -827,6 +988,23 @@ worklist.push(d); } + if (OptoRegScheduling) { + // To stage register pressure calculations we need to examine the live set variables + // breaking them up by register class to compartmentalize the calculations. 
+ uint float_pressure = FLOATPRESSURE; +#ifdef _LP64 + if (UseAVX > 2) { + float_pressure *= 2; + } +#endif + _regalloc->_sched_int_pressure.init(INTPRESSURE); + _regalloc->_sched_float_pressure.init(float_pressure); + _regalloc->_scratch_int_pressure.init(INTPRESSURE); + _regalloc->_scratch_float_pressure.init(float_pressure); + + _regalloc->compute_entry_block_pressure(block); + } + // Warm up the 'next_call' heuristic bits needed_for_next_call(block, block->head(), next_call); @@ -858,9 +1036,18 @@ #endif // Select and pop a ready guy from worklist - Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt); + Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes); block->map_node(n, phi_cnt++); // Schedule him next + if (OptoRegScheduling) { + n->add_flag(Node::Flag_is_scheduled); + + // Now adjust the register pressure with the node we selected + if (!n->is_Phi()) { + adjust_register_pressure(n, block, recalc_pressure_nodes, true); + } + } + #ifndef PRODUCT if (trace_opto_pipelining()) { tty->print("# select %d: %s", n->_idx, n->Name()); @@ -906,7 +1093,7 @@ assert(m->is_MachProj() && n->is_Mach() && n->as_Mach()->has_call(), "unexpected node types"); continue; } - int m_cnt = ready_cnt.at(m->_idx)-1; + int m_cnt = ready_cnt.at(m->_idx) - 1; ready_cnt.at_put(m->_idx, m_cnt); if( m_cnt == 0 ) worklist.push(m); @@ -925,6 +1112,12 @@ return false; } + if (OptoRegScheduling) { + _regalloc->compute_exit_block_pressure(block); + block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure(); + block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure(); + } + #ifndef PRODUCT if (trace_opto_pipelining()) { tty->print_cr("#"); @@ -933,11 +1126,17 @@ tty->print("# "); block->get_node(i)->fast_dump(); } + tty->print_cr("# "); + + if (OptoRegScheduling) { + tty->print_cr("# pressure info : %d", block->_pre_order); + _regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info"); + 
_regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info"); + } tty->cr(); } #endif - return true; } --- old/src/share/vm/opto/live.cpp 2015-09-03 15:31:49.698338200 -0700 +++ new/src/share/vm/opto/live.cpp 2015-09-03 15:31:49.478338200 -0700 @@ -41,7 +41,14 @@ // block is put on the worklist. // The locally live-in stuff is computed once and added to predecessor // live-out sets. This separate compilation is done in the outer loop below. -PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) { +PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas) + : Phase(LIVE), + _cfg(cfg), + _names(names), + _arena(arena), + _live(0), + _livein(0), + _keep_deltas(keep_deltas) { } void PhaseLive::compute(uint maxlrg) { @@ -56,6 +63,13 @@ _live[i].initialize(_maxlrg); } + if (_keep_deltas) { + _livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks()); + for (i = 0; i < _cfg.number_of_blocks(); i++) { + _livein[i].initialize(_maxlrg); + } + } + // Init the sparse arrays for delta-sets. ResourceMark rm; // Nuke temp storage on exit @@ -124,7 +138,10 @@ // PhiNode uses go in the live-out set of prior blocks. for (uint k = i; k > 0; k--) { - add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass); + Node *phi = block->get_node(k - 1); + if (l < phi->req()) { + add_liveout(p, _names.at(phi->in(l)->_idx), first_pass); + } } } freeset(block); @@ -200,8 +217,11 @@ } // Free an IndexSet from a block. -void PhaseLive::freeset( const Block *p ) { +void PhaseLive::freeset( Block *p ) { IndexSet *f = _deltas[p->_pre_order-1]; + if ( _keep_deltas ) { + add_livein(p, f); + } f->set_next(_free_IndexSet); _free_IndexSet = f; // Drop onto free list _deltas[p->_pre_order-1] = NULL; @@ -249,10 +269,23 @@ } } +// Add a vector of live-in values to a given blocks live-in set. 
+void PhaseLive::add_livein(Block *p, IndexSet *lo) { + IndexSet *livein = &_livein[p->_pre_order-1]; + IndexSetIterator elements(lo); + uint r; + while ((r = elements.next()) != 0) { + livein->insert(r); // Then add to live-in set + } +} + #ifndef PRODUCT // Dump the live-out set for a block void PhaseLive::dump( const Block *b ) const { tty->print("Block %d: ",b->_pre_order); + if ( _keep_deltas ) { + tty->print("LiveIn: "); _livein[b->_pre_order-1].dump(); + } tty->print("LiveOut: "); _live[b->_pre_order-1].dump(); uint cnt = b->number_of_nodes(); for( uint i=0; i_pre_order-1]; } + IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; } #ifndef PRODUCT void dump( const Block *b ) const; --- old/src/share/vm/opto/node.hpp 2015-09-03 15:31:52.313338200 -0700 +++ new/src/share/vm/opto/node.hpp 2015-09-03 15:31:52.089338200 -0700 @@ -674,7 +674,8 @@ Flag_avoid_back_to_back_after = Flag_avoid_back_to_back_before << 1, Flag_has_call = Flag_avoid_back_to_back_after << 1, Flag_is_reduction = Flag_has_call << 1, - Flag_is_expensive = Flag_is_reduction << 1, + Flag_is_scheduled = Flag_is_reduction << 1, + Flag_is_expensive = Flag_is_scheduled << 1, _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination }; @@ -861,6 +862,9 @@ // It must have the loop's phi as input and provide a def to the phi. bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; } + // Used in lcm to mark nodes that have scheduled + bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; } + //----------------- Optimization // Get the worst-case Type output for this Node.