--- old/src/share/vm/opto/chaitin.cpp 2015-01-20 09:42:49.000000000 -0800
+++ new/src/share/vm/opto/chaitin.cpp 2015-01-20 09:42:49.000000000 -0800
@@ -575,6 +575,9 @@
   // Peephole remove copies
   post_allocate_copy_removal();
 
+  // Merge multidefs if multiple defs representing the same value are used in a single block.
+  merge_multidefs();
+
 #ifdef ASSERT
   // Veify the graph after RA.
   verify(&live_arena);
--- old/src/share/vm/opto/chaitin.hpp 2015-01-20 09:42:50.000000000 -0800
+++ new/src/share/vm/opto/chaitin.hpp 2015-01-20 09:42:50.000000000 -0800
@@ -578,6 +578,32 @@
   // Extend the node to LRG mapping
   void add_reference( const Node *node, const Node *old_node);
 
+  // Record the first use of a def in the block for a register.
+  class RegDefUse {
+    Node* _def;        // definition currently tracked for the register (NULL if none)
+    Node* _first_use;  // first node seen using _def
+  public:
+    RegDefUse() : _def(NULL), _first_use(NULL) { }
+    Node* def() const { return _def; }
+    Node* first_use() const { return _first_use; }
+
+    void update(Node* def, Node* use) {
+      if (_def != def) {  // keep the recorded first use while the same def is seen again
+        _def = def;
+        _first_use = use;
+      }
+    }
+    void clear() {
+      _def = NULL;
+      _first_use = NULL;
+    }
+  };
+  typedef GrowableArray<RegDefUse> RegToDefUseMap;  // indexed by register (OptoReg::Name)
+  int possibly_merge_multidef(Node *n, uint k, Block *block, RegToDefUseMap& reg2defuse);
+
+  // Merge nodes that are a part of a multidef lrg and produce the same value within a block.
+  void merge_multidefs();
+
 private:
 
   static int _final_loads, _final_stores, _final_copies, _final_memoves;
--- old/src/share/vm/opto/machnode.hpp 2015-01-20 09:42:50.000000000 -0800
+++ new/src/share/vm/opto/machnode.hpp 2015-01-20 09:42:50.000000000 -0800
@@ -558,6 +558,29 @@
 #endif
 };
 
+// MachMergeNode is similar to a PhiNode in a sense it merges multiple values,
+// however it doesn't have a control input and is more like a MergeMem.
+// It is inserted after the register allocation is done to ensure that nodes use single
+// definition of a multidef lrg in a block.
+class MachMergeNode : public MachIdealNode {
+public:
+  MachMergeNode(Node *n1) {
+    init_class_id(Class_MachMerge);
+    add_req(NULL);  // input 0 stays NULL: this node has no control input
+    add_req(n1);    // input 1: the first of the merged definitions
+  }
+  virtual const RegMask &out_RegMask() const { return in(1)->out_RegMask(); }  // delegated to input 1
+  virtual const RegMask &in_RegMask(uint idx) const { return in(1)->in_RegMask(idx); }  // delegated to input 1
+  virtual const class Type *bottom_type() const { return in(1)->bottom_type(); }  // delegated to input 1
+  virtual uint ideal_reg() const { return bottom_type()->ideal_reg(); }
+  virtual uint oper_input_base() const { return 1; }
+  virtual void emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { }  // emits nothing
+  virtual uint size(PhaseRegAlloc *ra_) const { return 0; }  // reports zero size, matching the empty emit()
+#ifndef PRODUCT
+  virtual const char *Name() const { return "MachMerge"; }
+#endif
+};
+
 //------------------------------MachBranchNode--------------------------------
 // Abstract machine branch Node
 class MachBranchNode : public MachIdealNode {
--- old/src/share/vm/opto/node.hpp 2015-01-20 09:42:50.000000000 -0800
+++ new/src/share/vm/opto/node.hpp 2015-01-20 09:42:50.000000000 -0800
@@ -98,6 +98,7 @@
 class MachSafePointNode;
 class MachSpillCopyNode;
 class MachTempNode;
+class MachMergeNode;
 class Matcher;
 class MemBarNode;
 class MemBarStoreStoreNode;
@@ -591,6 +592,7 @@
     DEFINE_CLASS_ID(MachTemp, Mach, 3)
     DEFINE_CLASS_ID(MachConstantBase, Mach, 4)
     DEFINE_CLASS_ID(MachConstant, Mach, 5)
+    DEFINE_CLASS_ID(MachMerge, Mach, 6)
 
   DEFINE_CLASS_ID(Type, Node, 2)
     DEFINE_CLASS_ID(Phi, Type, 0)
@@ -761,6 +763,7 @@
   DEFINE_CLASS_QUERY(MachSafePoint)
   DEFINE_CLASS_QUERY(MachSpillCopy)
   DEFINE_CLASS_QUERY(MachTemp)
+  DEFINE_CLASS_QUERY(MachMerge)
   DEFINE_CLASS_QUERY(Mem)
   DEFINE_CLASS_QUERY(MemBar)
   DEFINE_CLASS_QUERY(MemBarStoreStore)
--- old/src/share/vm/opto/phase.cpp 2015-01-20 09:42:50.000000000 -0800
+++ new/src/share/vm/opto/phase.cpp 2015-01-20 09:42:50.000000000 -0800
@@ -74,6 +74,7 @@
 elapsedTimer Phase::_t_computeLive;
 elapsedTimer Phase::_t_regAllocSplit;
 elapsedTimer Phase::_t_postAllocCopyRemoval;
+elapsedTimer Phase::_t_mergeMultidefs;  // passed to the TracePhase in PhaseChaitin::merge_multidefs()
 elapsedTimer Phase::_t_fixupSpills;
 
 // Subtimers for _t_output
@@ -136,11 +137,12 @@
       tty->print_cr (" computeLive : %3.3f sec", Phase::_t_computeLive.seconds());
       tty->print_cr (" regAllocSplit : %3.3f sec", Phase::_t_regAllocSplit.seconds());
       tty->print_cr (" postAllocCopyRemoval: %3.3f sec", Phase::_t_postAllocCopyRemoval.seconds());
+      tty->print_cr (" mergeMultidefs: %3.3f sec", Phase::_t_mergeMultidefs.seconds());
       tty->print_cr (" fixupSpills : %3.3f sec", Phase::_t_fixupSpills.seconds());
       double regalloc_subtotal = Phase::_t_ctorChaitin.seconds() +
         Phase::_t_buildIFGphysical.seconds() + Phase::_t_computeLive.seconds() +
         Phase::_t_regAllocSplit.seconds() + Phase::_t_fixupSpills.seconds() +
-        Phase::_t_postAllocCopyRemoval.seconds();
+        Phase::_t_postAllocCopyRemoval.seconds() + Phase::_t_mergeMultidefs.seconds();  // new sub-phase counts toward the subtotal
       double percent_of_regalloc = ((regalloc_subtotal == 0.0) ? 0.0 : (regalloc_subtotal / Phase::_t_registerAllocation.seconds() * 100.0));
       tty->print_cr (" subtotal : %3.3f sec, %3.2f %%", regalloc_subtotal, percent_of_regalloc);
     }
--- old/src/share/vm/opto/phase.hpp 2015-01-20 09:42:51.000000000 -0800
+++ new/src/share/vm/opto/phase.hpp 2015-01-20 09:42:51.000000000 -0800
@@ -109,6 +109,7 @@
   static elapsedTimer _t_computeLive;
   static elapsedTimer _t_regAllocSplit;
   static elapsedTimer _t_postAllocCopyRemoval;
+  static elapsedTimer _t_mergeMultidefs;
   static elapsedTimer _t_fixupSpills;
 
 // Subtimers for _t_output
--- old/src/share/vm/opto/postaloc.cpp 2015-01-20 09:42:51.000000000 -0800
+++ new/src/share/vm/opto/postaloc.cpp 2015-01-20 09:42:51.000000000 -0800
@@ -263,20 +263,6 @@
   // intermediate copies might be illegal, i.e., value is stored down to stack
   // then reloaded BUT survives in a register the whole way.
Node *val = skip_copies(n->in(k)); - - if (val == x && nk_idx != 0 && - regnd[nk_reg] != NULL && regnd[nk_reg] != x && - _lrg_map.live_range_id(x) == _lrg_map.live_range_id(regnd[nk_reg])) { - // When rematerialzing nodes and stretching lifetimes, the - // allocator will reuse the original def for multidef LRG instead - // of the current reaching def because it can't know it's safe to - // do so. After allocation completes if they are in the same LRG - // then it should use the current reaching def instead. - n->set_req(k, regnd[nk_reg]); - blk_adjust += yank_if_dead(val, current_block, &value, ®nd); - val = skip_copies(n->in(k)); - } - if (val == x) return blk_adjust; // No progress? int n_regs = RegMask::num_registers(val->ideal_reg()); @@ -382,6 +368,94 @@ return false; } +// The algorithms works as follows: +// We traverse the block top to bottom. possibly_merge_multidef() is invoked for every input edge k +// of the instruction n. We check to see if the input is a multidef lrg. If it is, we record the fact that we've +// seen a definition (coming as an input) and add that fact to the reg2defuse array. The array maps registers to their +// current reaching definitions (we track only multidefs though). With each definition we also associate the first +// instruction we saw use it. If we encounter the situation when we observe an def (an input) that is a part of the +// same lrg but is different from the previous seen def we merge the two with a MachMerge node and substitute +// all the uses that we've seen so far to use the merge. After that we keep replacing the new defs in the same lrg +// as they get encountered with the merge node and keep adding these defs to the merge inputs. +void PhaseChaitin::merge_multidefs() { + NOT_PRODUCT( Compile::TracePhase t3("mergeMultidefs", &_t_mergeMultidefs, TimeCompiler); ) + ResourceMark rm; + // Keep track of the defs seen in registers and collect their uses in the block. 
+ RegToDefUseMap reg2defuse(_max_reg, _max_reg, RegDefUse()); + for (uint i = 0; i < _cfg.number_of_blocks(); i++) { + Block* block = _cfg.get_block(i); + for (uint j = 1; j < block->number_of_nodes(); j++) { + Node* n = block->get_node(j); + if (n->is_Phi()) continue; + for (uint k = 1; k < n->req(); k++) { + j += possibly_merge_multidef(n, k, block, reg2defuse); + } + // Null out the value produced by the instruction itself, since we're only interested in defs + // implicitly defined by the uses. We are actually interested in tracking only redefinitions + // of the multidef lrgs in the same register. For that matter it's enough to track changes in + // the base register only and ignore other effects of multi-register lrgs and fat projections. + // It is also ok to ignore defs coming from singledefs. After an implicit overwrite by one of + // those our register is guaranteed to be used by another lrg and we won't attempt to merge it. + uint lrg = _lrg_map.live_range_id(n); + if (lrg > 0 && lrgs(lrg).is_multidef()) { + OptoReg::Name reg = lrgs(lrg).reg(); + reg2defuse.at(reg).clear(); + } + } + // Clear reg->def->use tracking for the next block + for (int j = 0; j < reg2defuse.length(); j++) { + reg2defuse.at(j).clear(); + } + } +} + +int PhaseChaitin::possibly_merge_multidef(Node *n, uint k, Block *block, RegToDefUseMap& reg2defuse) { + int blk_adjust = 0; + + uint lrg = _lrg_map.live_range_id(n->in(k)); + if (lrg > 0 && lrgs(lrg).is_multidef()) { + OptoReg::Name reg = lrgs(lrg).reg(); + + Node* def = reg2defuse.at(reg).def(); + if (def != NULL && lrg == _lrg_map.live_range_id(def) && def != n->in(k)) { + // Same lrg but different node, we have to merge. + MachMergeNode* merge; + if (def->is_MachMerge()) { // is it already a merge? + merge = def->as_MachMerge(); + } else { + merge = new (C) MachMergeNode(def); + + // Insert the merge node into the block before the first use. 
+ uint use_index = block->find_node(reg2defuse.at(reg).first_use()); + block->insert_node(merge, use_index++); + + // Let the allocator know about the new node, use the same lrg + _lrg_map.extend(merge->_idx, lrg); + blk_adjust++; + + // Fixup all the uses (there is at least one) that happened between the first + // use and before the current one. + for (; use_index < block->number_of_nodes(); use_index++) { + Node* use = block->get_node(use_index); + if (use == n) { + break; + } + use->replace_edge(def, merge); + } + } + if (merge->find_edge(n->in(k)) == -1) { + merge->add_req(n->in(k)); + } + n->set_req(k, merge); + } + + // update the uses + reg2defuse.at(reg).update(n->in(k), n); + } + + return blk_adjust; +} + //------------------------------post_allocate_copy_removal--------------------- // Post-Allocation peephole copy removal. We do this in 1 pass over the