--- old/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp 2018-05-16 09:38:55.702496620 +0200 +++ new/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp 2018-05-16 09:38:49.650514185 +0200 @@ -1343,12 +1343,11 @@ __ mov_metadata(mdo, md->constant_encoding()); Address data_addr = __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, DataLayout::DataLayout::header_offset()), - LogBytesPerWord); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ ldr(rscratch1, data_addr); - __ orr(rscratch1, rscratch1, header_bits); - __ str(rscratch1, data_addr); + md->byte_offset_of_slot(data, DataLayout::flags_offset()), + 0); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); __ b(*obj_is_null); __ bind(not_null); } else { @@ -1421,7 +1420,7 @@ Address counter_addr = __ form_address(rscratch2, mdo, md->byte_offset_of_slot(data, CounterData::count_offset()), - LogBytesPerWord); + 0); __ ldr(rscratch1, counter_addr); __ sub(rscratch1, rscratch1, DataLayout::counter_increment); __ str(rscratch1, counter_addr); @@ -1470,12 +1469,11 @@ __ mov_metadata(mdo, md->constant_encoding()); Address data_addr = __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, DataLayout::header_offset()), + md->byte_offset_of_slot(data, DataLayout::flags_offset()), LogBytesPerInt); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ ldrw(rscratch1, data_addr); - __ orrw(rscratch1, rscratch1, header_bits); - __ strw(rscratch1, data_addr); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); __ b(done); __ bind(not_null); } else { --- old/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp 2018-05-16 09:39:01.834478824 +0200 +++ new/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp 2018-05-16 09:38:55.848496197 +0200 @@ -970,12 +970,11 @@ void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, int flag_byte_constant) { assert(ProfileInterpreter, "must be profiling interpreter"); - int header_offset = in_bytes(DataLayout::header_offset()); - int header_bits = DataLayout::flag_mask_to_header_mask(flag_byte_constant); + int flags_offset = in_bytes(DataLayout::flags_offset()); // Set the flag - ldr(rscratch1, Address(mdp_in, header_offset)); - orr(rscratch1, rscratch1, header_bits); - str(rscratch1, Address(mdp_in, header_offset)); + ldrb(rscratch1, Address(mdp_in, flags_offset)); + orr(rscratch1, rscratch1, flag_byte_constant); + strb(rscratch1, Address(mdp_in, flags_offset)); } --- old/src/hotspot/cpu/x86/assembler_x86.cpp 2018-05-16 09:39:13.219445782 +0200 +++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2018-05-16 09:39:01.959478461 +0200 @@ -3338,6 +3338,14 @@ emit_operand(src, dst); } +void Assembler::orb(Address dst, int imm8) { + InstructionMark im(this); + prefix(dst); + emit_int8((unsigned char)0x80); + emit_operand(rcx, dst, 1); + emit_int8(imm8); +} + void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); --- old/src/hotspot/cpu/x86/assembler_x86.hpp 2018-05-16 09:39:19.525427481 +0200 +++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2018-05-16 09:39:13.405445242 +0200 @@ -1536,6 +1536,8 @@ void orl(Register dst, Register src); void orl(Address dst, Register src); + void orb(Address dst, int imm8); + void orq(Address dst, int32_t 
imm32); void orq(Register dst, int32_t imm32); void orq(Register dst, Address src); --- old/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp 2018-05-16 09:39:25.859409098 +0200 +++ new/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp 2018-05-16 09:39:19.667427069 +0200 @@ -1678,9 +1678,9 @@ // Object is null; update MDO and exit Register mdo = klass_RInfo; __ mov_metadata(mdo, md->constant_encoding()); - Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::header_offset())); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ orl(data_addr, header_bits); + Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::flags_offset())); + int header_bits = BitData::null_seen_byte_constant(); + __ orb(data_addr, header_bits); __ jmp(*obj_is_null); __ bind(not_null); } else { @@ -1824,9 +1824,9 @@ // Object is null; update MDO and exit Register mdo = klass_RInfo; __ mov_metadata(mdo, md->constant_encoding()); - Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::header_offset())); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ orl(data_addr, header_bits); + Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::flags_offset())); + int header_bits = BitData::null_seen_byte_constant(); + __ orb(data_addr, header_bits); __ jmp(done); __ bind(not_null); } else { --- old/src/hotspot/cpu/x86/interp_masm_x86.cpp 2018-05-16 09:39:32.003391267 +0200 +++ new/src/hotspot/cpu/x86/interp_masm_x86.cpp 2018-05-16 09:39:26.022408625 +0200 @@ -1435,10 +1435,10 @@ void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, int flag_byte_constant) { assert(ProfileInterpreter, "must be profiling interpreter"); - int header_offset = in_bytes(DataLayout::header_offset()); - int header_bits = DataLayout::flag_mask_to_header_mask(flag_byte_constant); + int header_offset = in_bytes(DataLayout::flags_offset()); + int header_bits = flag_byte_constant; // Set the flag - orl(Address(mdp_in, header_offset), header_bits); + orb(Address(mdp_in, header_offset), header_bits); } --- old/src/hotspot/share/oops/methodData.hpp 2018-05-16 09:39:38.151373424 +0200 +++ new/src/hotspot/share/oops/methodData.hpp 2018-05-16 09:39:32.163390802 +0200 @@ -83,16 +83,17 @@ private: // Every data layout begins with a header. This header // contains a tag, which is used to indicate the size/layout - // of the data, 4 bits of flags, which can be used in any way, - // 4 bits of trap history (none/one reason/many reasons), + // of the data, 8 bits of flags, which can be used in any way, + // 32 bits of trap history (none/one reason/many reasons), // and a bci, which is used to tie this piece of data to a // specific bci in the bytecodes. union { - intptr_t _bits; + u8 _bits; struct { u1 _tag; u1 _flags; u2 _bci; + u4 _traps; } _struct; } _header; @@ -131,28 +132,23 @@ }; enum { - // The _struct._flags word is formatted as [trap_state:4 | flags:4]. - // The trap state breaks down further as [recompile:1 | reason:3]. + // The trap state breaks down as [recompile:1 | reason:31]. // This further breakdown is defined in deoptimization.cpp. // See Deoptimization::trap_state_reason for an assert that // trap_bits is big enough to hold reasons < Reason_RECORDED_LIMIT. // // The trap_state is collected only if ProfileTraps is true. - trap_bits = 1+3, // 3: enough to distinguish [0..Reason_RECORDED_LIMIT]. - trap_shift = BitsPerByte - trap_bits, + trap_bits = 1+31, // 31: enough to distinguish [0..Reason_RECORDED_LIMIT]. 
trap_mask = right_n_bits(trap_bits), - trap_mask_in_place = (trap_mask << trap_shift), - flag_limit = trap_shift, - flag_mask = right_n_bits(flag_limit), first_flag = 0 }; // Size computation static int header_size_in_bytes() { - return cell_size; + return header_size_in_cells() * cell_size; } static int header_size_in_cells() { - return 1; + return LP64_ONLY(1) NOT_LP64(2); } static int compute_size_in_bytes(int cell_count) { @@ -167,7 +163,7 @@ return _header._struct._tag; } - // Return a few bits of trap state. Range is [0..trap_mask]. + // Return 32 bits of trap state. // The state tells if traps with zero, one, or many reasons have occurred. // It also tells whether zero or many recompilations have occurred. // The associated trap histogram in the MDO itself tells whether @@ -175,14 +171,14 @@ // occurred, and the MDO shows N occurrences of X, we make the // simplifying assumption that all N occurrences can be blamed // on that BCI. - int trap_state() const { - return ((_header._struct._flags >> trap_shift) & trap_mask); + uint trap_state() const { + return _header._struct._traps; } - void set_trap_state(int new_state) { + void set_trap_state(uint new_state) { assert(ProfileTraps, "used only under +ProfileTraps"); - uint old_flags = (_header._struct._flags & flag_mask); - _header._struct._flags = (new_state << trap_shift) | old_flags; + uint old_flags = _header._struct._traps; + _header._struct._traps = new_state | old_flags; } u1 flags() const { @@ -193,10 +189,10 @@ return _header._struct._bci; } - void set_header(intptr_t value) { + void set_header(u8 value) { _header._bits = value; } - intptr_t header() { + u8 header() { return _header._bits; } void set_cell_at(int index, intptr_t value) { @@ -207,12 +203,10 @@ return _cells[index]; } - void set_flag_at(int flag_number) { - assert(flag_number < flag_limit, "oob"); + void set_flag_at(u1 flag_number) { _header._struct._flags |= (0x1 << flag_number); } - bool flag_at(int flag_number) const { - assert(flag_number < flag_limit, "oob"); + bool flag_at(u1 flag_number) const { return (_header._struct._flags & (0x1 << flag_number)) != 0; } @@ -238,14 +232,13 @@ } #endif // CC_INTERP // Return a value which, when or-ed as a byte into _flags, sets the flag. - static int flag_number_to_byte_constant(int flag_number) { - assert(0 <= flag_number && flag_number < flag_limit, "oob"); + static u1 flag_number_to_constant(u1 flag_number) { DataLayout temp; temp.set_header(0); temp.set_flag_at(flag_number); return temp._header._struct._flags; } // Return a value which, when or-ed as a word into _header, sets the flag. 
- static intptr_t flag_mask_to_header_mask(int byte_constant) { + static u8 flag_mask_to_header_mask(uint byte_constant) { DataLayout temp; temp.set_header(0); temp._header._struct._flags = byte_constant; return temp._header._bits; @@ -364,8 +357,8 @@ static ByteSize cell_offset(int index) { return DataLayout::cell_offset(index); } - static int flag_number_to_byte_constant(int flag_number) { - return DataLayout::flag_number_to_byte_constant(flag_number); + static int flag_number_to_constant(int flag_number) { + return DataLayout::flag_number_to_constant(flag_number); } ProfileData(DataLayout* data) { @@ -574,7 +567,7 @@ // Code generation support static int null_seen_byte_constant() { - return flag_number_to_byte_constant(null_seen_flag); + return flag_number_to_constant(null_seen_flag); } static ByteSize bit_data_size() { @@ -2050,6 +2043,9 @@ protected: enum { speculative_trap_method, +#ifndef _LP64 + speculative_trap_padding, +#endif speculative_trap_cell_count }; public: @@ -2162,7 +2158,7 @@ // Whole-method sticky bits and flags enum { - _trap_hist_limit = 23 JVMCI_ONLY(+5), // decoupled from Deoptimization::Reason_LIMIT + _trap_hist_limit = 24 JVMCI_ONLY(+5), // decoupled from Deoptimization::Reason_LIMIT _trap_hist_mask = max_jubyte, _extra_data_count = 4 // extra DataLayout headers, for trap history }; // Public flag values --- old/src/hotspot/share/opto/c2_globals.hpp 2018-05-16 09:39:44.489355029 +0200 +++ new/src/hotspot/share/opto/c2_globals.hpp 2018-05-16 09:39:38.288373026 +0200 @@ -751,6 +751,9 @@ product(uintx, LoopStripMiningIterShortLoop, 0, \ "Loop with fewer iterations are not strip mined") \ range(0, max_juint) \ + \ + product(bool, UseProfiledLoopPredicate, true, \ + "move predicates out of loops based on profiling data") \ C2_FLAGS(DECLARE_DEVELOPER_FLAG, \ DECLARE_PD_DEVELOPER_FLAG, \ --- old/src/hotspot/share/opto/graphKit.cpp 2018-05-16 09:39:50.712336969 +0200 +++ new/src/hotspot/share/opto/graphKit.cpp 2018-05-16 09:39:44.626354632 +0200 @@ -3813,6 +3813,9 @@ if (UseLoopPredicate) { add_predicate_impl(Deoptimization::Reason_predicate, nargs); } + if (UseProfiledLoopPredicate) { + add_predicate_impl(Deoptimization::Reason_profile_predicate, nargs); + } // loop's limit check predicate should be near the loop. 
add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs); } --- old/src/hotspot/share/opto/loopPredicate.cpp 2018-05-16 09:40:01.910304470 +0200 +++ new/src/hotspot/share/opto/loopPredicate.cpp 2018-05-16 09:39:50.864336528 +0200 @@ -34,6 +34,8 @@ #include "opto/opaquenode.hpp" #include "opto/rootnode.hpp" #include "opto/subnode.hpp" +#include <fenv.h> +#include <float.h> /* * The general idea of Loop Predication is to insert a predicate on the entry @@ -318,18 +320,37 @@ if (limit_check_proj != NULL) { entry = entry->in(0)->in(0); } + ProjNode* profile_predicate_proj = NULL; + ProjNode* predicate_proj = NULL; + if (UseProfiledLoopPredicate) { + profile_predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate_proj != NULL) { + entry = skip_loop_predicates(entry); + } + } if (UseLoopPredicate) { - ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (predicate_proj != NULL) { // right pattern that can be used by loop predication - // clone predicate - new_entry = clone_predicate(predicate_proj, new_entry, - Deoptimization::Reason_predicate, - loop_phase, igvn); - assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); - if (TraceLoopPredicate) { - tty->print("Loop Predicate cloned: "); - debug_only( new_entry->in(0)->dump(); ) - } + predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); + } + if (predicate_proj != NULL) { // right pattern that can be used by loop predication + // clone predicate + new_entry = clone_predicate(predicate_proj, new_entry, + Deoptimization::Reason_predicate, + loop_phase, igvn); + assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); + if (TraceLoopPredicate) { + tty->print("Loop Predicate cloned: "); + debug_only( new_entry->in(0)->dump(); ); + } + } + if (profile_predicate_proj != NULL) { // right pattern that can be used by loop predication + // clone predicate + new_entry = clone_predicate(profile_predicate_proj, new_entry, + Deoptimization::Reason_profile_predicate, + loop_phase, igvn); + assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); + if (TraceLoopPredicate) { + tty->print("Loop Predicate cloned: "); + debug_only( new_entry->in(0)->dump(); ); } } if (limit_check_proj != NULL && clone_limit_check) { @@ -351,25 +372,36 @@ //--------------------------skip_loop_predicates------------------------------ // Skip related predicates.
Node* PhaseIdealLoop::skip_loop_predicates(Node* entry) { + IfNode* iff = entry->in(0)->as_If(); + ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); + Node* rgn = uncommon_proj->unique_ctrl_out(); + assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); + entry = entry->in(0)->in(0); + while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { + uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); + if (uncommon_proj->unique_ctrl_out() != rgn) + break; + entry = entry->in(0)->in(0); + } + return entry; +} + +Node* PhaseIdealLoop::skip_all_loop_predicates(Node* entry) { Node* predicate = NULL; predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); if (predicate != NULL) { entry = entry->in(0)->in(0); } + if (UseProfiledLoopPredicate) { + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (predicate != NULL) { // right pattern that can be used by loop predication + entry = skip_loop_predicates(entry); + } + } if (UseLoopPredicate) { predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); if (predicate != NULL) { // right pattern that can be used by loop predication - IfNode* iff = entry->in(0)->as_If(); - ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); - Node* rgn = uncommon_proj->unique_ctrl_out(); - assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); - entry = entry->in(0)->in(0); - while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { - uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); - if (uncommon_proj->unique_ctrl_out() != rgn) - break; - entry = entry->in(0)->in(0); - } + entry = skip_loop_predicates(entry); } } return entry; @@ -400,6 +432,12 @@ return entry; } } + if (UseProfiledLoopPredicate) { + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (predicate != NULL) { // right pattern that can be used by loop predication + return entry; + } + } return NULL; } @@ -766,6 +804,413 @@ return bol; } +// Should loop predication look not only in the path from tail to head +// but also in branches of the loop body? 
+bool PhaseIdealLoop::loop_predication_should_follow_branches(IdealLoopTree *loop, ProjNode *predicate_proj, float& loop_trip_cnt) { + if (!UseProfiledLoopPredicate) { + return false; + } + + if (predicate_proj == NULL) { + return false; + } + + LoopNode* head = loop->_head->as_Loop(); + bool follow_branches = true; + IdealLoopTree* l = loop->_child; + // For leaf loops and loops with a single inner loop + while (l != NULL && follow_branches) { + IdealLoopTree* child = l; + if (child->_child != NULL && + child->_head->is_OuterStripMinedLoop()) { + assert(child->_child->_next == NULL, "only one inner loop for strip mined loop"); + assert(child->_child->_head->is_CountedLoop() && child->_child->_head->as_CountedLoop()->is_strip_mined(), "inner loop should be strip mined"); + child = child->_child; + } + if (child->_child != NULL || child->_irreducible) { + follow_branches = false; + } + l = l->_next; + } + if (follow_branches) { + loop->compute_profile_trip_cnt(this); + if (head->is_profile_trip_failed()) { + follow_branches = false; + } else { + loop_trip_cnt = head->profile_trip_cnt(); + if (head->is_CountedLoop()) { + CountedLoopNode* cl = head->as_CountedLoop(); + if (cl->phi() != NULL) { + const TypeInt* t = _igvn.type(cl->phi())->is_int(); + float worst_case_trip_cnt = ((float)t->_hi - t->_lo) / ABS(cl->stride_con()); + if (worst_case_trip_cnt < loop_trip_cnt) { + loop_trip_cnt = worst_case_trip_cnt; + } + } + } + } + } + return follow_branches; +} + +// Compute probability of reaching some CFG node from a fixed +// dominating CFG node +class PathFrequency { +private: + Node* _dom; // frequencies are computed relative to this node + Node_Stack _stack; + GrowableArray<float> _freqs_stack; // keep track of intermediate result at regions + GrowableArray<float> _freqs; // cache frequencies + PhaseIdealLoop* _phase; + +public: + PathFrequency(Node* dom, PhaseIdealLoop* phase) + : _dom(dom), _stack(0), _phase(phase) { + } + + float to(Node* n) { + // post order walk on the CFG graph from n to _dom + fesetround(FE_TOWARDZERO); // make sure rounding doesn't push frequency above 1 + IdealLoopTree* loop = _phase->get_loop(_dom); + Node* c = n; + for (;;) { + assert(_phase->get_loop(c) == loop, "have to be in the same loop"); + if (c == _dom || _freqs.at_grow(c->_idx, -1) >= 0) { + float f = c == _dom ?
1 : _freqs.at(c->_idx); + Node* prev = c; + while (_stack.size() > 0 && prev == c) { + Node* n = _stack.node(); + if (!n->is_Region()) { + if (_phase->get_loop(n) != _phase->get_loop(n->in(0))) { + // Found an inner loop: compute frequency of reaching this + // exit from the loop head by looking at the number of + // times each loop exit was taken + IdealLoopTree* inner_loop = _phase->get_loop(n->in(0)); + LoopNode* inner_head = inner_loop->_head->as_Loop(); + assert(_phase->get_loop(n) == loop, "only 1 inner loop"); + if (inner_head->is_OuterStripMinedLoop()) { + inner_head->verify_strip_mined(1); + if (n->in(0) == inner_head->in(LoopNode::LoopBackControl)->in(0)) { + n = n->in(0)->in(0)->in(0); + } + inner_loop = inner_loop->_child; + inner_head = inner_loop->_head->as_Loop(); + inner_head->verify_strip_mined(1); + } + fesetround(FE_UPWARD); // make sure rounding doesn't push frequency above 1 + float loop_exit_cnt = 0.0f; + for (uint i = 0; i < inner_loop->_body.size(); i++) { + Node *n = inner_loop->_body[i]; + float c = inner_loop->compute_profile_trip_cnt_helper(n); + loop_exit_cnt += c; + } + fesetround(FE_TOWARDZERO); + float cnt = -1; + if (n->in(0)->is_If()) { + IfNode* iff = n->in(0)->as_If(); + float p = n->in(0)->as_If()->_prob; + if (n->Opcode() == Op_IfFalse) { + p = 1 - p; + } + if (p > PROB_MIN) { + cnt = p * iff->_fcnt; + } else { + cnt = 0; + } + } else { + assert(n->in(0)->is_Jump(), "unsupported node kind"); + JumpNode* jmp = n->in(0)->as_Jump(); + float p = n->in(0)->as_Jump()->_probs[n->as_JumpProj()->_con]; + cnt = p * jmp->_fcnt; + } + float this_exit_f = cnt > 0 ? cnt / loop_exit_cnt : 0; + assert(this_exit_f <= 1 && this_exit_f >= 0, "Incorrect frequency"); + f = f * this_exit_f; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + } else { + float p = -1; + if (n->in(0)->is_If()) { + p = n->in(0)->as_If()->_prob; + if (n->Opcode() == Op_IfFalse) { + p = 1 - p; + } + } else { + assert(n->in(0)->is_Jump(), "unsupported node kind"); + p = n->in(0)->as_Jump()->_probs[n->as_JumpProj()->_con]; + } + f = f * p; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + } + _freqs.at_put_grow(n->_idx, (float)f, -1); + _stack.pop(); + } else { + float prev_f = _freqs_stack.pop(); + float new_f = f; + f = new_f + prev_f; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + uint i = _stack.index(); + if (i < n->req()) { + c = n->in(i); + _stack.set_index(i+1); + _freqs_stack.push(f); + } else { + _freqs.at_put_grow(n->_idx, f, -1); + _stack.pop(); + } + } + } + if (_stack.size() == 0) { + fesetround(FE_TONEAREST); + assert(f >= 0 && f <= 1, "should have been computed"); + return f; + } + } else if (c->is_Loop()) { + ShouldNotReachHere(); + c = c->in(LoopNode::EntryControl); + } else if (c->is_Region()) { + _freqs_stack.push(0); + _stack.push(c, 2); + c = c->in(1); + } else { + if (c->is_IfProj()) { + IfNode* iff = c->in(0)->as_If(); + if (iff->_prob == PROB_UNKNOWN) { + // assume never taken + _freqs.at_put_grow(c->_idx, 0, -1); + } else if (_phase->get_loop(c) != _phase->get_loop(iff)) { + if (iff->_fcnt == COUNT_UNKNOWN) { + // assume never taken + _freqs.at_put_grow(c->_idx, 0, -1); + } else { + // skip over loop + _stack.push(c, 1); + c = _phase->get_loop(c->in(0))->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl); + } + } else { + _stack.push(c, 1); + c = iff; + } + } else if (c->is_JumpProj()) { + JumpNode* jmp = c->in(0)->as_Jump(); + if (_phase->get_loop(c) != _phase->get_loop(jmp)) { + if (jmp->_fcnt == COUNT_UNKNOWN) { + // assume never taken + 
_freqs.at_put_grow(c->_idx, 0, -1); + } else { + // skip over loop + _stack.push(c, 1); + c = _phase->get_loop(c->in(0))->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl); + } + } else { + _stack.push(c, 1); + c = jmp; + } + } else if (c->Opcode() == Op_CatchProj && + c->in(0)->Opcode() == Op_Catch && + c->in(0)->in(0)->is_Proj() && + c->in(0)->in(0)->in(0)->is_Call()) { + // assume exceptions are never thrown + uint con = c->as_Proj()->_con; + if (con == CatchProjNode::fall_through_index) { + Node* call = c->in(0)->in(0)->in(0)->in(0); + if (_phase->get_loop(call) != _phase->get_loop(c)) { + _freqs.at_put_grow(c->_idx, 0, -1); + } else { + c = call; + } + } else { + assert(con >= CatchProjNode::catch_all_index, "what else?"); + _freqs.at_put_grow(c->_idx, 0, -1); + } + } else if (c->unique_ctrl_out() == NULL && !c->is_If() && !c->is_Jump()) { + ShouldNotReachHere(); + } else { + c = c->in(0); + } + } + } + ShouldNotReachHere(); + return -1; + } +}; + +void PhaseIdealLoop::loop_predication_follow_branches(Node *n, IdealLoopTree *loop, float loop_trip_cnt, + PathFrequency& pf, Node_Stack& stack, VectorSet& seen, + Node_List& if_proj_list) { + assert(n->is_Region(), "start from a region"); + Node* tail = loop->tail(); + stack.push(n, 1); + do { + Node* c = stack.node(); + assert(c->is_Region() || c->is_IfProj(), "only region here"); + uint i = stack.index(); + + if (i < c->req()) { + stack.set_index(i+1); + Node* in = c->in(i); + while (!is_dominator(in, tail) && !seen.test_set(in->_idx)) { + IdealLoopTree* in_loop = get_loop(in); + if (in_loop != loop) { + in = in_loop->_head->in(LoopNode::EntryControl); + } else if (in->is_Region()) { + stack.push(in, 1); + break; + } else if (in->is_IfProj() && + in->as_Proj()->is_uncommon_trap_if_pattern(Deoptimization::Reason_none)) { + if (pf.to(in) * loop_trip_cnt >= 1) { + stack.push(in, 1); + } + in = in->in(0); + } else { + in = in->in(0); + } + } + } else { + if (c->is_IfProj()) { + if_proj_list.push(c); + } + stack.pop(); + } + + } while (stack.size() > 0); +} + + +bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree *loop, ProjNode* proj, ProjNode *predicate_proj, + CountedLoopNode *cl, ConNode* zero, Invariance& invar, + Deoptimization::DeoptReason reason) { + // Following are changed to nonnull when a predicate can be hoisted + ProjNode* new_predicate_proj = NULL; + IfNode* iff = proj->in(0)->as_If(); + Node* test = iff->in(1); + if (!test->is_Bool()){ //Conv2B, ... + return false; + } + BoolNode* bol = test->as_Bool(); + if (invar.is_invariant(bol)) { + // Invariant test + new_predicate_proj = create_new_if_for_predicate(predicate_proj, NULL, + reason, + iff->Opcode()); + Node* ctrl = new_predicate_proj->in(0)->as_If()->in(0); + BoolNode* new_predicate_bol = invar.clone(bol, ctrl)->as_Bool(); + + // Negate test if necessary + bool negated = false; + if (proj->_con != predicate_proj->_con) { + new_predicate_bol = new BoolNode(new_predicate_bol->in(1), new_predicate_bol->_test.negate()); + register_new_node(new_predicate_bol, ctrl); + negated = true; + } + IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If(); + _igvn.hash_delete(new_predicate_iff); + new_predicate_iff->set_req(1, new_predicate_bol); +#ifndef PRODUCT + if (TraceLoopPredicate) { + tty->print("Predicate invariant if%s: %d ", negated ? 
" negated" : "", new_predicate_iff->_idx); + loop->dump_head(); + } else if (TraceLoopOpts) { + tty->print("Predicate IC "); + loop->dump_head(); + } +#endif + } else if (cl != NULL && loop->is_range_check_if(iff, this, invar)) { + // Range check for counted loops + const Node* cmp = bol->in(1)->as_Cmp(); + Node* idx = cmp->in(1); + assert(!invar.is_invariant(idx), "index is variant"); + Node* rng = cmp->in(2); + assert(rng->Opcode() == Op_LoadRange || iff->is_RangeCheck() || _igvn.type(rng)->is_int()->_lo >= 0, "must be"); + assert(invar.is_invariant(rng), "range must be invariant"); + int scale = 1; + Node* offset = zero; + bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); + assert(ok, "must be index expression"); + + Node* init = cl->init_trip(); + // Limit is not exact. + // Calculate exact limit here. + // Note, counted loop's test is '<' or '>'. + Node* limit = exact_limit(loop); + int stride = cl->stride()->get_int(); + + // Build if's for the upper and lower bound tests. The + // lower_bound test will dominate the upper bound test and all + // cloned or created nodes will use the lower bound test as + // their declared control. + + // Perform cloning to keep Invariance state correct since the + // late schedule will place invariant things in the loop. + Node *ctrl = predicate_proj->in(0)->as_If()->in(0); + rng = invar.clone(rng, ctrl); + if (offset && offset != zero) { + assert(invar.is_invariant(offset), "offset must be loop invariant"); + offset = invar.clone(offset, ctrl); + } + // If predicate expressions may overflow in the integer range, longs are used. + bool overflow = false; + + // Test the lower bound + BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow); + // Negate test if necessary + bool negated = false; + if (proj->_con != predicate_proj->_con) { + lower_bound_bol = new BoolNode(lower_bound_bol->in(1), lower_bound_bol->_test.negate()); + register_new_node(lower_bound_bol, ctrl); + negated = true; + } + ProjNode* lower_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); + IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); + _igvn.hash_delete(lower_bound_iff); + lower_bound_iff->set_req(1, lower_bound_bol); + if (TraceLoopPredicate) tty->print_cr("lower bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); + + // Test the upper bound + BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true, overflow); + negated = false; + if (proj->_con != predicate_proj->_con) { + upper_bound_bol = new BoolNode(upper_bound_bol->in(1), upper_bound_bol->_test.negate()); + register_new_node(upper_bound_bol, ctrl); + negated = true; + } + ProjNode* upper_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); + assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate"); + IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); + _igvn.hash_delete(upper_bound_iff); + upper_bound_iff->set_req(1, upper_bound_bol); + if (TraceLoopPredicate) tty->print_cr("upper bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); + + // Fall through into rest of the clean up code which will move + // any dependent nodes onto the upper bound test. 
+ new_predicate_proj = upper_bound_proj; + + if (iff->is_RangeCheck()) { + new_predicate_proj = insert_skeleton_predicate(iff, loop, proj, predicate_proj, upper_bound_proj, scale, offset, init, limit, stride, rng, overflow, reason); + } + +#ifndef PRODUCT + if (TraceLoopOpts && !TraceLoopPredicate) { + tty->print("Predicate RC "); + loop->dump_head(); + } +#endif + } else { + // Loop variant check (for example, range check in non-counted loop) + // with uncommon trap. + return false; + } + assert(new_predicate_proj != NULL, "sanity"); + // Success - attach condition (new_predicate_bol) to predicate if + invar.map_ctrl(proj, new_predicate_proj); // so that invariance test can be appropriate + + // Eliminate the old If in the loop body + dominated_by( new_predicate_proj, iff, proj->_con != new_predicate_proj->_con ); + + C->set_major_progress(); + return true; +} + + // After pre/main/post loops are created, we'll put a copy of some // range checks between the pre and main loop to validate the initial // value of the induction variable for the main loop. Make a copy of @@ -776,14 +1221,15 @@ ProjNode* upper_bound_proj, int scale, Node* offset, Node* init, Node* limit, jint stride, - Node* rng, bool &overflow) { + Node* rng, bool &overflow, + Deoptimization::DeoptReason reason) { assert(proj->_con && predicate_proj->_con, "not a range check?"); Node* opaque_init = new Opaque1Node(C, init); register_new_node(opaque_init, upper_bound_proj); BoolNode* bol = rc_predicate(loop, upper_bound_proj, scale, offset, opaque_init, limit, stride, rng, (stride > 0) != (scale > 0), overflow); Node* opaque_bol = new Opaque4Node(C, bol, _igvn.intcon(1)); // This will go away once loop opts are over register_new_node(opaque_bol, upper_bound_proj); - ProjNode* new_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); + ProjNode* new_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); _igvn.replace_input_of(new_proj->in(0), 1, opaque_bol); assert(opaque_init->outcnt() > 0, "should be used"); return new_proj; @@ -821,13 +1267,32 @@ } Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl); + ProjNode *loop_limit_proj = NULL; ProjNode *predicate_proj = NULL; + ProjNode *profile_predicate_proj = NULL; // Loop limit check predicate should be near the loop. 
- predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); - if (predicate_proj != NULL) - entry = predicate_proj->in(0)->in(0); + loop_limit_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); + if (loop_limit_proj != NULL) { + entry = loop_limit_proj->in(0)->in(0); + } + bool has_profile_predicates = false; + profile_predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate_proj != NULL) { + Node* n = skip_loop_predicates(entry); + // Check if predicates were already added to the profile predicate + // block + if (n != entry->in(0)->in(0)) { + has_profile_predicates = true; + } + entry = n; + } predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (!predicate_proj) { + + float loop_trip_cnt = -1; + bool follow_branches = loop_predication_should_follow_branches(loop, profile_predicate_proj, loop_trip_cnt); + assert(!follow_branches || loop_trip_cnt >= 0, "negative trip count?"); + + if (predicate_proj == NULL && !follow_branches) { #ifndef PRODUCT if (TraceLoopPredicate) { tty->print("missing predicate:"); @@ -846,7 +1311,11 @@ // Create list of if-projs such that a newer proj dominates all older // projs in the list, and they all dominate loop->tail() Node_List if_proj_list(area); + Node_List regions(area); Node *current_proj = loop->tail(); //start from tail + + + Node_List controls(area); while (current_proj != head) { if (loop == get_loop(current_proj) && // still in the loop ? current_proj->is_Proj() && // is a projection ? @@ -854,161 +1323,79 @@ current_proj->in(0)->Opcode() == Op_RangeCheck)) { // is a if projection ? if_proj_list.push(current_proj); } + if (follow_branches && + current_proj->Opcode() == Op_Region && + loop == get_loop(current_proj)) { + regions.push(current_proj); + } current_proj = idom(current_proj); } bool hoisted = false; // true if at least one proj is promoted - while (if_proj_list.size() > 0) { - // Following are changed to nonnull when a predicate can be hoisted - ProjNode* new_predicate_proj = NULL; - - ProjNode* proj = if_proj_list.pop()->as_Proj(); - IfNode* iff = proj->in(0)->as_If(); - - if (!proj->is_uncommon_trap_if_pattern(Deoptimization::Reason_none)) { - if (loop->is_loop_exit(iff)) { - // stop processing the remaining projs in the list because the execution of them - // depends on the condition of "iff" (iff->in(1)). + + if (!has_profile_predicates) { + while (if_proj_list.size() > 0) { + Node* n = if_proj_list.pop(); + + ProjNode* proj = n->as_Proj(); + IfNode* iff = proj->in(0)->as_If(); + + CallStaticJavaNode* call = proj->is_uncommon_trap_if_pattern(Deoptimization::Reason_none); + if (call == NULL) { + if (loop->is_loop_exit(iff)) { + // stop processing the remaining projs in the list because the execution of them + // depends on the condition of "iff" (iff->in(1)). + break; + } else { + // Both arms are inside the loop. There are two cases: + // (1) there is one backward branch. In this case, any remaining proj + // in the if_proj list post-dominates "iff". So, the condition of "iff" + // does not determine the execution the remining projs directly, and we + // can safely continue. + // (2) both arms are forwarded, i.e. a diamond shape. In this case, "proj" + // does not dominate loop->tail(), so it can not be in the if_proj list. 
+ continue; + } + } + Deoptimization::DeoptReason reason = Deoptimization::trap_request_reason(call->uncommon_trap_request()); + if (reason == Deoptimization::Reason_predicate) { break; - } else { - // Both arms are inside the loop. There are two cases: - // (1) there is one backward branch. In this case, any remaining proj - // in the if_proj list post-dominates "iff". So, the condition of "iff" - // does not determine the execution the remining projs directly, and we - // can safely continue. - // (2) both arms are forwarded, i.e. a diamond shape. In this case, "proj" - // does not dominate loop->tail(), so it can not be in the if_proj list. - continue; } - } - Node* test = iff->in(1); - if (!test->is_Bool()){ //Conv2B, ... - continue; - } - BoolNode* bol = test->as_Bool(); - if (invar.is_invariant(bol)) { - // Invariant test - new_predicate_proj = create_new_if_for_predicate(predicate_proj, NULL, - Deoptimization::Reason_predicate, - iff->Opcode()); - Node* ctrl = new_predicate_proj->in(0)->as_If()->in(0); - BoolNode* new_predicate_bol = invar.clone(bol, ctrl)->as_Bool(); - - // Negate test if necessary - bool negated = false; - if (proj->_con != predicate_proj->_con) { - new_predicate_bol = new BoolNode(new_predicate_bol->in(1), new_predicate_bol->_test.negate()); - register_new_node(new_predicate_bol, ctrl); - negated = true; - } - IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If(); - _igvn.hash_delete(new_predicate_iff); - new_predicate_iff->set_req(1, new_predicate_bol); -#ifndef PRODUCT - if (TraceLoopPredicate) { - tty->print("Predicate invariant if%s: %d ", negated ? " negated" : "", new_predicate_iff->_idx); - loop->dump_head(); - } else if (TraceLoopOpts) { - tty->print("Predicate IC "); - loop->dump_head(); + if (predicate_proj != NULL) { + hoisted = loop_predication_impl_helper(loop, proj, predicate_proj, cl, zero, invar, Deoptimization::Reason_predicate) | hoisted; } -#endif - } else if (cl != NULL && loop->is_range_check_if(iff, this, invar)) { - // Range check for counted loops - const Node* cmp = bol->in(1)->as_Cmp(); - Node* idx = cmp->in(1); - assert(!invar.is_invariant(idx), "index is variant"); - Node* rng = cmp->in(2); - assert(rng->Opcode() == Op_LoadRange || iff->is_RangeCheck() || _igvn.type(rng)->is_int()->_lo >= 0, "must be"); - assert(invar.is_invariant(rng), "range must be invariant"); - int scale = 1; - Node* offset = zero; - bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); - assert(ok, "must be index expression"); - - Node* init = cl->init_trip(); - // Limit is not exact. - // Calculate exact limit here. - // Note, counted loop's test is '<' or '>'. - Node* limit = exact_limit(loop); - int stride = cl->stride()->get_int(); - - // Build if's for the upper and lower bound tests. The - // lower_bound test will dominate the upper bound test and all - // cloned or created nodes will use the lower bound test as - // their declared control. - - // Perform cloning to keep Invariance state correct since the - // late schedule will place invariant things in the loop. - Node *ctrl = predicate_proj->in(0)->as_If()->in(0); - rng = invar.clone(rng, ctrl); - if (offset && offset != zero) { - assert(invar.is_invariant(offset), "offset must be loop invariant"); - offset = invar.clone(offset, ctrl); - } - // If predicate expressions may overflow in the integer range, longs are used. 
- bool overflow = false; - - // Test the lower bound - BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow); - // Negate test if necessary - bool negated = false; - if (proj->_con != predicate_proj->_con) { - lower_bound_bol = new BoolNode(lower_bound_bol->in(1), lower_bound_bol->_test.negate()); - register_new_node(lower_bound_bol, ctrl); - negated = true; - } - ProjNode* lower_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); - IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); - _igvn.hash_delete(lower_bound_iff); - lower_bound_iff->set_req(1, lower_bound_bol); - if (TraceLoopPredicate) tty->print_cr("lower bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); - - // Test the upper bound - BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true, overflow); - negated = false; - if (proj->_con != predicate_proj->_con) { - upper_bound_bol = new BoolNode(upper_bound_bol->in(1), upper_bound_bol->_test.negate()); - register_new_node(upper_bound_bol, ctrl); - negated = true; - } - ProjNode* upper_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); - assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate"); - IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); - _igvn.hash_delete(upper_bound_iff); - upper_bound_iff->set_req(1, upper_bound_bol); - if (TraceLoopPredicate) tty->print_cr("upper bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); - - // Fall through into rest of the clean up code which will move - // any dependent nodes onto the upper bound test. - new_predicate_proj = upper_bound_proj; + } // end while + } - if (iff->is_RangeCheck()) { - new_predicate_proj = insert_skeleton_predicate(iff, loop, proj, predicate_proj, upper_bound_proj, scale, offset, init, limit, stride, rng, overflow); + Node_List if_proj_list_freq(area); + if (follow_branches) { + PathFrequency pf(loop->_head, this); + + // Some projections were skipped by regular predicates because of + // an early loop exit. Try them with profile data. + while (if_proj_list.size() > 0) { + Node* proj = if_proj_list.pop(); + float f = pf.to(proj); + if (proj->as_Proj()->is_uncommon_trap_if_pattern(Deoptimization::Reason_none) && + f * loop_trip_cnt >= 1) { + hoisted = loop_predication_impl_helper(loop, proj->as_Proj(), profile_predicate_proj, cl, zero, invar, Deoptimization::Reason_profile_predicate) | hoisted; } + } -#ifndef PRODUCT - if (TraceLoopOpts && !TraceLoopPredicate) { - tty->print("Predicate RC "); - loop->dump_head(); - } -#endif - } else { - // Loop variant check (for example, range check in non-counted loop) - // with uncommon trap. 
- continue; - } - assert(new_predicate_proj != NULL, "sanity"); - // Success - attach condition (new_predicate_bol) to predicate if - invar.map_ctrl(proj, new_predicate_proj); // so that invariance test can be appropriate - - // Eliminate the old If in the loop body - dominated_by( new_predicate_proj, iff, proj->_con != new_predicate_proj->_con ); - - hoisted = true; - C->set_major_progress(); - } // end while + // And look into all branches + Node_Stack stack(0); + VectorSet seen(Thread::current()->resource_area()); + while (regions.size() > 0) { + Node* c = regions.pop(); + loop_predication_follow_branches(c, loop, loop_trip_cnt, pf, stack, seen, if_proj_list_freq); + } + + for (uint i = 0; i < if_proj_list_freq.size(); i++) { + ProjNode* proj = if_proj_list_freq.at(i)->as_Proj(); + hoisted = loop_predication_impl_helper(loop, proj, profile_predicate_proj, cl, zero, invar, Deoptimization::Reason_profile_predicate) | hoisted; + } + } #ifndef PRODUCT // report that the loop predication has been actually performed --- old/src/hotspot/share/opto/loopTransform.cpp 2018-05-16 09:40:08.372285715 +0200 +++ new/src/hotspot/share/opto/loopTransform.cpp 2018-05-16 09:40:02.030304121 +0200 @@ -135,11 +135,45 @@ //------------------------------compute_profile_trip_cnt---------------------------- // Compute loop trip count from profile data as // (backedge_count + loop_exit_count) / loop_exit_count -void IdealLoopTree::compute_profile_trip_cnt( PhaseIdealLoop *phase ) { - if (!_head->is_CountedLoop()) { + +float IdealLoopTree::compute_profile_trip_cnt_helper(Node* n) { + if (n->is_If()) { + IfNode *iff = n->as_If(); + if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) { + Node *exit = is_loop_exit(iff); + if (exit) { + float exit_prob = iff->_prob; + if (exit->Opcode() == Op_IfFalse) exit_prob = 1.0 - exit_prob; + if (exit_prob > PROB_MIN) { + float exit_cnt = iff->_fcnt * exit_prob; + return exit_cnt; + } + } + } + } + if (n->is_Jump()) { + JumpNode *jmp = n->as_Jump(); + if (jmp->_fcnt != COUNT_UNKNOWN) { + float* probs = jmp->_probs; + float exit_prob = 0; + PhaseIdealLoop *phase = _phase; + for (DUIterator_Fast imax, i = jmp->fast_outs(imax); i < imax; i++) { + JumpProjNode* u = jmp->fast_out(i)->as_JumpProj(); + if (!is_member(_phase->get_loop(u))) { + exit_prob += probs[u->_con]; + } + } + return exit_prob * jmp->_fcnt; + } + } + return 0; +} + +void IdealLoopTree::compute_profile_trip_cnt(PhaseIdealLoop *phase) { + if (!_head->is_Loop()) { return; } - CountedLoopNode* head = _head->as_CountedLoop(); + LoopNode* head = _head->as_Loop(); if (head->profile_trip_cnt() != COUNT_UNKNOWN) { return; // Already computed } @@ -151,7 +185,8 @@ back->in(0) && back->in(0)->is_If() && back->in(0)->as_If()->_fcnt != COUNT_UNKNOWN && - back->in(0)->as_If()->_prob != PROB_UNKNOWN) { + back->in(0)->as_If()->_prob != PROB_UNKNOWN && + (back->Opcode() == Op_IfTrue ? 1-back->in(0)->as_If()->_prob : back->in(0)->as_If()->_prob) > PROB_MIN) { break; } back = phase->idom(back); @@ -160,26 +195,34 @@ assert((back->Opcode() == Op_IfTrue || back->Opcode() == Op_IfFalse) && back->in(0), "if-projection exists"); IfNode* back_if = back->in(0)->as_If(); - float loop_back_cnt = back_if->_fcnt * back_if->_prob; + float loop_back_cnt = back_if->_fcnt * (back->Opcode() == Op_IfTrue ? 
back_if->_prob : (1 - back_if->_prob)); // Now compute a loop exit count float loop_exit_cnt = 0.0f; - for( uint i = 0; i < _body.size(); i++ ) { - Node *n = _body[i]; - if( n->is_If() ) { - IfNode *iff = n->as_If(); - if( iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN ) { - Node *exit = is_loop_exit(iff); - if( exit ) { - float exit_prob = iff->_prob; - if (exit->Opcode() == Op_IfFalse) exit_prob = 1.0 - exit_prob; - if (exit_prob > PROB_MIN) { - float exit_cnt = iff->_fcnt * exit_prob; - loop_exit_cnt += exit_cnt; + if (_child == NULL) { + for( uint i = 0; i < _body.size(); i++ ) { + Node *n = _body[i]; + loop_exit_cnt += compute_profile_trip_cnt_helper(n); + } + } else { + ResourceMark rm; + Unique_Node_List wq; + wq.push(back); + for (uint i = 0; i < wq.size(); i++) { + Node *n = wq.at(i); + assert(n->is_CFG(), "only control nodes"); + if (n != head) { + if (n->is_Region()) { + for (uint j = 1; j < n->req(); j++) { + wq.push(n->in(j)); } + } else { + loop_exit_cnt += compute_profile_trip_cnt_helper(n); + wq.push(n->in(0)); } } } + } if (loop_exit_cnt > 0.0f) { trip_cnt = (loop_back_cnt + loop_exit_cnt) / loop_exit_cnt; @@ -187,6 +230,8 @@ // No exit count so use trip_cnt = loop_back_cnt; } + } else { + head->mark_profile_trip_failed(); } #ifndef PRODUCT if (TraceProfileTripCount) { @@ -1014,125 +1059,140 @@ // the control paths must die too but the range checks were removed by // predication. The range checks that we add here guarantee that they // do. -void PhaseIdealLoop::duplicate_predicates(CountedLoopNode* pre_head, Node* min_taken, Node* castii, - IdealLoopTree* outer_loop, LoopNode* outer_main_head, - uint dd_main_head) { - if (UseLoopPredicate) { - Node* entry = pre_head->in(LoopNode::EntryControl); - Node* predicate = NULL; - predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); - if (predicate != NULL) { - entry = entry->in(0)->in(0); - } - predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (predicate != NULL) { - IfNode* iff = entry->in(0)->as_If(); - ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); - Node* rgn = uncommon_proj->unique_ctrl_out(); - assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); - assert(iff->in(1)->in(1)->Opcode() == Op_Opaque1, "unexpected predicate shape"); - entry = entry->in(0)->in(0); - Node* prev_proj = min_taken; - while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { - uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); - if (uncommon_proj->unique_ctrl_out() != rgn) - break; - iff = entry->in(0)->as_If(); - if (iff->in(1)->Opcode() == Op_Opaque4) { - Node_Stack to_clone(2); - to_clone.push(iff->in(1), 1); - uint current = C->unique(); - Node* result = NULL; - // Look for the opaque node to replace with the init value - // and clone everything in between. We keep the Opaque4 node - // so the duplicated predicates are eliminated once loop - // opts are over: they are here only to keep the IR graph - // consistent. 
- do { - Node* n = to_clone.node(); - uint i = to_clone.index(); - Node* m = n->in(i); - int op = m->Opcode(); - if (m->is_Bool() || - m->is_Cmp() || - op == Op_AndL || - op == Op_OrL || - op == Op_RShiftL || - op == Op_LShiftL || - op == Op_AddL || - op == Op_AddI || - op == Op_MulL || - op == Op_MulI || - op == Op_SubL || - op == Op_SubI || - op == Op_ConvI2L) { - to_clone.push(m, 1); - continue; +void PhaseIdealLoop::duplicate_predicates_helper(Node* predicate, Node* castii, IdealLoopTree* outer_loop, + LoopNode* outer_main_head, uint dd_main_head) { + if (predicate != NULL) { + IfNode* iff = predicate->in(0)->as_If(); + ProjNode* uncommon_proj = iff->proj_out(1 - predicate->as_Proj()->_con); + Node* rgn = uncommon_proj->unique_ctrl_out(); + assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); + assert(iff->in(1)->in(1)->Opcode() == Op_Opaque1, "unexpected predicate shape"); + predicate = predicate->in(0)->in(0); + Node* current_proj = outer_main_head->in(LoopNode::EntryControl); + Node* prev_proj = current_proj; + while (predicate != NULL && predicate->is_Proj() && predicate->in(0)->is_If()) { + uncommon_proj = predicate->in(0)->as_If()->proj_out(1 - predicate->as_Proj()->_con); + if (uncommon_proj->unique_ctrl_out() != rgn) + break; + iff = predicate->in(0)->as_If(); + if (iff->in(1)->Opcode() == Op_Opaque4) { + Node_Stack to_clone(2); + to_clone.push(iff->in(1), 1); + uint current = C->unique(); + Node* result = NULL; + // Look for the opaque node to replace with the init value + // and clone everything in between. We keep the Opaque4 node + // so the duplicated predicates are eliminated once loop + // opts are over: they are here only to keep the IR graph + // consistent. + do { + Node* n = to_clone.node(); + uint i = to_clone.index(); + Node* m = n->in(i); + int op = m->Opcode(); + if (m->is_Bool() || + m->is_Cmp() || + op == Op_AndL || + op == Op_OrL || + op == Op_RShiftL || + op == Op_LShiftL || + op == Op_AddL || + op == Op_AddI || + op == Op_MulL || + op == Op_MulI || + op == Op_SubL || + op == Op_SubI || + op == Op_ConvI2L) { + to_clone.push(m, 1); + continue; + } + if (op == Op_Opaque1) { + if (n->_idx < current) { + n = n->clone(); } - if (op == Op_Opaque1) { - if (n->_idx < current) { - n = n->clone(); - } - n->set_req(i, castii); - register_new_node(n, min_taken); - to_clone.set_node(n); + n->set_req(i, castii); + register_new_node(n, current_proj); + to_clone.set_node(n); + } + for (;;) { + Node* cur = to_clone.node(); + uint j = to_clone.index(); + if (j+1 < cur->req()) { + to_clone.set_index(j+1); + break; } - for (;;) { - Node* cur = to_clone.node(); - uint j = to_clone.index(); - if (j+1 < cur->req()) { - to_clone.set_index(j+1); - break; - } - to_clone.pop(); - if (to_clone.size() == 0) { - result = cur; - break; - } - Node* next = to_clone.node(); - j = to_clone.index(); - if (cur->_idx >= current) { - if (next->_idx < current) { - next = next->clone(); - register_new_node(next, min_taken); - to_clone.set_node(next); - } - assert(next->in(j) != cur, "input should have been cloned"); - next->set_req(j, cur); + to_clone.pop(); + if (to_clone.size() == 0) { + result = cur; + break; + } + Node* next = to_clone.node(); + j = to_clone.index(); + if (cur->_idx >= current) { + if (next->_idx < current) { + next = next->clone(); + register_new_node(next, current_proj); + to_clone.set_node(next); } + assert(next->in(j) != cur, "input should have been cloned"); + next->set_req(j, cur); } - } while (result == NULL); - assert(result->_idx >= current, "new 
node expected"); + } + } while (result == NULL); + assert(result->_idx >= current, "new node expected"); - Node* proj = entry->clone(); - Node* other_proj = uncommon_proj->clone(); - Node* new_iff = iff->clone(); - new_iff->set_req(1, result); - proj->set_req(0, new_iff); - other_proj->set_req(0, new_iff); - Node *frame = new ParmNode(C->start(), TypeFunc::FramePtr); - register_new_node(frame, C->start()); - // It's impossible for the predicate to fail at runtime. Use - // an Halt node. - Node* halt = new HaltNode(other_proj, frame); - C->root()->add_req(halt); - new_iff->set_req(0, prev_proj); - - register_control(new_iff, outer_loop->_parent, prev_proj); - register_control(proj, outer_loop->_parent, new_iff); - register_control(other_proj, _ltree_root, new_iff); - register_control(halt, _ltree_root, other_proj); + Node* proj = predicate->clone(); + Node* other_proj = uncommon_proj->clone(); + Node* new_iff = iff->clone(); + new_iff->set_req(1, result); + proj->set_req(0, new_iff); + other_proj->set_req(0, new_iff); + Node *frame = new ParmNode(C->start(), TypeFunc::FramePtr); + register_new_node(frame, C->start()); + // It's impossible for the predicate to fail at runtime. Use + // an Halt node. + Node* halt = new HaltNode(other_proj, frame); + C->root()->add_req(halt); + new_iff->set_req(0, prev_proj); + + register_control(new_iff, outer_loop->_parent, prev_proj); + register_control(proj, outer_loop->_parent, new_iff); + register_control(other_proj, _ltree_root, new_iff); + register_control(halt, _ltree_root, other_proj); - prev_proj = proj; - } - entry = entry->in(0)->in(0); + prev_proj = proj; } + predicate = predicate->in(0)->in(0); + } + if (prev_proj != current_proj) { _igvn.replace_input_of(outer_main_head, LoopNode::EntryControl, prev_proj); set_idom(outer_main_head, prev_proj, dd_main_head); } } } +void PhaseIdealLoop::duplicate_predicates(CountedLoopNode* pre_head, Node* castii, IdealLoopTree* outer_loop, + LoopNode* outer_main_head, uint dd_main_head) { + if (UseLoopPredicate) { + Node* entry = pre_head->in(LoopNode::EntryControl); + Node* predicate = NULL; + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); + if (predicate != NULL) { + entry = entry->in(0)->in(0); + } + Node* profile_predicate = NULL; + if (UseProfiledLoopPredicate) { + profile_predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate != NULL) { + entry = skip_loop_predicates(entry); + } + } + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); + duplicate_predicates_helper(predicate, castii, outer_loop, outer_main_head, dd_main_head); + duplicate_predicates_helper(profile_predicate, castii, outer_loop, outer_main_head, dd_main_head); + } +} + //------------------------------insert_pre_post_loops-------------------------- // Insert pre and post loops. If peel_only is set, the pre-loop can not have // more iterations added. It acts as a 'peel' only, no lower-bound RCE, no @@ -1276,7 +1336,7 @@ // CastII for the main loop: Node* castii = cast_incr_before_loop( pre_incr, min_taken, main_head ); assert(castii != NULL, "no castII inserted"); - duplicate_predicates(pre_head, min_taken, castii, outer_loop, outer_main_head, dd_main_head); + duplicate_predicates(pre_head, castii, outer_loop, outer_main_head, dd_main_head); // Step B4: Shorten the pre-loop to run only 1 iteration (for now). // RCE and alignment may change this later. 
@@ -2813,7 +2873,7 @@ } if (needs_guard) { // Check for an obvious zero trip guard. - Node* inctrl = PhaseIdealLoop::skip_loop_predicates(cl->skip_predicates()); + Node* inctrl = PhaseIdealLoop::skip_all_loop_predicates(cl->skip_predicates()); if (inctrl->Opcode() == Op_IfTrue || inctrl->Opcode() == Op_IfFalse) { bool maybe_swapped = (inctrl->Opcode() == Op_IfFalse); // The test should look like just the backedge of a CountedLoop --- old/src/hotspot/share/opto/loopUnswitch.cpp 2018-05-16 09:40:14.664267454 +0200 +++ new/src/hotspot/share/opto/loopUnswitch.cpp 2018-05-16 09:40:08.549285202 +0200 @@ -138,9 +138,19 @@ Node* uniqc = proj_true->unique_ctrl_out(); Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl); Node* predicate = find_predicate(entry); + if (predicate != NULL) { + entry = skip_loop_predicates(entry); + } if (predicate != NULL && UseLoopPredicate) { // We may have two predicates, find first. - entry = find_predicate(entry->in(0)->in(0)); + Node* n = find_predicate(entry); + if (n != NULL) { + predicate = n; + entry = skip_loop_predicates(entry); + } + } + if (predicate != NULL && UseProfiledLoopPredicate) { + entry = find_predicate(entry); if (entry != NULL) predicate = entry; } if (predicate != NULL) predicate = predicate->in(0); --- old/src/hotspot/share/opto/loopnode.cpp 2018-05-16 09:40:20.964249170 +0200 +++ new/src/hotspot/share/opto/loopnode.cpp 2018-05-16 09:40:14.778267124 +0200 @@ -1252,9 +1252,7 @@ return l->outer_safepoint(); } -Node* CountedLoopNode::skip_predicates() { - if (is_main_loop()) { - Node* ctrl = skip_strip_mined()->in(LoopNode::EntryControl); +Node* CountedLoopNode::skip_predicates_from_entry(Node* ctrl) { while (ctrl != NULL && ctrl->is_Proj() && ctrl->in(0)->is_If() && ctrl->in(0)->as_If()->proj_out(1-ctrl->as_Proj()->_con)->outcnt() == 1 && ctrl->in(0)->as_If()->proj_out(1-ctrl->as_Proj()->_con)->unique_out()->Opcode() == Op_Halt) { @@ -1263,6 +1261,13 @@ return ctrl; } + +Node* CountedLoopNode::skip_predicates() { + if (is_main_loop()) { + Node* ctrl = skip_strip_mined()->in(LoopNode::EntryControl); + + return skip_predicates_from_entry(ctrl); + } return in(LoopNode::EntryControl); } @@ -2371,6 +2376,13 @@ entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); if (entry != NULL) { tty->print(" predicated"); + entry = PhaseIdealLoop::skip_loop_predicates(entry); + } + } + if (UseProfiledLoopPredicate) { + entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (entry != NULL) { + tty->print(" profile_predicated"); } } if (_head->is_CountedLoop()) { @@ -2478,11 +2490,18 @@ if (predicate_proj != NULL ) { // right pattern that can be used by loop predication assert(entry->in(0)->in(1)->in(1)->Opcode() == Op_Opaque1, "must be"); useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one - entry = entry->in(0)->in(0); + entry = skip_loop_predicates(entry); } predicate_proj = find_predicate(entry); // Predicate if (predicate_proj != NULL ) { useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one + entry = skip_loop_predicates(entry); + } + if (UseProfiledLoopPredicate) { + predicate_proj = find_predicate(entry); // Predicate + if (predicate_proj != NULL ) { + useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one + } } } @@ -4165,12 +4184,33 @@ // which can inhibit range check elimination. 
@@ -4165,12 +4184,33 @@
       // which can inhibit range check elimination.
       if (least != early) {
         Node* ctrl_out = least->unique_ctrl_out();
-        if (ctrl_out && ctrl_out->is_Loop() &&
-            least == ctrl_out->in(LoopNode::EntryControl) &&
-            (ctrl_out->is_CountedLoop() || ctrl_out->is_OuterStripMinedLoop())) {
-          Node* least_dom = idom(least);
-          if (get_loop(least_dom)->is_member(get_loop(least))) {
-            least = least_dom;
+        if (ctrl_out && ctrl_out->is_CountedLoop() &&
+            least == ctrl_out->in(LoopNode::EntryControl)) {
+          Node* new_ctrl = least;
+          // Move the node above predicates so a following pass of loop
+          // predication doesn't hoist a predicate that depends on it
+          // above that node.
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_loop_limit_check) != NULL) {
+            new_ctrl = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, new_ctrl), "least != early so we can move up the dominator tree");
+          }
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_profile_predicate) != NULL) {
+            Node* c = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, c), "least != early so we can move up the dominator tree");
+            new_ctrl = c;
+          }
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_predicate) != NULL) {
+            Node* c = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, c), "least != early so we can move up the dominator tree");
+            new_ctrl = c;
+          }
+          if (new_ctrl != ctrl_out) {
+            least = new_ctrl;
+          } else if (ctrl_out->is_CountedLoop() || ctrl_out->is_OuterStripMinedLoop()) {
+            Node* least_dom = idom(least);
+            if (get_loop(least_dom)->is_member(get_loop(least))) {
+              least = least_dom;
+            }
           }
         }
       }
--- old/src/hotspot/share/opto/loopnode.hpp	2018-05-16 09:40:27.383230541 +0200
+++ new/src/hotspot/share/opto/loopnode.hpp	2018-05-16 09:40:21.104248764 +0200
@@ -38,6 +38,7 @@
 class LoopNode;
 class Node;
 class OuterStripMinedLoopEndNode;
+class PathFrequency;
 class PhaseIdealLoop;
 class CountedLoopReserveKit;
 class VectorSet;
@@ -57,7 +58,7 @@
   // the semantics so it does not appear in the hash & cmp functions.
   virtual uint size_of() const { return sizeof(*this); }
 protected:
-  short _loop_flags;
+  uint _loop_flags;
   // Names for flag bitfields
   enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3,
          MainHasNoPreLoop=4,
@@ -73,26 +74,31 @@
          HasAtomicPostLoop=4096,
          HasRangeChecks=8192,
          IsMultiversioned=16384,
-         StripMined=32768};
+         StripMined=32768,
+         ProfileTripFailed=65536};
   char _unswitch_count;
   enum { _unswitch_max=3 };
   char _postloop_flags;
   enum { LoopNotRCEChecked = 0, LoopRCEChecked = 1, RCEPostLoop = 2 };

+  // Expected trip count from profile data
+  float _profile_trip_cnt;
+
 public:
   // Names for edge indices
   enum { Self=0, EntryControl, LoopBackControl };

-  int is_inner_loop() const { return _loop_flags & InnerLoop; }
+  uint is_inner_loop() const { return _loop_flags & InnerLoop; }
   void set_inner_loop() { _loop_flags |= InnerLoop; }

-  int range_checks_present() const { return _loop_flags & HasRangeChecks; }
-  int is_multiversioned() const { return _loop_flags & IsMultiversioned; }
-  int is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
-  int is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
+  uint range_checks_present() const { return _loop_flags & HasRangeChecks; }
+  uint is_multiversioned() const { return _loop_flags & IsMultiversioned; }
+  uint is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
+  uint is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
-  int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
-  int is_strip_mined() const { return _loop_flags & StripMined; }
+  uint partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
+  uint is_strip_mined() const { return _loop_flags & StripMined; }
+  uint is_profile_trip_failed() const { return _loop_flags & ProfileTripFailed; }

   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
   void mark_has_reductions() { _loop_flags |= HasReductions; }
@@ -105,6 +111,7 @@
   void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
   void mark_strip_mined() { _loop_flags |= StripMined; }
   void clear_strip_mined() { _loop_flags &= ~StripMined; }
+  void mark_profile_trip_failed() { _loop_flags |= ProfileTripFailed; }

   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -119,7 +126,12 @@
     _unswitch_count = val;
   }

-  LoopNode(Node *entry, Node *backedge) : RegionNode(3), _loop_flags(0), _unswitch_count(0), _postloop_flags(0) {
+  void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
+  float profile_trip_cnt() { return _profile_trip_cnt; }
+
+  LoopNode(Node *entry, Node *backedge)
+    : RegionNode(3), _loop_flags(0), _unswitch_count(0),
+      _postloop_flags(0), _profile_trip_cnt(COUNT_UNKNOWN) {
     init_class_id(Class_Loop);
     init_req(EntryControl, entry);
     init_req(LoopBackControl, backedge);
@@ -186,9 +198,6 @@
   // Known trip count calculated by compute_exact_trip_count()
   uint _trip_count;

-  // Expected trip count from profile data
-  float _profile_trip_cnt;
-
   // Log2 of original loop bodies in unrolled loop
   int _unrolled_count_log2;
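The widening of _loop_flags from short to uint above is forced by the new flag: ProfileTripFailed is bit 16, which a 16-bit short cannot hold (StripMined already occupies the sign bit). A minimal demonstration of the truncation — the variable names are invented, and the short is assumed to be 16 bits as it is on common platforms:

    #include <cstdio>

    enum { StripMined = 32768, ProfileTripFailed = 65536 };

    int main() {
      unsigned int wide = 0;   // the new representation
      short narrow = 0;        // the old one (16 bits on common platforms)

      wide   |= ProfileTripFailed;
      narrow |= ProfileTripFailed;   // bit 16 is silently truncated away

      printf("wide   has ProfileTripFailed: %d\n", (wide & ProfileTripFailed) != 0);
      printf("narrow has ProfileTripFailed: %d\n", (narrow & ProfileTripFailed) != 0);
      return 0;
    }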
@@ -203,8 +212,8 @@
 public:
   CountedLoopNode( Node *entry, Node *backedge )
     : LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
-      _profile_trip_cnt(COUNT_UNKNOWN), _unrolled_count_log2(0),
-      _node_count_before_unroll(0), _slp_maximum_unroll_factor(0) {
+      _unrolled_count_log2(0), _node_count_before_unroll(0),
+      _slp_maximum_unroll_factor(0) {
     init_class_id(Class_CountedLoop);
     // Initialize _trip_count to the largest possible value.
     // Will be reset (lower) if the loop's trip count is known.
@@ -245,16 +254,16 @@

   // A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
   // Aligned, may be missing it's pre-loop.
-  int is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
-  int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
-  int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
-  int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
-  int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
-  int was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
-  int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
-  int do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
-  int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
-  int has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
+  uint is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
+  uint is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
+  uint is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
+  uint is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
+  uint is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
+  uint was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
+  uint has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
+  uint do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
+  uint is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
+  uint has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }

   int main_idx() const { return _main_idx; }
@@ -280,9 +289,6 @@
     _loop_flags &= ~PassedSlpAnalysis;
   }

-  void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
-  float profile_trip_cnt() { return _profile_trip_cnt; }
-
   void double_unrolled_count() { _unrolled_count_log2++; }
   int unrolled_count() { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
@@ -301,6 +307,7 @@
   // If this is a main loop in a pre/main/post loop nest, walk over
   // the predicates that were inserted by
   // duplicate_predicates()/add_range_check_predicate()
+  static Node* skip_predicates_from_entry(Node* ctrl);
   Node* skip_predicates();

 #ifndef PRODUCT
@@ -588,6 +595,7 @@
   void compute_trip_count(PhaseIdealLoop* phase);

   // Compute loop trip count from profile data
+  float compute_profile_trip_cnt_helper(Node* n);
   void compute_profile_trip_cnt( PhaseIdealLoop *phase );

   // Reassociate invariant expressions.
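Note that the pre/main/post queries above compare a masked value instead of testing a single bit: Normal, Pre, Main, and Post are the four states of a two-bit subfield selected by PreMainPostFlagsMask, while flags like HasRangeChecks each occupy their own bit. A standalone sketch of the same pattern, with an invented ToyLoop class:

    #include <cstdio>

    enum { Normal = 0, Pre = 1, Main = 2, Post = 3, PreMainPostFlagsMask = 3,
           HasRangeChecks = 8192 };

    struct ToyLoop {
      unsigned int flags;
      // Two-bit subfield: compare the masked value against a state.
      unsigned int is_main_loop() const { return (flags & PreMainPostFlagsMask) == Main; }
      unsigned int is_pre_loop()  const { return (flags & PreMainPostFlagsMask) == Pre;  }
      // Independent flag: a plain single-bit test.
      unsigned int range_checks_present() const { return flags & HasRangeChecks; }
    };

    int main() {
      ToyLoop loop = { Main | HasRangeChecks };
      printf("main=%u pre=%u range_checks=%u\n",
             loop.is_main_loop(), loop.is_pre_loop(),
             loop.range_checks_present() ? 1u : 0u);
      return 0;
    }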
@@ -730,9 +738,10 @@
   }

   Node* cast_incr_before_loop(Node* incr, Node* ctrl, Node* loop);
-  void duplicate_predicates(CountedLoopNode* pre_head, Node *min_taken, Node* castii,
-                            IdealLoopTree* outer_loop, LoopNode* outer_main_head,
-                            uint dd_main_head);
+  void duplicate_predicates_helper(Node* predicate, Node* castii, IdealLoopTree* outer_loop,
+                                   LoopNode* outer_main_head, uint dd_main_head);
+  void duplicate_predicates(CountedLoopNode* pre_head, Node* castii, IdealLoopTree* outer_loop,
+                            LoopNode* outer_main_head, uint dd_main_head);

 public:
@@ -1061,6 +1070,7 @@
                                      PhaseIterGVN* igvn);
   Node* clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);

+  static Node* skip_all_loop_predicates(Node* entry);
   static Node* skip_loop_predicates(Node* entry);

   // Find a good location to insert a predicate
@@ -1075,12 +1085,20 @@
   // Implementation of the loop predication to promote checks outside the loop
   bool loop_predication_impl(IdealLoopTree *loop);
+  bool loop_predication_impl_helper(IdealLoopTree *loop, ProjNode* proj, ProjNode *predicate_proj,
+                                    CountedLoopNode *cl, ConNode* zero, Invariance& invar,
+                                    Deoptimization::DeoptReason reason);
+  bool loop_predication_should_follow_branches(IdealLoopTree *loop, ProjNode *predicate_proj, float& loop_trip_cnt);
+  void loop_predication_follow_branches(Node *c, IdealLoopTree *loop, float loop_trip_cnt,
+                                        PathFrequency& pf, Node_Stack& stack, VectorSet& seen,
+                                        Node_List& if_proj_list);
   ProjNode* insert_skeleton_predicate(IfNode* iff, IdealLoopTree *loop,
                                       ProjNode* proj, ProjNode *predicate_proj,
                                       ProjNode* upper_bound_proj,
                                       int scale, Node* offset,
                                       Node* init, Node* limit, jint stride,
-                                      Node* rng, bool& overflow);
+                                      Node* rng, bool& overflow,
+                                      Deoptimization::DeoptReason reason);
   Node* add_range_check_predicate(IdealLoopTree* loop, CountedLoopNode* cl,
                                   Node* predicate_proj, int scale_con, Node* offset,
                                   Node* limit, jint stride_con);
--- old/src/hotspot/share/opto/node.hpp	2018-05-16 09:40:33.660212324 +0200
+++ new/src/hotspot/share/opto/node.hpp	2018-05-16 09:40:27.510230172 +0200
@@ -73,6 +73,7 @@
 class FastLockNode;
 class FastUnlockNode;
 class IfNode;
+class IfProjNode;
 class IfFalseNode;
 class IfTrueNode;
 class InitializeNode;
@@ -672,8 +673,9 @@
     DEFINE_CLASS_ID(Proj, Node, 3)
       DEFINE_CLASS_ID(CatchProj, Proj, 0)
       DEFINE_CLASS_ID(JumpProj,  Proj, 1)
-      DEFINE_CLASS_ID(IfTrue,    Proj, 2)
-      DEFINE_CLASS_ID(IfFalse,   Proj, 3)
+      DEFINE_CLASS_ID(IfProj,    Proj, 2)
+        DEFINE_CLASS_ID(IfTrue,  IfProj, 0)
+        DEFINE_CLASS_ID(IfFalse, IfProj, 1)
       DEFINE_CLASS_ID(Parm,      Proj, 4)
       DEFINE_CLASS_ID(MachProj,  Proj, 5)
@@ -812,6 +814,7 @@
   DEFINE_CLASS_QUERY(FastUnlock)
   DEFINE_CLASS_QUERY(If)
   DEFINE_CLASS_QUERY(RangeCheck)
+  DEFINE_CLASS_QUERY(IfProj)
   DEFINE_CLASS_QUERY(IfFalse)
   DEFINE_CLASS_QUERY(IfTrue)
   DEFINE_CLASS_QUERY(Initialize)
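The DEFINE_CLASS_ID change above re-parents IfTrue and IfFalse under the new IfProj class ID. HotSpot class IDs are prefix-encoded bit strings, so a single mask-and-compare answers "is this node an IfProj or any subclass of it?". The hand-rolled illustration below uses an invented bit layout, not the macro's actual one:

    #include <cstdio>

    // Invented prefix encoding: each class ID extends its parent's bits.
    const unsigned Class_Proj    = 0b001;                      // 3 low bits
    const unsigned Class_IfProj  = Class_Proj   | (0b11 << 3); // 5 low bits
    const unsigned Class_IfTrue  = Class_IfProj | (0b01 << 5);
    const unsigned Class_IfFalse = Class_IfProj | (0b10 << 5);

    // One mask test matches the class and everything derived from it.
    static bool is_Proj  (unsigned id) { return (id & 0b00111) == Class_Proj;   }
    static bool is_IfProj(unsigned id) { return (id & 0b11111) == Class_IfProj; }

    int main() {
      printf("IfTrue:  is_Proj=%d is_IfProj=%d\n",
             is_Proj(Class_IfTrue), is_IfProj(Class_IfTrue));
      printf("IfFalse: is_Proj=%d is_IfProj=%d\n",
             is_Proj(Class_IfFalse), is_IfProj(Class_IfFalse));
      return 0;
    }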
--- old/src/hotspot/share/runtime/deoptimization.cpp	2018-05-16 09:40:39.790194533 +0200
+++ new/src/hotspot/share/runtime/deoptimization.cpp	2018-05-16 09:40:33.779211978 +0200
@@ -2071,7 +2071,7 @@
 // Local derived constants.
 // Further breakdown of DataLayout::trap_state, as promised by DataLayout.
-const int DS_REASON_MASK   = DataLayout::trap_mask >> 1;
+const int DS_REASON_MASK   = ((uint)DataLayout::trap_mask) >> 1;
 const int DS_RECOMPILE_BIT = DataLayout::trap_mask - DS_REASON_MASK;

 //---------------------------trap_state_reason---------------------------------
@@ -2170,6 +2170,7 @@
   "array_check",
   "intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
   "bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
+  "profile_predicate",
   "unloaded",
   "uninitialized",
   "unreached",
--- old/src/hotspot/share/runtime/deoptimization.hpp	2018-05-16 09:40:45.886176841 +0200
+++ new/src/hotspot/share/runtime/deoptimization.hpp	2018-05-16 09:40:39.906194196 +0200
@@ -41,7 +41,7 @@
   enum DeoptReason {
     Reason_many = -1,             // indicates presence of several reasons
     Reason_none = 0,              // indicates absence of a relevant deopt.
-    // Next 7 reasons are recorded per bytecode in DataLayout::trap_bits.
+    // Next 8 reasons are recorded per bytecode in DataLayout::trap_bits.
     // This is more complicated for JVMCI as JVMCI may deoptimize to *some* bytecode before the
     // bytecode that actually caused the deopt (with inlining, JVMCI may even deoptimize to a
    // bytecode in another method):
@@ -62,6 +62,8 @@
     Reason_optimized_type_check = Reason_bimorphic,
 #endif

+    Reason_profile_predicate,     // compiler generated predicate moved from frequent branch in a loop failed
+
     // recorded per method
     Reason_unloaded,              // unloaded class or constant pool entry
     Reason_uninitialized,         // bad class state (uninitialized)
@@ -92,8 +94,8 @@
     Reason_LIMIT,

     // Note:  Keep this enum in sync. with _trap_reason_name.
-    Reason_RECORDED_LIMIT = Reason_bimorphic  // some are not recorded per bc
-    // Note:  Reason_RECORDED_LIMIT should be < 8 to fit into 3 bits of
+    Reason_RECORDED_LIMIT = Reason_profile_predicate  // some are not recorded per bc
+    // Note:  Reason_RECORDED_LIMIT should fit into 31 bits of
     // DataLayout::trap_bits.  This dependency is enforced indirectly
     // via asserts, to avoid excessive direct header-to-header dependencies.
     // See Deoptimization::trap_state_reason and class DataLayout.
--- old/src/hotspot/share/runtime/vmStructs.cpp	2018-05-16 09:40:52.049158955 +0200
+++ new/src/hotspot/share/runtime/vmStructs.cpp	2018-05-16 09:40:45.979176571 +0200
@@ -295,6 +295,7 @@
   nonstatic_field(DataLayout,   _header._struct._tag,            u1)        \
   nonstatic_field(DataLayout,   _header._struct._flags,          u1)        \
   nonstatic_field(DataLayout,   _header._struct._bci,            u2)        \
+  nonstatic_field(DataLayout,   _header._struct._traps,          u4)        \
   nonstatic_field(DataLayout,   _cells[0],                       intptr_t)  \
   nonstatic_field(MethodCounters, _nmethod_age,                  int)       \
   nonstatic_field(MethodCounters, _interpreter_invocation_limit, int)       \
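The DataLayout header now carries trap history in a dedicated 32-bit _traps field, encoded as [recompile:1 | reason:31], instead of four bits shared with the flags byte. That is why DS_REASON_MASK above needs the unsigned cast: trap_mask now covers all 32 bits, and a signed right shift would smear the sign bit into the reason mask. Below is a sketch of the resulting encoding; the make_trap_state() helper is invented for illustration, not the HotSpot API:

    #include <cstdint>
    #include <cstdio>

    // Shifting the full mask as *unsigned* yields the low 31 reason
    // bits, leaving the top bit for "recompile".
    const uint32_t trap_mask        = 0xFFFFFFFFu;                // rightNBits(1+31)
    const uint32_t DS_REASON_MASK   = trap_mask >> 1;             // 0x7FFFFFFF
    const uint32_t DS_RECOMPILE_BIT = trap_mask - DS_REASON_MASK; // 0x80000000

    // Invented helper packing a trap state the way the fields above imply.
    static uint32_t make_trap_state(uint32_t reason, bool recompile) {
      return (reason & DS_REASON_MASK) | (recompile ? DS_RECOMPILE_BIT : 0u);
    }

    int main() {
      uint32_t ts = make_trap_state(/*reason=*/23u, /*recompile=*/true);
      printf("reason=%u recompile=%d\n",
             (unsigned)(ts & DS_REASON_MASK), (ts & DS_RECOMPILE_BIT) != 0u);
      return 0;
    }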
--- old/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/DataLayout.java	2018-05-16 09:40:58.410140493 +0200
+++ new/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/DataLayout.java	2018-05-16 09:40:52.197158525 +0200
@@ -47,19 +47,14 @@
    public static final int parametersTypeDataTag = 12;
    public static final int speculativeTrapDataTag = 13;

-   // The _struct._flags word is formatted as [trapState:4 | flags:4].
-   // The trap state breaks down further as [recompile:1 | reason:3].
+   // The trap state breaks down as [recompile:1 | reason:31].
    // This further breakdown is defined in deoptimization.cpp.
    // See Deoptimization.trapStateReason for an assert that
    // trapBits is big enough to hold reasons < reasonRecordedLimit.
    //
    // The trapState is collected only if ProfileTraps is true.
-   public static final int trapBits = 1+3;  // 3: enough to distinguish [0..reasonRecordedLimit].
-   public static final int trapShift = 8 - trapBits;
+   public static final int trapBits = 1+31;  // 31: enough to distinguish [0..reasonRecordedLimit].
    public static final int trapMask = Bits.rightNBits(trapBits);
-   public static final int trapMaskInPlace = (trapMask << trapShift);
-   public static final int flagLimit = trapShift;
-   public static final int flagMask = Bits.rightNBits(flagLimit);
    public static final int firstFlag = 0;

    private Address data;
@@ -97,16 +92,17 @@

    // Every data layout begins with a header.  This header
    // contains a tag, which is used to indicate the size/layout
-   // of the data, 4 bits of flags, which can be used in any way,
-   // 4 bits of trap history (none/one reason/many reasons),
+   // of the data, 8 bits of flags, which can be used in any way,
+   // 32 bits of trap history (none/one reason/many reasons),
    // and a bci, which is used to tie this piece of data to a
    // specific bci in the bytecodes.

    // union {
-   //   intptrT _bits;
+   //   u8 _bits;
    //   struct {
    //     u1 _tag;
    //     u1 _flags;
    //     u2 _bci;
+   //     u4 _traps;
    //   } _struct;
    // } _header;

@@ -119,10 +115,10 @@

    // Size computation
    static int headerSizeInBytes() {
-      return MethodData.cellSize;
+      return MethodData.cellSize * headerSizeInCells();
    }
    static int headerSizeInCells() {
-      return 1;
+      return VM.getVM().isLP64() ? 1 : 2;
    }

    static public int computeSizeInBytes(int cellCount) {
@@ -146,7 +142,7 @@
    // simplifying assumption that all N occurrences can be blamed
    // on that BCI.
    int trapState() {
-      return (flags() >> trapShift) & trapMask;
+      return data.getJIntAt(offset+4);
    }

    int flags() {