--- old/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp 2018-05-16 09:38:55.702496620 +0200 +++ new/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp 2018-05-16 09:38:49.650514185 +0200 @@ -1343,12 +1343,11 @@ __ mov_metadata(mdo, md->constant_encoding()); Address data_addr = __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, DataLayout::DataLayout::header_offset()), - LogBytesPerWord); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ ldr(rscratch1, data_addr); - __ orr(rscratch1, rscratch1, header_bits); - __ str(rscratch1, data_addr); + md->byte_offset_of_slot(data, DataLayout::flags_offset()), + 0); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); __ b(*obj_is_null); __ bind(not_null); } else { @@ -1421,7 +1420,7 @@ Address counter_addr = __ form_address(rscratch2, mdo, md->byte_offset_of_slot(data, CounterData::count_offset()), - LogBytesPerWord); + 0); __ ldr(rscratch1, counter_addr); __ sub(rscratch1, rscratch1, DataLayout::counter_increment); __ str(rscratch1, counter_addr); @@ -1470,12 +1469,11 @@ __ mov_metadata(mdo, md->constant_encoding()); Address data_addr = __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, DataLayout::header_offset()), + md->byte_offset_of_slot(data, DataLayout::flags_offset()), LogBytesPerInt); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ ldrw(rscratch1, data_addr); - __ orrw(rscratch1, rscratch1, header_bits); - __ strw(rscratch1, data_addr); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); __ b(done); __ bind(not_null); } else { --- old/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp 2018-05-16 09:39:01.834478824 +0200 +++ new/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp 2018-05-16 09:38:55.848496197 +0200 @@ -970,12 +970,11 @@ void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, int flag_byte_constant) { assert(ProfileInterpreter, "must be profiling interpreter"); - int header_offset = in_bytes(DataLayout::header_offset()); - int header_bits = DataLayout::flag_mask_to_header_mask(flag_byte_constant); + int flags_offset = in_bytes(DataLayout::flags_offset()); // Set the flag - ldr(rscratch1, Address(mdp_in, header_offset)); - orr(rscratch1, rscratch1, header_bits); - str(rscratch1, Address(mdp_in, header_offset)); + ldrb(rscratch1, Address(mdp_in, flags_offset)); + orr(rscratch1, rscratch1, flag_byte_constant); + strb(rscratch1, Address(mdp_in, flags_offset)); } --- old/src/hotspot/cpu/x86/assembler_x86.cpp 2018-05-16 09:39:13.219445782 +0200 +++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2018-05-16 09:39:01.959478461 +0200 @@ -3338,6 +3338,14 @@ emit_operand(src, dst); } +void Assembler::orb(Address dst, int imm8) { + InstructionMark im(this); + prefix(dst); + emit_int8((unsigned char)0x80); + emit_operand(rcx, dst, 1); + emit_int8(imm8); +} + void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); --- old/src/hotspot/cpu/x86/assembler_x86.hpp 2018-05-16 09:39:19.525427481 +0200 +++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2018-05-16 09:39:13.405445242 +0200 @@ -1536,6 +1536,8 @@ void orl(Register dst, Register src); void orl(Address dst, Register src); + void orb(Address dst, int imm8); + void orq(Address dst, int32_t 
imm32); void orq(Register dst, int32_t imm32); void orq(Register dst, Address src); --- old/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp 2018-05-16 09:39:25.859409098 +0200 +++ new/src/hotspot/cpu/x86/c1_LIRAssembler_x86.cpp 2018-05-16 09:39:19.667427069 +0200 @@ -1678,9 +1678,9 @@ // Object is null; update MDO and exit Register mdo = klass_RInfo; __ mov_metadata(mdo, md->constant_encoding()); - Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::header_offset())); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ orl(data_addr, header_bits); + Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::flags_offset())); + int header_bits = BitData::null_seen_byte_constant(); + __ orb(data_addr, header_bits); __ jmp(*obj_is_null); __ bind(not_null); } else { @@ -1824,9 +1824,9 @@ // Object is null; update MDO and exit Register mdo = klass_RInfo; __ mov_metadata(mdo, md->constant_encoding()); - Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::header_offset())); - int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); - __ orl(data_addr, header_bits); + Address data_addr(mdo, md->byte_offset_of_slot(data, DataLayout::flags_offset())); + int header_bits = BitData::null_seen_byte_constant(); + __ orb(data_addr, header_bits); __ jmp(done); __ bind(not_null); } else { --- old/src/hotspot/cpu/x86/interp_masm_x86.cpp 2018-05-16 09:39:32.003391267 +0200 +++ new/src/hotspot/cpu/x86/interp_masm_x86.cpp 2018-05-16 09:39:26.022408625 +0200 @@ -1435,10 +1435,10 @@ void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, int flag_byte_constant) { assert(ProfileInterpreter, "must be profiling interpreter"); - int header_offset = in_bytes(DataLayout::header_offset()); - int header_bits = DataLayout::flag_mask_to_header_mask(flag_byte_constant); + int header_offset = in_bytes(DataLayout::flags_offset()); + int header_bits = flag_byte_constant; // Set the flag - orl(Address(mdp_in, header_offset), header_bits); + orb(Address(mdp_in, header_offset), header_bits); } --- old/src/hotspot/share/oops/methodData.hpp 2018-05-16 09:39:38.151373424 +0200 +++ new/src/hotspot/share/oops/methodData.hpp 2018-05-16 09:39:32.163390802 +0200 @@ -83,16 +83,17 @@ private: // Every data layout begins with a header. This header // contains a tag, which is used to indicate the size/layout - // of the data, 4 bits of flags, which can be used in any way, - // 4 bits of trap history (none/one reason/many reasons), + // of the data, 8 bits of flags, which can be used in any way, + // 32 bits of trap history (none/one reason/many reasons), // and a bci, which is used to tie this piece of data to a // specific bci in the bytecodes. union { - intptr_t _bits; + u8 _bits; struct { u1 _tag; u1 _flags; u2 _bci; + u4 _traps; } _struct; } _header; @@ -131,28 +132,23 @@ }; enum { - // The _struct._flags word is formatted as [trap_state:4 | flags:4]. - // The trap state breaks down further as [recompile:1 | reason:3]. + // The trap state breaks down as [recompile:1 | reason:31]. // This further breakdown is defined in deoptimization.cpp. // See Deoptimization::trap_state_reason for an assert that // trap_bits is big enough to hold reasons < Reason_RECORDED_LIMIT. // // The trap_state is collected only if ProfileTraps is true. - trap_bits = 1+3, // 3: enough to distinguish [0..Reason_RECORDED_LIMIT]. - trap_shift = BitsPerByte - trap_bits, + trap_bits = 1+31, // 31: enough to distinguish [0..Reason_RECORDED_LIMIT]. 
trap_mask = right_n_bits(trap_bits), - trap_mask_in_place = (trap_mask << trap_shift), - flag_limit = trap_shift, - flag_mask = right_n_bits(flag_limit), first_flag = 0 }; // Size computation static int header_size_in_bytes() { - return cell_size; + return header_size_in_cells() * cell_size; } static int header_size_in_cells() { - return 1; + return LP64_ONLY(1) NOT_LP64(2); } static int compute_size_in_bytes(int cell_count) { @@ -167,7 +163,7 @@ return _header._struct._tag; } - // Return a few bits of trap state. Range is [0..trap_mask]. + // Return 32 bits of trap state. // The state tells if traps with zero, one, or many reasons have occurred. // It also tells whether zero or many recompilations have occurred. // The associated trap histogram in the MDO itself tells whether @@ -175,14 +171,14 @@ // occurred, and the MDO shows N occurrences of X, we make the // simplifying assumption that all N occurrences can be blamed // on that BCI. - int trap_state() const { - return ((_header._struct._flags >> trap_shift) & trap_mask); + uint trap_state() const { + return _header._struct._traps; } - void set_trap_state(int new_state) { + void set_trap_state(uint new_state) { assert(ProfileTraps, "used only under +ProfileTraps"); - uint old_flags = (_header._struct._flags & flag_mask); - _header._struct._flags = (new_state << trap_shift) | old_flags; + uint old_flags = _header._struct._traps; + _header._struct._traps = new_state | old_flags; } u1 flags() const { @@ -193,10 +189,10 @@ return _header._struct._bci; } - void set_header(intptr_t value) { + void set_header(u8 value) { _header._bits = value; } - intptr_t header() { + u8 header() { return _header._bits; } void set_cell_at(int index, intptr_t value) { @@ -207,12 +203,10 @@ return _cells[index]; } - void set_flag_at(int flag_number) { - assert(flag_number < flag_limit, "oob"); + void set_flag_at(u1 flag_number) { _header._struct._flags |= (0x1 << flag_number); } - bool flag_at(int flag_number) const { - assert(flag_number < flag_limit, "oob"); + bool flag_at(u1 flag_number) const { return (_header._struct._flags & (0x1 << flag_number)) != 0; } @@ -238,14 +232,13 @@ } #endif // CC_INTERP // Return a value which, when or-ed as a byte into _flags, sets the flag. - static int flag_number_to_byte_constant(int flag_number) { - assert(0 <= flag_number && flag_number < flag_limit, "oob"); + static u1 flag_number_to_constant(u1 flag_number) { DataLayout temp; temp.set_header(0); temp.set_flag_at(flag_number); return temp._header._struct._flags; } // Return a value which, when or-ed as a word into _header, sets the flag. 
- static intptr_t flag_mask_to_header_mask(int byte_constant) { + static u8 flag_mask_to_header_mask(uint byte_constant) { DataLayout temp; temp.set_header(0); temp._header._struct._flags = byte_constant; return temp._header._bits; @@ -364,8 +357,8 @@ static ByteSize cell_offset(int index) { return DataLayout::cell_offset(index); } - static int flag_number_to_byte_constant(int flag_number) { - return DataLayout::flag_number_to_byte_constant(flag_number); + static int flag_number_to_constant(int flag_number) { + return DataLayout::flag_number_to_constant(flag_number); } ProfileData(DataLayout* data) { @@ -574,7 +567,7 @@ // Code generation support static int null_seen_byte_constant() { - return flag_number_to_byte_constant(null_seen_flag); + return flag_number_to_constant(null_seen_flag); } static ByteSize bit_data_size() { @@ -2050,6 +2043,9 @@ protected: enum { speculative_trap_method, +#ifndef _LP64 + speculative_trap_padding, +#endif speculative_trap_cell_count }; public: @@ -2162,7 +2158,7 @@ // Whole-method sticky bits and flags enum { - _trap_hist_limit = 23 JVMCI_ONLY(+5), // decoupled from Deoptimization::Reason_LIMIT + _trap_hist_limit = 24 JVMCI_ONLY(+5), // decoupled from Deoptimization::Reason_LIMIT _trap_hist_mask = max_jubyte, _extra_data_count = 4 // extra DataLayout headers, for trap history }; // Public flag values --- old/src/hotspot/share/opto/c2_globals.hpp 2018-05-16 09:39:44.489355029 +0200 +++ new/src/hotspot/share/opto/c2_globals.hpp 2018-05-16 09:39:38.288373026 +0200 @@ -751,6 +751,9 @@ product(uintx, LoopStripMiningIterShortLoop, 0, \ "Loop with fewer iterations are not strip mined") \ range(0, max_juint) \ + \ + product(bool, UseProfiledLoopPredicate, true, \ + "move predicates out of loops based on profiling data") \ C2_FLAGS(DECLARE_DEVELOPER_FLAG, \ DECLARE_PD_DEVELOPER_FLAG, \ --- old/src/hotspot/share/opto/graphKit.cpp 2018-05-16 09:39:50.712336969 +0200 +++ new/src/hotspot/share/opto/graphKit.cpp 2018-05-16 09:39:44.626354632 +0200 @@ -3813,6 +3813,9 @@ if (UseLoopPredicate) { add_predicate_impl(Deoptimization::Reason_predicate, nargs); } + if (UseProfiledLoopPredicate) { + add_predicate_impl(Deoptimization::Reason_profile_predicate, nargs); + } // loop's limit check predicate should be near the loop. 
add_predicate_impl(Deoptimization::Reason_loop_limit_check, nargs); } --- old/src/hotspot/share/opto/loopPredicate.cpp 2018-05-16 09:40:01.910304470 +0200 +++ new/src/hotspot/share/opto/loopPredicate.cpp 2018-05-16 09:39:50.864336528 +0200 @@ -34,6 +34,8 @@ #include "opto/opaquenode.hpp" #include "opto/rootnode.hpp" #include "opto/subnode.hpp" +#include <fenv.h> +#include <float.h> /* * The general idea of Loop Predication is to insert a predicate on the entry @@ -318,18 +320,37 @@ if (limit_check_proj != NULL) { entry = entry->in(0)->in(0); } + ProjNode* profile_predicate_proj = NULL; + ProjNode* predicate_proj = NULL; + if (UseProfiledLoopPredicate) { + profile_predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate_proj != NULL) { + entry = skip_loop_predicates(entry); + } + } if (UseLoopPredicate) { - ProjNode* predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (predicate_proj != NULL) { // right pattern that can be used by loop predication - // clone predicate - new_entry = clone_predicate(predicate_proj, new_entry, - Deoptimization::Reason_predicate, - loop_phase, igvn); - assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); - if (TraceLoopPredicate) { - tty->print("Loop Predicate cloned: "); - debug_only( new_entry->in(0)->dump(); ) - } + predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); + } + if (predicate_proj != NULL) { // right pattern that can be used by loop predication + // clone predicate + new_entry = clone_predicate(predicate_proj, new_entry, + Deoptimization::Reason_predicate, + loop_phase, igvn); + assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); + if (TraceLoopPredicate) { + tty->print("Loop Predicate cloned: "); + debug_only( new_entry->in(0)->dump(); ); + } + } + if (profile_predicate_proj != NULL) { // right pattern that can be used by loop predication + // clone predicate + new_entry = clone_predicate(profile_predicate_proj, new_entry, + Deoptimization::Reason_profile_predicate, + loop_phase, igvn); + assert(new_entry != NULL && new_entry->is_Proj(), "IfTrue or IfFalse after clone predicate"); + if (TraceLoopPredicate) { + tty->print("Loop Predicate cloned: "); + debug_only( new_entry->in(0)->dump(); ); } } if (limit_check_proj != NULL && clone_limit_check) { @@ -351,25 +372,36 @@ //--------------------------skip_loop_predicates------------------------------ // Skip related predicates.
Node* PhaseIdealLoop::skip_loop_predicates(Node* entry) { + IfNode* iff = entry->in(0)->as_If(); + ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); + Node* rgn = uncommon_proj->unique_ctrl_out(); + assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); + entry = entry->in(0)->in(0); + while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { + uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); + if (uncommon_proj->unique_ctrl_out() != rgn) + break; + entry = entry->in(0)->in(0); + } + return entry; +} + +Node* PhaseIdealLoop::skip_all_loop_predicates(Node* entry) { Node* predicate = NULL; predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); if (predicate != NULL) { entry = entry->in(0)->in(0); } + if (UseProfiledLoopPredicate) { + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (predicate != NULL) { // right pattern that can be used by loop predication + entry = skip_loop_predicates(entry); + } + } if (UseLoopPredicate) { predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); if (predicate != NULL) { // right pattern that can be used by loop predication - IfNode* iff = entry->in(0)->as_If(); - ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); - Node* rgn = uncommon_proj->unique_ctrl_out(); - assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); - entry = entry->in(0)->in(0); - while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { - uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); - if (uncommon_proj->unique_ctrl_out() != rgn) - break; - entry = entry->in(0)->in(0); - } + entry = skip_loop_predicates(entry); } } return entry; @@ -400,6 +432,12 @@ return entry; } } + if (UseProfiledLoopPredicate) { + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (predicate != NULL) { // right pattern that can be used by loop predication + return entry; + } + } return NULL; } @@ -766,6 +804,413 @@ return bol; } +// Should loop predication look not only in the path from tail to head +// but also in branches of the loop body? 
+bool PhaseIdealLoop::loop_predication_should_follow_branches(IdealLoopTree *loop, ProjNode *predicate_proj, float& loop_trip_cnt) { + if (!UseProfiledLoopPredicate) { + return false; + } + + if (predicate_proj == NULL) { + return false; + } + + LoopNode* head = loop->_head->as_Loop(); + bool follow_branches = true; + IdealLoopTree* l = loop->_child; + // For leaf loops and loops with a single inner loop + while (l != NULL && follow_branches) { + IdealLoopTree* child = l; + if (child->_child != NULL && + child->_head->is_OuterStripMinedLoop()) { + assert(child->_child->_next == NULL, "only one inner loop for strip mined loop"); + assert(child->_child->_head->is_CountedLoop() && child->_child->_head->as_CountedLoop()->is_strip_mined(), "inner loop should be strip mined"); + child = child->_child; + } + if (child->_child != NULL || child->_irreducible) { + follow_branches = false; + } + l = l->_next; + } + if (follow_branches) { + loop->compute_profile_trip_cnt(this); + if (head->is_profile_trip_failed()) { + follow_branches = false; + } else { + loop_trip_cnt = head->profile_trip_cnt(); + if (head->is_CountedLoop()) { + CountedLoopNode* cl = head->as_CountedLoop(); + if (cl->phi() != NULL) { + const TypeInt* t = _igvn.type(cl->phi())->is_int(); + float worst_case_trip_cnt = ((float)t->_hi - t->_lo) / ABS(cl->stride_con()); + if (worst_case_trip_cnt < loop_trip_cnt) { + loop_trip_cnt = worst_case_trip_cnt; + } + } + } + } + } + return follow_branches; +} + +// Compute probability of reaching some CFG node from a fixed +// dominating CFG node +class PathFrequency { +private: + Node* _dom; // frequencies are computed relative to this node + Node_Stack _stack; + GrowableArray<float> _freqs_stack; // keep track of intermediate result at regions + GrowableArray<float> _freqs; // cache frequencies + PhaseIdealLoop* _phase; + +public: + PathFrequency(Node* dom, PhaseIdealLoop* phase) + : _dom(dom), _stack(0), _phase(phase) { + } + + float to(Node* n) { + // post order walk on the CFG graph from n to _dom + fesetround(FE_TOWARDZERO); // make sure rounding doesn't push frequency above 1 + IdealLoopTree* loop = _phase->get_loop(_dom); + Node* c = n; + for (;;) { + assert(_phase->get_loop(c) == loop, "have to be in the same loop"); + if (c == _dom || _freqs.at_grow(c->_idx, -1) >= 0) { + float f = c == _dom ?
1 : _freqs.at(c->_idx); + Node* prev = c; + while (_stack.size() > 0 && prev == c) { + Node* n = _stack.node(); + if (!n->is_Region()) { + if (_phase->get_loop(n) != _phase->get_loop(n->in(0))) { + // Found an inner loop: compute frequency of reaching this + // exit from the loop head by looking at the number of + // times each loop exit was taken + IdealLoopTree* inner_loop = _phase->get_loop(n->in(0)); + LoopNode* inner_head = inner_loop->_head->as_Loop(); + assert(_phase->get_loop(n) == loop, "only 1 inner loop"); + if (inner_head->is_OuterStripMinedLoop()) { + inner_head->verify_strip_mined(1); + if (n->in(0) == inner_head->in(LoopNode::LoopBackControl)->in(0)) { + n = n->in(0)->in(0)->in(0); + } + inner_loop = inner_loop->_child; + inner_head = inner_loop->_head->as_Loop(); + inner_head->verify_strip_mined(1); + } + fesetround(FE_UPWARD); // make sure rounding doesn't push frequency above 1 + float loop_exit_cnt = 0.0f; + for (uint i = 0; i < inner_loop->_body.size(); i++) { + Node *n = inner_loop->_body[i]; + float c = inner_loop->compute_profile_trip_cnt_helper(n); + loop_exit_cnt += c; + } + fesetround(FE_TOWARDZERO); + float cnt = -1; + if (n->in(0)->is_If()) { + IfNode* iff = n->in(0)->as_If(); + float p = n->in(0)->as_If()->_prob; + if (n->Opcode() == Op_IfFalse) { + p = 1 - p; + } + if (p > PROB_MIN) { + cnt = p * iff->_fcnt; + } else { + cnt = 0; + } + } else { + assert(n->in(0)->is_Jump(), "unsupported node kind"); + JumpNode* jmp = n->in(0)->as_Jump(); + float p = n->in(0)->as_Jump()->_probs[n->as_JumpProj()->_con]; + cnt = p * jmp->_fcnt; + } + float this_exit_f = cnt > 0 ? cnt / loop_exit_cnt : 0; + assert(this_exit_f <= 1 && this_exit_f >= 0, "Incorrect frequency"); + f = f * this_exit_f; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + } else { + float p = -1; + if (n->in(0)->is_If()) { + p = n->in(0)->as_If()->_prob; + if (n->Opcode() == Op_IfFalse) { + p = 1 - p; + } + } else { + assert(n->in(0)->is_Jump(), "unsupported node kind"); + p = n->in(0)->as_Jump()->_probs[n->as_JumpProj()->_con]; + } + f = f * p; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + } + _freqs.at_put_grow(n->_idx, (float)f, -1); + _stack.pop(); + } else { + float prev_f = _freqs_stack.pop(); + float new_f = f; + f = new_f + prev_f; + assert(f <= 1 && f >= 0, "Incorrect frequency"); + uint i = _stack.index(); + if (i < n->req()) { + c = n->in(i); + _stack.set_index(i+1); + _freqs_stack.push(f); + } else { + _freqs.at_put_grow(n->_idx, f, -1); + _stack.pop(); + } + } + } + if (_stack.size() == 0) { + fesetround(FE_TONEAREST); + assert(f >= 0 && f <= 1, "should have been computed"); + return f; + } + } else if (c->is_Loop()) { + ShouldNotReachHere(); + c = c->in(LoopNode::EntryControl); + } else if (c->is_Region()) { + _freqs_stack.push(0); + _stack.push(c, 2); + c = c->in(1); + } else { + if (c->is_IfProj()) { + IfNode* iff = c->in(0)->as_If(); + if (iff->_prob == PROB_UNKNOWN) { + // assume never taken + _freqs.at_put_grow(c->_idx, 0, -1); + } else if (_phase->get_loop(c) != _phase->get_loop(iff)) { + if (iff->_fcnt == COUNT_UNKNOWN) { + // assume never taken + _freqs.at_put_grow(c->_idx, 0, -1); + } else { + // skip over loop + _stack.push(c, 1); + c = _phase->get_loop(c->in(0))->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl); + } + } else { + _stack.push(c, 1); + c = iff; + } + } else if (c->is_JumpProj()) { + JumpNode* jmp = c->in(0)->as_Jump(); + if (_phase->get_loop(c) != _phase->get_loop(jmp)) { + if (jmp->_fcnt == COUNT_UNKNOWN) { + // assume never taken + 
_freqs.at_put_grow(c->_idx, 0, -1); + } else { + // skip over loop + _stack.push(c, 1); + c = _phase->get_loop(c->in(0))->_head->as_Loop()->skip_strip_mined()->in(LoopNode::EntryControl); + } + } else { + _stack.push(c, 1); + c = jmp; + } + } else if (c->Opcode() == Op_CatchProj && + c->in(0)->Opcode() == Op_Catch && + c->in(0)->in(0)->is_Proj() && + c->in(0)->in(0)->in(0)->is_Call()) { + // assume exceptions are never thrown + uint con = c->as_Proj()->_con; + if (con == CatchProjNode::fall_through_index) { + Node* call = c->in(0)->in(0)->in(0)->in(0); + if (_phase->get_loop(call) != _phase->get_loop(c)) { + _freqs.at_put_grow(c->_idx, 0, -1); + } else { + c = call; + } + } else { + assert(con >= CatchProjNode::catch_all_index, "what else?"); + _freqs.at_put_grow(c->_idx, 0, -1); + } + } else if (c->unique_ctrl_out() == NULL && !c->is_If() && !c->is_Jump()) { + ShouldNotReachHere(); + } else { + c = c->in(0); + } + } + } + ShouldNotReachHere(); + return -1; + } +}; + +void PhaseIdealLoop::loop_predication_follow_branches(Node *n, IdealLoopTree *loop, float loop_trip_cnt, + PathFrequency& pf, Node_Stack& stack, VectorSet& seen, + Node_List& if_proj_list) { + assert(n->is_Region(), "start from a region"); + Node* tail = loop->tail(); + stack.push(n, 1); + do { + Node* c = stack.node(); + assert(c->is_Region() || c->is_IfProj(), "only region here"); + uint i = stack.index(); + + if (i < c->req()) { + stack.set_index(i+1); + Node* in = c->in(i); + while (!is_dominator(in, tail) && !seen.test_set(in->_idx)) { + IdealLoopTree* in_loop = get_loop(in); + if (in_loop != loop) { + in = in_loop->_head->in(LoopNode::EntryControl); + } else if (in->is_Region()) { + stack.push(in, 1); + break; + } else if (in->is_IfProj() && + in->as_Proj()->is_uncommon_trap_if_pattern(Deoptimization::Reason_none)) { + if (pf.to(in) * loop_trip_cnt >= 1) { + stack.push(in, 1); + } + in = in->in(0); + } else { + in = in->in(0); + } + } + } else { + if (c->is_IfProj()) { + if_proj_list.push(c); + } + stack.pop(); + } + + } while (stack.size() > 0); +} + + +bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree *loop, ProjNode* proj, ProjNode *predicate_proj, + CountedLoopNode *cl, ConNode* zero, Invariance& invar, + Deoptimization::DeoptReason reason) { + // Following are changed to nonnull when a predicate can be hoisted + ProjNode* new_predicate_proj = NULL; + IfNode* iff = proj->in(0)->as_If(); + Node* test = iff->in(1); + if (!test->is_Bool()){ //Conv2B, ... + return false; + } + BoolNode* bol = test->as_Bool(); + if (invar.is_invariant(bol)) { + // Invariant test + new_predicate_proj = create_new_if_for_predicate(predicate_proj, NULL, + reason, + iff->Opcode()); + Node* ctrl = new_predicate_proj->in(0)->as_If()->in(0); + BoolNode* new_predicate_bol = invar.clone(bol, ctrl)->as_Bool(); + + // Negate test if necessary + bool negated = false; + if (proj->_con != predicate_proj->_con) { + new_predicate_bol = new BoolNode(new_predicate_bol->in(1), new_predicate_bol->_test.negate()); + register_new_node(new_predicate_bol, ctrl); + negated = true; + } + IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If(); + _igvn.hash_delete(new_predicate_iff); + new_predicate_iff->set_req(1, new_predicate_bol); +#ifndef PRODUCT + if (TraceLoopPredicate) { + tty->print("Predicate invariant if%s: %d ", negated ? 
" negated" : "", new_predicate_iff->_idx); + loop->dump_head(); + } else if (TraceLoopOpts) { + tty->print("Predicate IC "); + loop->dump_head(); + } +#endif + } else if (cl != NULL && loop->is_range_check_if(iff, this, invar)) { + // Range check for counted loops + const Node* cmp = bol->in(1)->as_Cmp(); + Node* idx = cmp->in(1); + assert(!invar.is_invariant(idx), "index is variant"); + Node* rng = cmp->in(2); + assert(rng->Opcode() == Op_LoadRange || iff->is_RangeCheck() || _igvn.type(rng)->is_int()->_lo >= 0, "must be"); + assert(invar.is_invariant(rng), "range must be invariant"); + int scale = 1; + Node* offset = zero; + bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); + assert(ok, "must be index expression"); + + Node* init = cl->init_trip(); + // Limit is not exact. + // Calculate exact limit here. + // Note, counted loop's test is '<' or '>'. + Node* limit = exact_limit(loop); + int stride = cl->stride()->get_int(); + + // Build if's for the upper and lower bound tests. The + // lower_bound test will dominate the upper bound test and all + // cloned or created nodes will use the lower bound test as + // their declared control. + + // Perform cloning to keep Invariance state correct since the + // late schedule will place invariant things in the loop. + Node *ctrl = predicate_proj->in(0)->as_If()->in(0); + rng = invar.clone(rng, ctrl); + if (offset && offset != zero) { + assert(invar.is_invariant(offset), "offset must be loop invariant"); + offset = invar.clone(offset, ctrl); + } + // If predicate expressions may overflow in the integer range, longs are used. + bool overflow = false; + + // Test the lower bound + BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow); + // Negate test if necessary + bool negated = false; + if (proj->_con != predicate_proj->_con) { + lower_bound_bol = new BoolNode(lower_bound_bol->in(1), lower_bound_bol->_test.negate()); + register_new_node(lower_bound_bol, ctrl); + negated = true; + } + ProjNode* lower_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); + IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); + _igvn.hash_delete(lower_bound_iff); + lower_bound_iff->set_req(1, lower_bound_bol); + if (TraceLoopPredicate) tty->print_cr("lower bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); + + // Test the upper bound + BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true, overflow); + negated = false; + if (proj->_con != predicate_proj->_con) { + upper_bound_bol = new BoolNode(upper_bound_bol->in(1), upper_bound_bol->_test.negate()); + register_new_node(upper_bound_bol, ctrl); + negated = true; + } + ProjNode* upper_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); + assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate"); + IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); + _igvn.hash_delete(upper_bound_iff); + upper_bound_iff->set_req(1, upper_bound_bol); + if (TraceLoopPredicate) tty->print_cr("upper bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); + + // Fall through into rest of the clean up code which will move + // any dependent nodes onto the upper bound test. 
+ new_predicate_proj = upper_bound_proj; + + if (iff->is_RangeCheck()) { + new_predicate_proj = insert_skeleton_predicate(iff, loop, proj, predicate_proj, upper_bound_proj, scale, offset, init, limit, stride, rng, overflow, reason); + } + +#ifndef PRODUCT + if (TraceLoopOpts && !TraceLoopPredicate) { + tty->print("Predicate RC "); + loop->dump_head(); + } +#endif + } else { + // Loop variant check (for example, range check in non-counted loop) + // with uncommon trap. + return false; + } + assert(new_predicate_proj != NULL, "sanity"); + // Success - attach condition (new_predicate_bol) to predicate if + invar.map_ctrl(proj, new_predicate_proj); // so that invariance test can be appropriate + + // Eliminate the old If in the loop body + dominated_by( new_predicate_proj, iff, proj->_con != new_predicate_proj->_con ); + + C->set_major_progress(); + return true; +} + + // After pre/main/post loops are created, we'll put a copy of some // range checks between the pre and main loop to validate the initial // value of the induction variable for the main loop. Make a copy of @@ -776,14 +1221,15 @@ ProjNode* upper_bound_proj, int scale, Node* offset, Node* init, Node* limit, jint stride, - Node* rng, bool &overflow) { + Node* rng, bool &overflow, + Deoptimization::DeoptReason reason) { assert(proj->_con && predicate_proj->_con, "not a range check?"); Node* opaque_init = new Opaque1Node(C, init); register_new_node(opaque_init, upper_bound_proj); BoolNode* bol = rc_predicate(loop, upper_bound_proj, scale, offset, opaque_init, limit, stride, rng, (stride > 0) != (scale > 0), overflow); Node* opaque_bol = new Opaque4Node(C, bol, _igvn.intcon(1)); // This will go away once loop opts are over register_new_node(opaque_bol, upper_bound_proj); - ProjNode* new_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); + ProjNode* new_proj = create_new_if_for_predicate(predicate_proj, NULL, reason, overflow ? Op_If : iff->Opcode()); _igvn.replace_input_of(new_proj->in(0), 1, opaque_bol); assert(opaque_init->outcnt() > 0, "should be used"); return new_proj; @@ -821,13 +1267,32 @@ } Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl); + ProjNode *loop_limit_proj = NULL; ProjNode *predicate_proj = NULL; + ProjNode *profile_predicate_proj = NULL; // Loop limit check predicate should be near the loop. 
- predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); - if (predicate_proj != NULL) - entry = predicate_proj->in(0)->in(0); + loop_limit_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); + if (loop_limit_proj != NULL) { + entry = loop_limit_proj->in(0)->in(0); + } + bool has_profile_predicates = false; + profile_predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate_proj != NULL) { + Node* n = skip_loop_predicates(entry); + // Check if predicates were already added to the profile predicate + // block + if (n != entry->in(0)->in(0)) { + has_profile_predicates = true; + } + entry = n; + } predicate_proj = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (!predicate_proj) { + + float loop_trip_cnt = -1; + bool follow_branches = loop_predication_should_follow_branches(loop, profile_predicate_proj, loop_trip_cnt); + assert(!follow_branches || loop_trip_cnt >= 0, "negative trip count?"); + + if (predicate_proj == NULL && !follow_branches) { #ifndef PRODUCT if (TraceLoopPredicate) { tty->print("missing predicate:"); @@ -846,7 +1311,11 @@ // Create list of if-projs such that a newer proj dominates all older // projs in the list, and they all dominate loop->tail() Node_List if_proj_list(area); + Node_List regions(area); Node *current_proj = loop->tail(); //start from tail + + + Node_List controls(area); while (current_proj != head) { if (loop == get_loop(current_proj) && // still in the loop ? current_proj->is_Proj() && // is a projection ? @@ -854,161 +1323,79 @@ current_proj->in(0)->Opcode() == Op_RangeCheck)) { // is a if projection ? if_proj_list.push(current_proj); } + if (follow_branches && + current_proj->Opcode() == Op_Region && + loop == get_loop(current_proj)) { + regions.push(current_proj); + } current_proj = idom(current_proj); } bool hoisted = false; // true if at least one proj is promoted - while (if_proj_list.size() > 0) { - // Following are changed to nonnull when a predicate can be hoisted - ProjNode* new_predicate_proj = NULL; - - ProjNode* proj = if_proj_list.pop()->as_Proj(); - IfNode* iff = proj->in(0)->as_If(); - - if (!proj->is_uncommon_trap_if_pattern(Deoptimization::Reason_none)) { - if (loop->is_loop_exit(iff)) { - // stop processing the remaining projs in the list because the execution of them - // depends on the condition of "iff" (iff->in(1)). + + if (!has_profile_predicates) { + while (if_proj_list.size() > 0) { + Node* n = if_proj_list.pop(); + + ProjNode* proj = n->as_Proj(); + IfNode* iff = proj->in(0)->as_If(); + + CallStaticJavaNode* call = proj->is_uncommon_trap_if_pattern(Deoptimization::Reason_none); + if (call == NULL) { + if (loop->is_loop_exit(iff)) { + // stop processing the remaining projs in the list because the execution of them + // depends on the condition of "iff" (iff->in(1)). + break; + } else { + // Both arms are inside the loop. There are two cases: + // (1) there is one backward branch. In this case, any remaining proj + // in the if_proj list post-dominates "iff". So, the condition of "iff" + // does not determine the execution the remining projs directly, and we + // can safely continue. + // (2) both arms are forwarded, i.e. a diamond shape. In this case, "proj" + // does not dominate loop->tail(), so it can not be in the if_proj list. 
+ continue; + } + } + Deoptimization::DeoptReason reason = Deoptimization::trap_request_reason(call->uncommon_trap_request()); + if (reason == Deoptimization::Reason_predicate) { break; - } else { - // Both arms are inside the loop. There are two cases: - // (1) there is one backward branch. In this case, any remaining proj - // in the if_proj list post-dominates "iff". So, the condition of "iff" - // does not determine the execution the remining projs directly, and we - // can safely continue. - // (2) both arms are forwarded, i.e. a diamond shape. In this case, "proj" - // does not dominate loop->tail(), so it can not be in the if_proj list. - continue; } - } - Node* test = iff->in(1); - if (!test->is_Bool()){ //Conv2B, ... - continue; - } - BoolNode* bol = test->as_Bool(); - if (invar.is_invariant(bol)) { - // Invariant test - new_predicate_proj = create_new_if_for_predicate(predicate_proj, NULL, - Deoptimization::Reason_predicate, - iff->Opcode()); - Node* ctrl = new_predicate_proj->in(0)->as_If()->in(0); - BoolNode* new_predicate_bol = invar.clone(bol, ctrl)->as_Bool(); - - // Negate test if necessary - bool negated = false; - if (proj->_con != predicate_proj->_con) { - new_predicate_bol = new BoolNode(new_predicate_bol->in(1), new_predicate_bol->_test.negate()); - register_new_node(new_predicate_bol, ctrl); - negated = true; - } - IfNode* new_predicate_iff = new_predicate_proj->in(0)->as_If(); - _igvn.hash_delete(new_predicate_iff); - new_predicate_iff->set_req(1, new_predicate_bol); -#ifndef PRODUCT - if (TraceLoopPredicate) { - tty->print("Predicate invariant if%s: %d ", negated ? " negated" : "", new_predicate_iff->_idx); - loop->dump_head(); - } else if (TraceLoopOpts) { - tty->print("Predicate IC "); - loop->dump_head(); + if (predicate_proj != NULL) { + hoisted = loop_predication_impl_helper(loop, proj, predicate_proj, cl, zero, invar, Deoptimization::Reason_predicate) | hoisted; } -#endif - } else if (cl != NULL && loop->is_range_check_if(iff, this, invar)) { - // Range check for counted loops - const Node* cmp = bol->in(1)->as_Cmp(); - Node* idx = cmp->in(1); - assert(!invar.is_invariant(idx), "index is variant"); - Node* rng = cmp->in(2); - assert(rng->Opcode() == Op_LoadRange || iff->is_RangeCheck() || _igvn.type(rng)->is_int()->_lo >= 0, "must be"); - assert(invar.is_invariant(rng), "range must be invariant"); - int scale = 1; - Node* offset = zero; - bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); - assert(ok, "must be index expression"); - - Node* init = cl->init_trip(); - // Limit is not exact. - // Calculate exact limit here. - // Note, counted loop's test is '<' or '>'. - Node* limit = exact_limit(loop); - int stride = cl->stride()->get_int(); - - // Build if's for the upper and lower bound tests. The - // lower_bound test will dominate the upper bound test and all - // cloned or created nodes will use the lower bound test as - // their declared control. - - // Perform cloning to keep Invariance state correct since the - // late schedule will place invariant things in the loop. - Node *ctrl = predicate_proj->in(0)->as_If()->in(0); - rng = invar.clone(rng, ctrl); - if (offset && offset != zero) { - assert(invar.is_invariant(offset), "offset must be loop invariant"); - offset = invar.clone(offset, ctrl); - } - // If predicate expressions may overflow in the integer range, longs are used. 
- bool overflow = false; - - // Test the lower bound - BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow); - // Negate test if necessary - bool negated = false; - if (proj->_con != predicate_proj->_con) { - lower_bound_bol = new BoolNode(lower_bound_bol->in(1), lower_bound_bol->_test.negate()); - register_new_node(lower_bound_bol, ctrl); - negated = true; - } - ProjNode* lower_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); - IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); - _igvn.hash_delete(lower_bound_iff); - lower_bound_iff->set_req(1, lower_bound_bol); - if (TraceLoopPredicate) tty->print_cr("lower bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); - - // Test the upper bound - BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true, overflow); - negated = false; - if (proj->_con != predicate_proj->_con) { - upper_bound_bol = new BoolNode(upper_bound_bol->in(1), upper_bound_bol->_test.negate()); - register_new_node(upper_bound_bol, ctrl); - negated = true; - } - ProjNode* upper_bound_proj = create_new_if_for_predicate(predicate_proj, NULL, Deoptimization::Reason_predicate, overflow ? Op_If : iff->Opcode()); - assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate"); - IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); - _igvn.hash_delete(upper_bound_iff); - upper_bound_iff->set_req(1, upper_bound_bol); - if (TraceLoopPredicate) tty->print_cr("upper bound check if: %s %d ", negated ? " negated" : "", lower_bound_iff->_idx); - - // Fall through into rest of the clean up code which will move - // any dependent nodes onto the upper bound test. - new_predicate_proj = upper_bound_proj; + } // end while + } - if (iff->is_RangeCheck()) { - new_predicate_proj = insert_skeleton_predicate(iff, loop, proj, predicate_proj, upper_bound_proj, scale, offset, init, limit, stride, rng, overflow); + Node_List if_proj_list_freq(area); + if (follow_branches) { + PathFrequency pf(loop->_head, this); + + // Some projections were skipped by regular predicates because of + // an early loop exit. Try them with profile data. + while (if_proj_list.size() > 0) { + Node* proj = if_proj_list.pop(); + float f = pf.to(proj); + if (proj->as_Proj()->is_uncommon_trap_if_pattern(Deoptimization::Reason_none) && + f * loop_trip_cnt >= 1) { + hoisted = loop_predication_impl_helper(loop, proj->as_Proj(), profile_predicate_proj, cl, zero, invar, Deoptimization::Reason_profile_predicate) | hoisted; } + } -#ifndef PRODUCT - if (TraceLoopOpts && !TraceLoopPredicate) { - tty->print("Predicate RC "); - loop->dump_head(); - } -#endif - } else { - // Loop variant check (for example, range check in non-counted loop) - // with uncommon trap. 
- continue; - } - assert(new_predicate_proj != NULL, "sanity"); - // Success - attach condition (new_predicate_bol) to predicate if - invar.map_ctrl(proj, new_predicate_proj); // so that invariance test can be appropriate - - // Eliminate the old If in the loop body - dominated_by( new_predicate_proj, iff, proj->_con != new_predicate_proj->_con ); - - hoisted = true; - C->set_major_progress(); - } // end while + // And look into all branches + Node_Stack stack(0); + VectorSet seen(Thread::current()->resource_area()); + while (regions.size() > 0) { + Node* c = regions.pop(); + loop_predication_follow_branches(c, loop, loop_trip_cnt, pf, stack, seen, if_proj_list_freq); + } + + for (uint i = 0; i < if_proj_list_freq.size(); i++) { + ProjNode* proj = if_proj_list_freq.at(i)->as_Proj(); + hoisted = loop_predication_impl_helper(loop, proj, profile_predicate_proj, cl, zero, invar, Deoptimization::Reason_profile_predicate) | hoisted; + } + } #ifndef PRODUCT // report that the loop predication has been actually performed --- old/src/hotspot/share/opto/loopTransform.cpp 2018-05-16 09:40:08.372285715 +0200 +++ new/src/hotspot/share/opto/loopTransform.cpp 2018-05-16 09:40:02.030304121 +0200 @@ -135,11 +135,45 @@ //------------------------------compute_profile_trip_cnt---------------------------- // Compute loop trip count from profile data as // (backedge_count + loop_exit_count) / loop_exit_count -void IdealLoopTree::compute_profile_trip_cnt( PhaseIdealLoop *phase ) { - if (!_head->is_CountedLoop()) { + +float IdealLoopTree::compute_profile_trip_cnt_helper(Node* n) { + if (n->is_If()) { + IfNode *iff = n->as_If(); + if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) { + Node *exit = is_loop_exit(iff); + if (exit) { + float exit_prob = iff->_prob; + if (exit->Opcode() == Op_IfFalse) exit_prob = 1.0 - exit_prob; + if (exit_prob > PROB_MIN) { + float exit_cnt = iff->_fcnt * exit_prob; + return exit_cnt; + } + } + } + } + if (n->is_Jump()) { + JumpNode *jmp = n->as_Jump(); + if (jmp->_fcnt != COUNT_UNKNOWN) { + float* probs = jmp->_probs; + float exit_prob = 0; + PhaseIdealLoop *phase = _phase; + for (DUIterator_Fast imax, i = jmp->fast_outs(imax); i < imax; i++) { + JumpProjNode* u = jmp->fast_out(i)->as_JumpProj(); + if (!is_member(_phase->get_loop(u))) { + exit_prob += probs[u->_con]; + } + } + return exit_prob * jmp->_fcnt; + } + } + return 0; +} + +void IdealLoopTree::compute_profile_trip_cnt(PhaseIdealLoop *phase) { + if (!_head->is_Loop()) { return; } - CountedLoopNode* head = _head->as_CountedLoop(); + LoopNode* head = _head->as_Loop(); if (head->profile_trip_cnt() != COUNT_UNKNOWN) { return; // Already computed } @@ -151,7 +185,8 @@ back->in(0) && back->in(0)->is_If() && back->in(0)->as_If()->_fcnt != COUNT_UNKNOWN && - back->in(0)->as_If()->_prob != PROB_UNKNOWN) { + back->in(0)->as_If()->_prob != PROB_UNKNOWN && + (back->Opcode() == Op_IfTrue ? 1-back->in(0)->as_If()->_prob : back->in(0)->as_If()->_prob) > PROB_MIN) { break; } back = phase->idom(back); @@ -160,26 +195,34 @@ assert((back->Opcode() == Op_IfTrue || back->Opcode() == Op_IfFalse) && back->in(0), "if-projection exists"); IfNode* back_if = back->in(0)->as_If(); - float loop_back_cnt = back_if->_fcnt * back_if->_prob; + float loop_back_cnt = back_if->_fcnt * (back->Opcode() == Op_IfTrue ? 
back_if->_prob : (1 - back_if->_prob)); // Now compute a loop exit count float loop_exit_cnt = 0.0f; - for( uint i = 0; i < _body.size(); i++ ) { - Node *n = _body[i]; - if( n->is_If() ) { - IfNode *iff = n->as_If(); - if( iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN ) { - Node *exit = is_loop_exit(iff); - if( exit ) { - float exit_prob = iff->_prob; - if (exit->Opcode() == Op_IfFalse) exit_prob = 1.0 - exit_prob; - if (exit_prob > PROB_MIN) { - float exit_cnt = iff->_fcnt * exit_prob; - loop_exit_cnt += exit_cnt; + if (_child == NULL) { + for( uint i = 0; i < _body.size(); i++ ) { + Node *n = _body[i]; + loop_exit_cnt += compute_profile_trip_cnt_helper(n); + } + } else { + ResourceMark rm; + Unique_Node_List wq; + wq.push(back); + for (uint i = 0; i < wq.size(); i++) { + Node *n = wq.at(i); + assert(n->is_CFG(), "only control nodes"); + if (n != head) { + if (n->is_Region()) { + for (uint j = 1; j < n->req(); j++) { + wq.push(n->in(j)); } + } else { + loop_exit_cnt += compute_profile_trip_cnt_helper(n); + wq.push(n->in(0)); } } } + } if (loop_exit_cnt > 0.0f) { trip_cnt = (loop_back_cnt + loop_exit_cnt) / loop_exit_cnt; @@ -187,6 +230,8 @@ // No exit count so use trip_cnt = loop_back_cnt; } + } else { + head->mark_profile_trip_failed(); } #ifndef PRODUCT if (TraceProfileTripCount) { @@ -1014,125 +1059,140 @@ // the control paths must die too but the range checks were removed by // predication. The range checks that we add here guarantee that they // do. -void PhaseIdealLoop::duplicate_predicates(CountedLoopNode* pre_head, Node* min_taken, Node* castii, - IdealLoopTree* outer_loop, LoopNode* outer_main_head, - uint dd_main_head) { - if (UseLoopPredicate) { - Node* entry = pre_head->in(LoopNode::EntryControl); - Node* predicate = NULL; - predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); - if (predicate != NULL) { - entry = entry->in(0)->in(0); - } - predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); - if (predicate != NULL) { - IfNode* iff = entry->in(0)->as_If(); - ProjNode* uncommon_proj = iff->proj_out(1 - entry->as_Proj()->_con); - Node* rgn = uncommon_proj->unique_ctrl_out(); - assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); - assert(iff->in(1)->in(1)->Opcode() == Op_Opaque1, "unexpected predicate shape"); - entry = entry->in(0)->in(0); - Node* prev_proj = min_taken; - while (entry != NULL && entry->is_Proj() && entry->in(0)->is_If()) { - uncommon_proj = entry->in(0)->as_If()->proj_out(1 - entry->as_Proj()->_con); - if (uncommon_proj->unique_ctrl_out() != rgn) - break; - iff = entry->in(0)->as_If(); - if (iff->in(1)->Opcode() == Op_Opaque4) { - Node_Stack to_clone(2); - to_clone.push(iff->in(1), 1); - uint current = C->unique(); - Node* result = NULL; - // Look for the opaque node to replace with the init value - // and clone everything in between. We keep the Opaque4 node - // so the duplicated predicates are eliminated once loop - // opts are over: they are here only to keep the IR graph - // consistent. 
- do { - Node* n = to_clone.node(); - uint i = to_clone.index(); - Node* m = n->in(i); - int op = m->Opcode(); - if (m->is_Bool() || - m->is_Cmp() || - op == Op_AndL || - op == Op_OrL || - op == Op_RShiftL || - op == Op_LShiftL || - op == Op_AddL || - op == Op_AddI || - op == Op_MulL || - op == Op_MulI || - op == Op_SubL || - op == Op_SubI || - op == Op_ConvI2L) { - to_clone.push(m, 1); - continue; +void PhaseIdealLoop::duplicate_predicates_helper(Node* predicate, Node* castii, IdealLoopTree* outer_loop, + LoopNode* outer_main_head, uint dd_main_head) { + if (predicate != NULL) { + IfNode* iff = predicate->in(0)->as_If(); + ProjNode* uncommon_proj = iff->proj_out(1 - predicate->as_Proj()->_con); + Node* rgn = uncommon_proj->unique_ctrl_out(); + assert(rgn->is_Region() || rgn->is_Call(), "must be a region or call uct"); + assert(iff->in(1)->in(1)->Opcode() == Op_Opaque1, "unexpected predicate shape"); + predicate = predicate->in(0)->in(0); + Node* current_proj = outer_main_head->in(LoopNode::EntryControl); + Node* prev_proj = current_proj; + while (predicate != NULL && predicate->is_Proj() && predicate->in(0)->is_If()) { + uncommon_proj = predicate->in(0)->as_If()->proj_out(1 - predicate->as_Proj()->_con); + if (uncommon_proj->unique_ctrl_out() != rgn) + break; + iff = predicate->in(0)->as_If(); + if (iff->in(1)->Opcode() == Op_Opaque4) { + Node_Stack to_clone(2); + to_clone.push(iff->in(1), 1); + uint current = C->unique(); + Node* result = NULL; + // Look for the opaque node to replace with the init value + // and clone everything in between. We keep the Opaque4 node + // so the duplicated predicates are eliminated once loop + // opts are over: they are here only to keep the IR graph + // consistent. + do { + Node* n = to_clone.node(); + uint i = to_clone.index(); + Node* m = n->in(i); + int op = m->Opcode(); + if (m->is_Bool() || + m->is_Cmp() || + op == Op_AndL || + op == Op_OrL || + op == Op_RShiftL || + op == Op_LShiftL || + op == Op_AddL || + op == Op_AddI || + op == Op_MulL || + op == Op_MulI || + op == Op_SubL || + op == Op_SubI || + op == Op_ConvI2L) { + to_clone.push(m, 1); + continue; + } + if (op == Op_Opaque1) { + if (n->_idx < current) { + n = n->clone(); } - if (op == Op_Opaque1) { - if (n->_idx < current) { - n = n->clone(); - } - n->set_req(i, castii); - register_new_node(n, min_taken); - to_clone.set_node(n); + n->set_req(i, castii); + register_new_node(n, current_proj); + to_clone.set_node(n); + } + for (;;) { + Node* cur = to_clone.node(); + uint j = to_clone.index(); + if (j+1 < cur->req()) { + to_clone.set_index(j+1); + break; } - for (;;) { - Node* cur = to_clone.node(); - uint j = to_clone.index(); - if (j+1 < cur->req()) { - to_clone.set_index(j+1); - break; - } - to_clone.pop(); - if (to_clone.size() == 0) { - result = cur; - break; - } - Node* next = to_clone.node(); - j = to_clone.index(); - if (cur->_idx >= current) { - if (next->_idx < current) { - next = next->clone(); - register_new_node(next, min_taken); - to_clone.set_node(next); - } - assert(next->in(j) != cur, "input should have been cloned"); - next->set_req(j, cur); + to_clone.pop(); + if (to_clone.size() == 0) { + result = cur; + break; + } + Node* next = to_clone.node(); + j = to_clone.index(); + if (cur->_idx >= current) { + if (next->_idx < current) { + next = next->clone(); + register_new_node(next, current_proj); + to_clone.set_node(next); } + assert(next->in(j) != cur, "input should have been cloned"); + next->set_req(j, cur); } - } while (result == NULL); - assert(result->_idx >= current, "new 
node expected"); + } + } while (result == NULL); + assert(result->_idx >= current, "new node expected"); - Node* proj = entry->clone(); - Node* other_proj = uncommon_proj->clone(); - Node* new_iff = iff->clone(); - new_iff->set_req(1, result); - proj->set_req(0, new_iff); - other_proj->set_req(0, new_iff); - Node *frame = new ParmNode(C->start(), TypeFunc::FramePtr); - register_new_node(frame, C->start()); - // It's impossible for the predicate to fail at runtime. Use - // an Halt node. - Node* halt = new HaltNode(other_proj, frame); - C->root()->add_req(halt); - new_iff->set_req(0, prev_proj); - - register_control(new_iff, outer_loop->_parent, prev_proj); - register_control(proj, outer_loop->_parent, new_iff); - register_control(other_proj, _ltree_root, new_iff); - register_control(halt, _ltree_root, other_proj); + Node* proj = predicate->clone(); + Node* other_proj = uncommon_proj->clone(); + Node* new_iff = iff->clone(); + new_iff->set_req(1, result); + proj->set_req(0, new_iff); + other_proj->set_req(0, new_iff); + Node *frame = new ParmNode(C->start(), TypeFunc::FramePtr); + register_new_node(frame, C->start()); + // It's impossible for the predicate to fail at runtime. Use + // an Halt node. + Node* halt = new HaltNode(other_proj, frame); + C->root()->add_req(halt); + new_iff->set_req(0, prev_proj); + + register_control(new_iff, outer_loop->_parent, prev_proj); + register_control(proj, outer_loop->_parent, new_iff); + register_control(other_proj, _ltree_root, new_iff); + register_control(halt, _ltree_root, other_proj); - prev_proj = proj; - } - entry = entry->in(0)->in(0); + prev_proj = proj; } + predicate = predicate->in(0)->in(0); + } + if (prev_proj != current_proj) { _igvn.replace_input_of(outer_main_head, LoopNode::EntryControl, prev_proj); set_idom(outer_main_head, prev_proj, dd_main_head); } } } +void PhaseIdealLoop::duplicate_predicates(CountedLoopNode* pre_head, Node* castii, IdealLoopTree* outer_loop, + LoopNode* outer_main_head, uint dd_main_head) { + if (UseLoopPredicate) { + Node* entry = pre_head->in(LoopNode::EntryControl); + Node* predicate = NULL; + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); + if (predicate != NULL) { + entry = entry->in(0)->in(0); + } + Node* profile_predicate = NULL; + if (UseProfiledLoopPredicate) { + profile_predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (profile_predicate != NULL) { + entry = skip_loop_predicates(entry); + } + } + predicate = find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); + duplicate_predicates_helper(predicate, castii, outer_loop, outer_main_head, dd_main_head); + duplicate_predicates_helper(profile_predicate, castii, outer_loop, outer_main_head, dd_main_head); + } +} + //------------------------------insert_pre_post_loops-------------------------- // Insert pre and post loops. If peel_only is set, the pre-loop can not have // more iterations added. It acts as a 'peel' only, no lower-bound RCE, no @@ -1276,7 +1336,7 @@ // CastII for the main loop: Node* castii = cast_incr_before_loop( pre_incr, min_taken, main_head ); assert(castii != NULL, "no castII inserted"); - duplicate_predicates(pre_head, min_taken, castii, outer_loop, outer_main_head, dd_main_head); + duplicate_predicates(pre_head, castii, outer_loop, outer_main_head, dd_main_head); // Step B4: Shorten the pre-loop to run only 1 iteration (for now). // RCE and alignment may change this later. 
@@ -2813,7 +2873,7 @@ } if (needs_guard) { // Check for an obvious zero trip guard. - Node* inctrl = PhaseIdealLoop::skip_loop_predicates(cl->skip_predicates()); + Node* inctrl = PhaseIdealLoop::skip_all_loop_predicates(cl->skip_predicates()); if (inctrl->Opcode() == Op_IfTrue || inctrl->Opcode() == Op_IfFalse) { bool maybe_swapped = (inctrl->Opcode() == Op_IfFalse); // The test should look like just the backedge of a CountedLoop --- old/src/hotspot/share/opto/loopUnswitch.cpp 2018-05-16 09:40:14.664267454 +0200 +++ new/src/hotspot/share/opto/loopUnswitch.cpp 2018-05-16 09:40:08.549285202 +0200 @@ -138,9 +138,19 @@ Node* uniqc = proj_true->unique_ctrl_out(); Node* entry = head->skip_strip_mined()->in(LoopNode::EntryControl); Node* predicate = find_predicate(entry); + if (predicate != NULL) { + entry = skip_loop_predicates(entry); + } if (predicate != NULL && UseLoopPredicate) { // We may have two predicates, find first. - entry = find_predicate(entry->in(0)->in(0)); + Node* n = find_predicate(entry); + if (n != NULL) { + predicate = n; + entry = skip_loop_predicates(entry); + } + } + if (predicate != NULL && UseProfiledLoopPredicate) { + entry = find_predicate(entry); if (entry != NULL) predicate = entry; } if (predicate != NULL) predicate = predicate->in(0); --- old/src/hotspot/share/opto/loopnode.cpp 2018-05-16 09:40:20.964249170 +0200 +++ new/src/hotspot/share/opto/loopnode.cpp 2018-05-16 09:40:14.778267124 +0200 @@ -1252,9 +1252,7 @@ return l->outer_safepoint(); } -Node* CountedLoopNode::skip_predicates() { - if (is_main_loop()) { - Node* ctrl = skip_strip_mined()->in(LoopNode::EntryControl); +Node* CountedLoopNode::skip_predicates_from_entry(Node* ctrl) { while (ctrl != NULL && ctrl->is_Proj() && ctrl->in(0)->is_If() && ctrl->in(0)->as_If()->proj_out(1-ctrl->as_Proj()->_con)->outcnt() == 1 && ctrl->in(0)->as_If()->proj_out(1-ctrl->as_Proj()->_con)->unique_out()->Opcode() == Op_Halt) { @@ -1263,6 +1261,13 @@ return ctrl; } + +Node* CountedLoopNode::skip_predicates() { + if (is_main_loop()) { + Node* ctrl = skip_strip_mined()->in(LoopNode::EntryControl); + + return skip_predicates_from_entry(ctrl); + } return in(LoopNode::EntryControl); } @@ -2371,6 +2376,13 @@ entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_predicate); if (entry != NULL) { tty->print(" predicated"); + entry = PhaseIdealLoop::skip_loop_predicates(entry); + } + } + if (UseProfiledLoopPredicate) { + entry = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_profile_predicate); + if (entry != NULL) { + tty->print(" profile_predicated"); } } if (_head->is_CountedLoop()) { @@ -2478,11 +2490,18 @@ if (predicate_proj != NULL ) { // right pattern that can be used by loop predication assert(entry->in(0)->in(1)->in(1)->Opcode() == Op_Opaque1, "must be"); useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one - entry = entry->in(0)->in(0); + entry = skip_loop_predicates(entry); } predicate_proj = find_predicate(entry); // Predicate if (predicate_proj != NULL ) { useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one + entry = skip_loop_predicates(entry); + } + if (UseProfiledLoopPredicate) { + predicate_proj = find_predicate(entry); // Predicate + if (predicate_proj != NULL ) { + useful_predicates.push(entry->in(0)->in(1)->in(1)); // good one + } } } @@ -4165,12 +4184,33 @@ // which can inhibit range check elimination. 
@@ -4165,12 +4184,33 @@
       // which can inhibit range check elimination.
       if (least != early) {
         Node* ctrl_out = least->unique_ctrl_out();
-        if (ctrl_out && ctrl_out->is_Loop() &&
-            least == ctrl_out->in(LoopNode::EntryControl) &&
-            (ctrl_out->is_CountedLoop() || ctrl_out->is_OuterStripMinedLoop())) {
-          Node* least_dom = idom(least);
-          if (get_loop(least_dom)->is_member(get_loop(least))) {
-            least = least_dom;
+        if (ctrl_out && ctrl_out->is_CountedLoop() &&
+            least == ctrl_out->in(LoopNode::EntryControl)) {
+          Node* new_ctrl = least;
+          // Move the node above predicates so a following pass of loop
+          // predication doesn't hoist a predicate that depends on it
+          // above that node.
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_loop_limit_check) != NULL) {
+            new_ctrl = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, new_ctrl), "least != early so we can move up the dominator tree");
+          }
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_profile_predicate) != NULL) {
+            Node* c = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, c), "least != early so we can move up the dominator tree");
+            new_ctrl = c;
+          }
+          if (find_predicate_insertion_point(new_ctrl, Deoptimization::Reason_predicate) != NULL) {
+            Node* c = new_ctrl->in(0)->in(0);
+            assert(is_dominator(early, c), "least != early so we can move up the dominator tree");
+            new_ctrl = c;
+          }
+          if (new_ctrl != ctrl_out) {
+            least = new_ctrl;
+          } else if (ctrl_out->is_CountedLoop() || ctrl_out->is_OuterStripMinedLoop()) {
+            Node* least_dom = idom(least);
+            if (get_loop(least_dom)->is_member(get_loop(least))) {
+              least = least_dom;
+            }
           }
         }
       }
--- old/src/hotspot/share/opto/loopnode.hpp	2018-05-16 09:40:27.383230541 +0200
+++ new/src/hotspot/share/opto/loopnode.hpp	2018-05-16 09:40:21.104248764 +0200
@@ -38,6 +38,7 @@
 class LoopNode;
 class Node;
 class OuterStripMinedLoopEndNode;
+class PathFrequency;
 class PhaseIdealLoop;
 class CountedLoopReserveKit;
 class VectorSet;
@@ -57,7 +58,7 @@
   // the semantics so it does not appear in the hash & cmp functions.
   virtual uint size_of() const { return sizeof(*this); }
 protected:
-  short _loop_flags;
+  uint _loop_flags;
   // Names for flag bitfields
   enum { Normal=0, Pre=1, Main=2, Post=3, PreMainPostFlagsMask=3,
          MainHasNoPreLoop=4,
@@ -73,26 +74,31 @@
          HasAtomicPostLoop=4096,
          HasRangeChecks=8192,
          IsMultiversioned=16384,
-         StripMined=32768};
+         StripMined=32768,
+         ProfileTripFailed=65536};
   char _unswitch_count;
   enum { _unswitch_max=3 };
   char _postloop_flags;
   enum { LoopNotRCEChecked = 0, LoopRCEChecked = 1, RCEPostLoop = 2 };

+  // Expected trip count from profile data
+  float _profile_trip_cnt;
+
 public:
   // Names for edge indices
   enum { Self=0, EntryControl, LoopBackControl };

-  int is_inner_loop() const { return _loop_flags & InnerLoop; }
+  uint is_inner_loop() const { return _loop_flags & InnerLoop; }
   void set_inner_loop() { _loop_flags |= InnerLoop; }

-  int range_checks_present() const { return _loop_flags & HasRangeChecks; }
-  int is_multiversioned() const { return _loop_flags & IsMultiversioned; }
-  int is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
-  int is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
+  uint range_checks_present() const { return _loop_flags & HasRangeChecks; }
+  uint is_multiversioned() const { return _loop_flags & IsMultiversioned; }
+  uint is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
+  uint is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
-  int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
-  int is_strip_mined() const { return _loop_flags & StripMined; }
+  uint partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
+  uint is_strip_mined() const { return _loop_flags & StripMined; }
+  uint is_profile_trip_failed() const { return _loop_flags & ProfileTripFailed; }

   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
   void mark_has_reductions() { _loop_flags |= HasReductions; }
@@ -105,6 +111,7 @@
   void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
   void mark_strip_mined() { _loop_flags |= StripMined; }
   void clear_strip_mined() { _loop_flags &= ~StripMined; }
+  void mark_profile_trip_failed() { _loop_flags |= ProfileTripFailed; }

   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -119,7 +126,12 @@
     _unswitch_count = val;
   }

-  LoopNode(Node *entry, Node *backedge) : RegionNode(3), _loop_flags(0), _unswitch_count(0), _postloop_flags(0) {
+  void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
+  float profile_trip_cnt() { return _profile_trip_cnt; }
+
+  LoopNode(Node *entry, Node *backedge)
+    : RegionNode(3), _loop_flags(0), _unswitch_count(0),
+      _postloop_flags(0), _profile_trip_cnt(COUNT_UNKNOWN) {
     init_class_id(Class_Loop);
     init_req(EntryControl, entry);
     init_req(LoopBackControl, backedge);
@@ -186,9 +198,6 @@
   // Known trip count calculated by compute_exact_trip_count()
   uint _trip_count;

-  // Expected trip count from profile data
-  float _profile_trip_cnt;
-
   // Log2 of original loop bodies in unrolled loop
   int _unrolled_count_log2;
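The widening of _loop_flags from short to uint above is forced by the new flag: ProfileTripFailed is bit 16, which a 16-bit short cannot hold (StripMined already occupies the sign bit). A minimal demonstration of the truncation — the variable names are invented, and the short is assumed to be 16 bits as it is on common platforms:

    #include <cstdio>

    enum { StripMined = 32768, ProfileTripFailed = 65536 };

    int main() {
      unsigned int wide = 0;   // the new representation
      short narrow = 0;        // the old one (16 bits on common platforms)

      wide   |= ProfileTripFailed;
      narrow |= ProfileTripFailed;   // bit 16 is silently truncated away

      printf("wide   has ProfileTripFailed: %d\n", (wide & ProfileTripFailed) != 0);
      printf("narrow has ProfileTripFailed: %d\n", (narrow & ProfileTripFailed) != 0);
      return 0;
    }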
@@ -203,8 +212,8 @@
 public:
   CountedLoopNode( Node *entry, Node *backedge )
     : LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
-      _profile_trip_cnt(COUNT_UNKNOWN), _unrolled_count_log2(0),
-      _node_count_before_unroll(0), _slp_maximum_unroll_factor(0) {
+      _unrolled_count_log2(0), _node_count_before_unroll(0),
+      _slp_maximum_unroll_factor(0) {
     init_class_id(Class_CountedLoop);
     // Initialize _trip_count to the largest possible value.
     // Will be reset (lower) if the loop's trip count is known.
@@ -245,16 +254,16 @@

   // A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
   // Aligned, may be missing it's pre-loop.
-  int is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
-  int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
-  int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
-  int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
-  int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
-  int was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
-  int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
-  int do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
-  int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
-  int has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
+  uint is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
+  uint is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
+  uint is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
+  uint is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
+  uint is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
+  uint was_slp_analyzed () const { return (_loop_flags&WasSlpAnalyzed) == WasSlpAnalyzed; }
+  uint has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
+  uint do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
+  uint is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
+  uint has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }

   int main_idx() const { return _main_idx; }
@@ -280,9 +289,6 @@
     _loop_flags &= ~PassedSlpAnalysis;
   }

-  void set_profile_trip_cnt(float ptc) { _profile_trip_cnt = ptc; }
-  float profile_trip_cnt() { return _profile_trip_cnt; }
-
   void double_unrolled_count() { _unrolled_count_log2++; }
   int unrolled_count() { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
@@ -301,6 +307,7 @@
   // If this is a main loop in a pre/main/post loop nest, walk over
   // the predicates that were inserted by
   // duplicate_predicates()/add_range_check_predicate()
+  static Node* skip_predicates_from_entry(Node* ctrl);
   Node* skip_predicates();

 #ifndef PRODUCT
@@ -588,6 +595,7 @@
   void compute_trip_count(PhaseIdealLoop* phase);

   // Compute loop trip count from profile data
+  float compute_profile_trip_cnt_helper(Node* n);
   void compute_profile_trip_cnt( PhaseIdealLoop *phase );

   // Reassociate invariant expressions.
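Note that the pre/main/post queries above compare a masked value instead of testing a single bit: Normal, Pre, Main, and Post are the four states of a two-bit subfield selected by PreMainPostFlagsMask, while flags like HasRangeChecks each occupy their own bit. A standalone sketch of the same pattern, with an invented ToyLoop class:

    #include <cstdio>

    enum { Normal = 0, Pre = 1, Main = 2, Post = 3, PreMainPostFlagsMask = 3,
           HasRangeChecks = 8192 };

    struct ToyLoop {
      unsigned int flags;
      // Two-bit subfield: compare the masked value against a state.
      unsigned int is_main_loop() const { return (flags & PreMainPostFlagsMask) == Main; }
      unsigned int is_pre_loop()  const { return (flags & PreMainPostFlagsMask) == Pre;  }
      // Independent flag: a plain single-bit test.
      unsigned int range_checks_present() const { return flags & HasRangeChecks; }
    };

    int main() {
      ToyLoop loop = { Main | HasRangeChecks };
      printf("main=%u pre=%u range_checks=%u\n",
             loop.is_main_loop(), loop.is_pre_loop(),
             loop.range_checks_present() ? 1u : 0u);
      return 0;
    }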
@@ -730,9 +738,10 @@
   }

   Node* cast_incr_before_loop(Node* incr, Node* ctrl, Node* loop);
-  void duplicate_predicates(CountedLoopNode* pre_head, Node *min_taken, Node* castii,
-                            IdealLoopTree* outer_loop, LoopNode* outer_main_head,
-                            uint dd_main_head);
+  void duplicate_predicates_helper(Node* predicate, Node* castii, IdealLoopTree* outer_loop,
+                                   LoopNode* outer_main_head, uint dd_main_head);
+  void duplicate_predicates(CountedLoopNode* pre_head, Node* castii, IdealLoopTree* outer_loop,
+                            LoopNode* outer_main_head, uint dd_main_head);

 public:
@@ -1061,6 +1070,7 @@
                                      PhaseIterGVN* igvn);
   Node* clone_loop_predicates(Node* old_entry, Node* new_entry, bool clone_limit_check);

+  static Node* skip_all_loop_predicates(Node* entry);
   static Node* skip_loop_predicates(Node* entry);

   // Find a good location to insert a predicate
@@ -1075,12 +1085,20 @@
   // Implementation of the loop predication to promote checks outside the loop
   bool loop_predication_impl(IdealLoopTree *loop);
+  bool loop_predication_impl_helper(IdealLoopTree *loop, ProjNode* proj, ProjNode *predicate_proj,
+                                    CountedLoopNode *cl, ConNode* zero, Invariance& invar,
+                                    Deoptimization::DeoptReason reason);
+  bool loop_predication_should_follow_branches(IdealLoopTree *loop, ProjNode *predicate_proj, float& loop_trip_cnt);
+  void loop_predication_follow_branches(Node *c, IdealLoopTree *loop, float loop_trip_cnt,
+                                        PathFrequency& pf, Node_Stack& stack, VectorSet& seen,
+                                        Node_List& if_proj_list);
   ProjNode* insert_skeleton_predicate(IfNode* iff, IdealLoopTree *loop,
                                       ProjNode* proj, ProjNode *predicate_proj,
                                       ProjNode* upper_bound_proj,
                                       int scale, Node* offset,
                                       Node* init, Node* limit, jint stride,
-                                      Node* rng, bool& overflow);
+                                      Node* rng, bool& overflow,
+                                      Deoptimization::DeoptReason reason);
   Node* add_range_check_predicate(IdealLoopTree* loop, CountedLoopNode* cl,
                                   Node* predicate_proj, int scale_con, Node* offset,
                                   Node* limit, jint stride_con);
--- old/src/hotspot/share/opto/node.hpp	2018-05-16 09:40:33.660212324 +0200
+++ new/src/hotspot/share/opto/node.hpp	2018-05-16 09:40:27.510230172 +0200
@@ -73,6 +73,7 @@
 class FastLockNode;
 class FastUnlockNode;
 class IfNode;
+class IfProjNode;
 class IfFalseNode;
 class IfTrueNode;
 class InitializeNode;
@@ -672,8 +673,9 @@
     DEFINE_CLASS_ID(Proj, Node, 3)
       DEFINE_CLASS_ID(CatchProj, Proj, 0)
       DEFINE_CLASS_ID(JumpProj,  Proj, 1)
-      DEFINE_CLASS_ID(IfTrue,    Proj, 2)
-      DEFINE_CLASS_ID(IfFalse,   Proj, 3)
+      DEFINE_CLASS_ID(IfProj,    Proj, 2)
+        DEFINE_CLASS_ID(IfTrue,  IfProj, 0)
+        DEFINE_CLASS_ID(IfFalse, IfProj, 1)
       DEFINE_CLASS_ID(Parm,      Proj, 4)
       DEFINE_CLASS_ID(MachProj,  Proj, 5)
@@ -812,6 +814,7 @@
   DEFINE_CLASS_QUERY(FastUnlock)
   DEFINE_CLASS_QUERY(If)
   DEFINE_CLASS_QUERY(RangeCheck)
+  DEFINE_CLASS_QUERY(IfProj)
   DEFINE_CLASS_QUERY(IfFalse)
   DEFINE_CLASS_QUERY(IfTrue)
   DEFINE_CLASS_QUERY(Initialize)
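The DEFINE_CLASS_ID change above re-parents IfTrue and IfFalse under the new IfProj class ID. HotSpot class IDs are prefix-encoded bit strings, so a single mask-and-compare answers "is this node an IfProj or any subclass of it?". The hand-rolled illustration below uses an invented bit layout, not the macro's actual one:

    #include <cstdio>

    // Invented prefix encoding: each class ID extends its parent's bits.
    const unsigned Class_Proj    = 0b001;                      // 3 low bits
    const unsigned Class_IfProj  = Class_Proj   | (0b11 << 3); // 5 low bits
    const unsigned Class_IfTrue  = Class_IfProj | (0b01 << 5);
    const unsigned Class_IfFalse = Class_IfProj | (0b10 << 5);

    // One mask test matches the class and everything derived from it.
    static bool is_Proj  (unsigned id) { return (id & 0b00111) == Class_Proj;   }
    static bool is_IfProj(unsigned id) { return (id & 0b11111) == Class_IfProj; }

    int main() {
      printf("IfTrue:  is_Proj=%d is_IfProj=%d\n",
             is_Proj(Class_IfTrue), is_IfProj(Class_IfTrue));
      printf("IfFalse: is_Proj=%d is_IfProj=%d\n",
             is_Proj(Class_IfFalse), is_IfProj(Class_IfFalse));
      return 0;
    }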
--- old/src/hotspot/share/runtime/deoptimization.cpp	2018-05-16 09:40:39.790194533 +0200
+++ new/src/hotspot/share/runtime/deoptimization.cpp	2018-05-16 09:40:33.779211978 +0200
@@ -2071,7 +2071,7 @@
 // Local derived constants.
 // Further breakdown of DataLayout::trap_state, as promised by DataLayout.
-const int DS_REASON_MASK   = DataLayout::trap_mask >> 1;
+const int DS_REASON_MASK   = ((uint)DataLayout::trap_mask) >> 1;
 const int DS_RECOMPILE_BIT = DataLayout::trap_mask - DS_REASON_MASK;

 //---------------------------trap_state_reason---------------------------------
@@ -2170,6 +2170,7 @@
   "array_check",
   "intrinsic" JVMCI_ONLY("_or_type_checked_inlining"),
   "bimorphic" JVMCI_ONLY("_or_optimized_type_check"),
+  "profile_predicate",
   "unloaded",
   "uninitialized",
   "unreached",
--- old/src/hotspot/share/runtime/deoptimization.hpp	2018-05-16 09:40:45.886176841 +0200
+++ new/src/hotspot/share/runtime/deoptimization.hpp	2018-05-16 09:40:39.906194196 +0200
@@ -41,7 +41,7 @@
   enum DeoptReason {
     Reason_many = -1,             // indicates presence of several reasons
     Reason_none = 0,              // indicates absence of a relevant deopt.
-    // Next 7 reasons are recorded per bytecode in DataLayout::trap_bits.
+    // Next 8 reasons are recorded per bytecode in DataLayout::trap_bits.
     // This is more complicated for JVMCI as JVMCI may deoptimize to *some* bytecode before the
     // bytecode that actually caused the deopt (with inlining, JVMCI may even deoptimize to a
    // bytecode in another method):
@@ -62,6 +62,8 @@
     Reason_optimized_type_check = Reason_bimorphic,
 #endif

+    Reason_profile_predicate,     // compiler generated predicate moved from frequent branch in a loop failed
+
     // recorded per method
     Reason_unloaded,              // unloaded class or constant pool entry
     Reason_uninitialized,         // bad class state (uninitialized)
@@ -92,8 +94,8 @@
     Reason_LIMIT,

     // Note:  Keep this enum in sync. with _trap_reason_name.
-    Reason_RECORDED_LIMIT = Reason_bimorphic  // some are not recorded per bc
-    // Note:  Reason_RECORDED_LIMIT should be < 8 to fit into 3 bits of
+    Reason_RECORDED_LIMIT = Reason_profile_predicate  // some are not recorded per bc
+    // Note:  Reason_RECORDED_LIMIT should fit into 31 bits of
     // DataLayout::trap_bits.  This dependency is enforced indirectly
     // via asserts, to avoid excessive direct header-to-header dependencies.
     // See Deoptimization::trap_state_reason and class DataLayout.
--- old/src/hotspot/share/runtime/vmStructs.cpp	2018-05-16 09:40:52.049158955 +0200
+++ new/src/hotspot/share/runtime/vmStructs.cpp	2018-05-16 09:40:45.979176571 +0200
@@ -295,6 +295,7 @@
   nonstatic_field(DataLayout,   _header._struct._tag,            u1)        \
   nonstatic_field(DataLayout,   _header._struct._flags,          u1)        \
   nonstatic_field(DataLayout,   _header._struct._bci,            u2)        \
+  nonstatic_field(DataLayout,   _header._struct._traps,          u4)        \
   nonstatic_field(DataLayout,   _cells[0],                       intptr_t)  \
   nonstatic_field(MethodCounters, _nmethod_age,                  int)       \
   nonstatic_field(MethodCounters, _interpreter_invocation_limit, int)       \
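The DataLayout header now carries trap history in a dedicated 32-bit _traps field, encoded as [recompile:1 | reason:31], instead of four bits shared with the flags byte. That is why DS_REASON_MASK above needs the unsigned cast: trap_mask now covers all 32 bits, and a signed right shift would smear the sign bit into the reason mask. Below is a sketch of the resulting encoding; the make_trap_state() helper is invented for illustration, not the HotSpot API:

    #include <cstdint>
    #include <cstdio>

    // Shifting the full mask as *unsigned* yields the low 31 reason
    // bits, leaving the top bit for "recompile".
    const uint32_t trap_mask        = 0xFFFFFFFFu;                // rightNBits(1+31)
    const uint32_t DS_REASON_MASK   = trap_mask >> 1;             // 0x7FFFFFFF
    const uint32_t DS_RECOMPILE_BIT = trap_mask - DS_REASON_MASK; // 0x80000000

    // Invented helper packing a trap state the way the fields above imply.
    static uint32_t make_trap_state(uint32_t reason, bool recompile) {
      return (reason & DS_REASON_MASK) | (recompile ? DS_RECOMPILE_BIT : 0u);
    }

    int main() {
      uint32_t ts = make_trap_state(/*reason=*/23u, /*recompile=*/true);
      printf("reason=%u recompile=%d\n",
             (unsigned)(ts & DS_REASON_MASK), (ts & DS_RECOMPILE_BIT) != 0u);
      return 0;
    }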
--- old/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/DataLayout.java	2018-05-16 09:40:58.410140493 +0200
+++ new/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/oops/DataLayout.java	2018-05-16 09:40:52.197158525 +0200
@@ -47,19 +47,14 @@
    public static final int parametersTypeDataTag = 12;
    public static final int speculativeTrapDataTag = 13;

-   // The _struct._flags word is formatted as [trapState:4 | flags:4].
-   // The trap state breaks down further as [recompile:1 | reason:3].
+   // The trap state breaks down as [recompile:1 | reason:31].
    // This further breakdown is defined in deoptimization.cpp.
    // See Deoptimization.trapStateReason for an assert that
    // trapBits is big enough to hold reasons < reasonRecordedLimit.
    //
    // The trapState is collected only if ProfileTraps is true.
-   public static final int trapBits = 1+3;  // 3: enough to distinguish [0..reasonRecordedLimit].
-   public static final int trapShift = 8 - trapBits;
+   public static final int trapBits = 1+31;  // 31: enough to distinguish [0..reasonRecordedLimit].
    public static final int trapMask = Bits.rightNBits(trapBits);
-   public static final int trapMaskInPlace = (trapMask << trapShift);
-   public static final int flagLimit = trapShift;
-   public static final int flagMask = Bits.rightNBits(flagLimit);
    public static final int firstFlag = 0;

    private Address data;
@@ -97,16 +92,17 @@

    // Every data layout begins with a header.  This header
    // contains a tag, which is used to indicate the size/layout
-   // of the data, 4 bits of flags, which can be used in any way,
-   // 4 bits of trap history (none/one reason/many reasons),
+   // of the data, 8 bits of flags, which can be used in any way,
+   // 32 bits of trap history (none/one reason/many reasons),
    // and a bci, which is used to tie this piece of data to a
    // specific bci in the bytecodes.

    // union {
-   //   intptrT _bits;
+   //   u8 _bits;
    //   struct {
    //     u1 _tag;
    //     u1 _flags;
    //     u2 _bci;
+   //     u4 _traps;
    //   } _struct;
    // } _header;

@@ -119,10 +115,10 @@

    // Size computation
    static int headerSizeInBytes() {
-      return MethodData.cellSize;
+      return MethodData.cellSize * headerSizeInCells();
    }
    static int headerSizeInCells() {
-      return 1;
+      return VM.getVM().isLP64() ? 1 : 2;
    }

    static public int computeSizeInBytes(int cellCount) {
@@ -146,7 +142,7 @@
    // simplifying assumption that all N occurrences can be blamed
    // on that BCI.
    int trapState() {
-      return (flags() >> trapShift) & trapMask;
+      return data.getJIntAt(offset+4);
    }

    int flags() {