hotspot Udiff src/share/vm/opto/superword.cpp

src/share/vm/opto/superword.cpp

@@ -66,21 +66,22 @@
   _lpt(NULL),                             // loop tree node
   _lp(NULL),                              // LoopNode
   _bb(NULL),                              // basic block
   _iv(NULL),                              // induction var
   _race_possible(false),                  // cases where SDMU is true
+  _early_return(true),                    // analysis evaluations routine
   _num_work_vecs(0),                      // amount of vector work we have
   _num_reductions(0),                     // amount of reduction work we have
   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
   _ii_order(arena(), 8, 0, 0),
   _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
 {}
 
 //------------------------------transform_loop---------------------------
-void SuperWord::transform_loop(IdealLoopTree* lpt) {
+void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
   assert(UseSuperWord, "should be");
   // Do vectors exist on this architecture?
   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
 
   assert(lpt->_head->is_CountedLoop(), "must be");

@@ -111,12 +112,162 @@
   set_lp(cl);
 
   // For now, define one block which is the entire loop body
   set_bb(cl);
 
+  if (do_optimization) {
   assert(_packset.length() == 0, "packset must be empty");
   SLP_extract();
+  }
+}
+
+//------------------------------early unrolling analysis------------------------------
+// Analyze the body of the current loop before it is unrolled and, when the
+// loop looks vectorizable, replace the requested unroll factor with a vector
+// size that every sized node in the body can support.
+//
+// Parameters:
+//   cl                       - counted loop node being analyzed
+//   local_loop_unroll_factor - in:  unroll factor chosen so far
+//                              out: overwritten with the selected vector size
+//                                   when the analysis succeeds, else unchanged
+//
+// Pass 1 walks lpt()->_body and records in ignored_loop_nodes every node that
+// must not take part in vector sizing (loop control, address computation,
+// phis, ...). It gives up (is_slp = false) on node kinds it does not handle.
+// Pass 2 queries Matcher::max_vector_size() for each remaining node and keeps
+// the minimum over the nodes that have a vector implementation, starting from
+// the maximum vector size for T_INT.
+//
+// If pass 1 succeeds, cl is marked via mark_passed_slp() and the factor is
+// stored with set_slp_max_unroll(), even when pass 2 rejected the loop and
+// left the factor unchanged.
+void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
+  bool is_slp = true;
+  ResourceMark rm;
+  size_t ignored_size = lpt()->_body.size();
+  // ignored_loop_nodes[i] holds the _idx of body node i when that node is to be
+  // skipped by pass 2, or -1 when the node still has to be sized.
+  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
+  Node_Stack nstack((int)ignored_size);
+  Node *cl_exit = cl->loopexit();
+
+  // First clear the entries
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    ignored_loop_nodes[i] = -1;
+  }
+
+  int max_vector = Matcher::max_vector_size(T_INT);
+
+  // Process the loop, some/all of the stack entries will not be in order, ergo
+  // need to preprocess the ignored initial state before we process the loop
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    Node* n = lpt()->_body.at(i);
+    // Loop bookkeeping and address arithmetic never get a vector form of
+    // their own, so they are excluded from sizing.
+    if (n == cl->incr() ||
+      n->is_reduction() ||
+      n->is_AddP() ||
+      n->is_Cmp() ||
+      n->is_IfTrue() ||
+      n->is_CountedLoop() ||
+      (n == cl_exit)) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    // An If with known count and probability that exits this loop is ignored.
+    if (n->is_If()) {
+      IfNode *iff = n->as_If();
+      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
+        if (lpt()->is_loop_exit(iff)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+      }
+    }
+
+    // A memory phi whose LoopBackControl input differs from its EntryControl
+    // input must be fed by a memory operation there; otherwise give up.
+    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+      Node* n_tail = n->in(LoopNode::LoopBackControl);
+      if (n_tail != n->in(LoopNode::EntryControl)) {
+        if (!n_tail->is_Mem()) {
+          is_slp = false;
+          break;
+        }
+      }
+    }
+
+    // This must happen after check of phi/if
+    if (n->is_Phi() || n->is_If()) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    // Node kinds the analysis does not handle: abandon the whole loop.
+    if (n->is_LoadStore() || n->is_MergeMem() ||
+      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+      is_slp = false;
+      break;
+    }
+
+    if (n->is_Mem()) {
+      Node* adr = n->in(MemNode::Address);
+      Node* n_ctrl = _phase->get_ctrl(adr);
+
+      // save a queue of post process nodes
+      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
+        MemNode* current = n->as_Mem();
+        BasicType bt = current->memory_type();
+        if (!is_java_primitive(bt)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+
+        // Process the memory expression
+        int stack_idx = 0;
+        bool have_side_effects = true;
+        if (adr->is_AddP() == false) {
+          nstack.push(adr, stack_idx++);
+        } else {
+          // Mark the components of the memory operation in nstack
+          SWPointer p1(current, this, &nstack, true);
+          have_side_effects = p1.node_stack()->is_nonempty();
+        }
+
+        // Process the pointer stack: every node collected from the address
+        // expression is looked up in the loop body and marked as ignored.
+        while (have_side_effects) {
+          Node* pointer_node = nstack.node();
+          for (uint j = 0; j < lpt()->_body.size(); j++) {
+            Node* cur_node = lpt()->_body.at(j);
+            if (cur_node == pointer_node) {
+              ignored_loop_nodes[j] = cur_node->_idx;
+              break;
+            }
+          }
+          nstack.pop();
+          have_side_effects = nstack.is_nonempty();
+        }
+      }
+    }
+  }
+
+  if (is_slp) {
+    // Now we try to find the maximum supported consistent vector which the machine
+    // description can use
+    for (uint i = 0; i < lpt()->_body.size(); i++) {
+      if (ignored_loop_nodes[i] != -1) continue;
+
+      BasicType bt;
+      Node* n = lpt()->_body.at(i);
+      if (n->is_Store()) {
+        bt = n->as_Mem()->memory_type();
+      }
+      else {
+        bt = n->bottom_type()->basic_type();
+      }
+
+      // Only primitive element types can be sized as vector lanes. Pass 1
+      // applies this filter only to memory operations whose address control
+      // lies inside the loop, so re-check here before querying the matcher.
+      // NOTE(review): Matcher::max_vector_size() is presumably only defined
+      // for primitive types - confirm against the platform matcher.
+      if (!is_java_primitive(bt)) continue;
+
+      int cur_max_vector = Matcher::max_vector_size(bt);
+
+      // If a max vector exists which is not larger than local_loop_unroll_factor
+      // stop looking, we already have the max vector to map to.
+      if (cur_max_vector <= local_loop_unroll_factor) {
+        is_slp = false;
+#ifndef PRODUCT
+        if (TraceSuperWordLoopUnrollAnalysis) {
+          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
+        }
+#endif
+        break;
+      }
+
+      // Map the maximal common vector
+      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
+        if (cur_max_vector < max_vector) {
+          max_vector = cur_max_vector;
+        }
+      }
+    }
+    if (is_slp) {
+      local_loop_unroll_factor = max_vector;
+    }
+    // Recorded whenever pass 1 succeeded, regardless of the outcome of pass 2.
+    cl->mark_passed_slp();
+    cl->set_slp_max_unroll(local_loop_unroll_factor);
+  }
 }
 
 //------------------------------SLP_extract---------------------------
 // Extract the superword level parallelism
 //

@@ -266,16 +417,16 @@
       // this reference to a vector-aligned address.
       best_align_to_mem_ref = mem_ref;
       best_iv_adjustment = iv_adjustment;
     }
 
-    SWPointer align_to_ref_p(mem_ref, this);
+    SWPointer align_to_ref_p(mem_ref, this, NULL, false);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
       if (isomorphic(s, mem_ref)) {
-        SWPointer p2(s, this);
+        SWPointer p2(s, this, NULL, false);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
           set_alignment(s, align);
         }
       }

@@ -292,11 +443,11 @@
           // Do not vectorize a memory access with more elements per vector
           // if unaligned memory access is not allowed because number of
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
         } else {
-          SWPointer p2(best_align_to_mem_ref, this);
+          SWPointer p2(best_align_to_mem_ref, this, NULL, false);
           if (align_to_ref_p.invar() != p2.invar()) {
             // Do not vectorize memory accesses with different invariants
             // if unaligned memory accesses are not allowed.
             create_pack = false;
           }

@@ -409,20 +560,20 @@
   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 
   // Count number of comparable memory ops
   for (uint i = 0; i < memops.size(); i++) {
     MemNode* s1 = memops.at(i)->as_Mem();
-    SWPointer p1(s1, this);
+    SWPointer p1(s1, this, NULL, false);
     // Discard if pre loop can't align this reference
     if (!ref_is_alignable(p1)) {
       *cmp_ct.adr_at(i) = 0;
       continue;
     }
     for (uint j = i+1; j < memops.size(); j++) {
       MemNode* s2 = memops.at(j)->as_Mem();
       if (isomorphic(s1, s2)) {
-        SWPointer p2(s2, this);
+        SWPointer p2(s2, this, NULL, false);
         if (p1.comparable(p2)) {
           (*cmp_ct.adr_at(i))++;
           (*cmp_ct.adr_at(j))++;
         }
       }

@@ -439,11 +590,11 @@
   for (uint j = 0; j < memops.size(); j++) {
     MemNode* s = memops.at(j)->as_Mem();
     if (s->is_Store()) {
       int vw = vector_width_in_bytes(s);
       assert(vw > 1, "sanity");
-      SWPointer p(s, this);
+      SWPointer p(s, this, NULL, false);
       if (cmp_ct.at(j) >  max_ct ||
           cmp_ct.at(j) == max_ct &&
             (vw >  max_vw ||
              vw == max_vw &&
               (data_size(s) <  min_size ||

@@ -462,11 +613,11 @@
     for (uint j = 0; j < memops.size(); j++) {
       MemNode* s = memops.at(j)->as_Mem();
       if (s->is_Load()) {
         int vw = vector_width_in_bytes(s);
         assert(vw > 1, "sanity");
-        SWPointer p(s, this);
+        SWPointer p(s, this, NULL, false);
         if (cmp_ct.at(j) >  max_ct ||
             cmp_ct.at(j) == max_ct &&
               (vw >  max_vw ||
                vw == max_vw &&
                 (data_size(s) <  min_size ||

@@ -573,11 +724,11 @@
 }
 
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  SWPointer align_to_ref_p(mem_ref, this);
+  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
   int vw       = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");

@@ -647,17 +798,17 @@
 
       // If no dependency yet, use slice
       if (_dg.dep(s1)->in_cnt() == 0) {
         _dg.make_edge(slice, s1);
       }
-      SWPointer p1(s1->as_Mem(), this);
+      SWPointer p1(s1->as_Mem(), this, NULL, false);
       bool sink_dependent = true;
       for (int k = j - 1; k >= 0; k--) {
         Node* s2 = _nlist.at(k);
         if (s1->is_Load() && s2->is_Load())
           continue;
-        SWPointer p2(s2->as_Mem(), this);
+        SWPointer p2(s2->as_Mem(), this, NULL, false);
 
         int cmp = p1.cmp(p2);
         if (SuperWordRTDepCheck &&
             p1.base() != p2.base() && p1.valid() && p2.valid()) {
           // Create a runtime check to disambiguate

@@ -793,12 +944,12 @@
   // FIXME - co_locate_pack fails on Stores in different mem-slices, so
   // only pack memops that are in the same alias set until that's fixed.
   if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
       _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
     return false;
-  SWPointer p1(s1->as_Mem(), this);
-  SWPointer p2(s2->as_Mem(), this);
+  SWPointer p1(s1->as_Mem(), this, NULL, false);
+  SWPointer p2(s2->as_Mem(), this, NULL, false);
   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
   return diff == data_size(s1);
 }

@@ -1613,17 +1764,17 @@
       Node* first   = executed_first(p);
       int   opc = n->Opcode();
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
-        SWPointer p1(n->as_Mem(), this);
+        SWPointer p1(n->as_Mem(), this, NULL, false);
         // Identify the memory dependency for the new loadVector node by
         // walking up through memory chain.
         // This is done to give flexibility to the new loadVector node so that
         // it can move above independent storeVector nodes.
         while (mem->is_StoreVector()) {
-          SWPointer p2(mem->as_Mem(), this);
+          SWPointer p2(mem->as_Mem(), this, NULL, false);
           int cmp = p1.cmp(p2);
           if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
             mem = mem->in(MemNode::Memory);
           } else {
             break; // dependent memory

@@ -2136,11 +2287,11 @@
 }
 
 //------------------------------memory_alignment---------------------------
 // Alignment within a vector memory reference
 int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
-  SWPointer p(s, this);
+  SWPointer p(s, this, NULL, false);
   if (!p.valid()) {
     return bottom_align;
   }
   int vw = vector_width_in_bytes(s);
   if (vw < 2) {

@@ -2300,11 +2451,11 @@
   // Ensure the original loop limit is available from the
   // pre-loop Opaque1 node.
   Node *orig_limit = pre_opaq->original_loop_limit();
   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 
-  SWPointer align_to_ref_p(align_to_ref, this);
+  SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
   assert(align_to_ref_p.valid(), "sanity");
 
   // Given:
   //     lim0 == original pre loop limit
   //     V == v_align (power of 2)

@@ -2474,10 +2625,11 @@
   _lpt = NULL;
   _lp = NULL;
   _bb = NULL;
   _iv = NULL;
   _race_possible = 0;
+  _early_return = false;
   _num_work_vecs = 0;
   _num_reductions = 0;
 }
 
 //------------------------------restart---------------------------

@@ -2544,13 +2696,15 @@
 
 
 //==============================SWPointer===========================
 
 //----------------------------SWPointer------------------------
-SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
+SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
   _mem(mem), _slp(slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(nstack), _analyze_only(analyze_only),
+  _stack_idx(0) {
 
   Node* adr = mem->in(MemNode::Address);
   if (!adr->is_AddP()) {
     assert(!valid(), "too complex");
     return;

@@ -2584,11 +2738,13 @@
 
 // Following is used to create a temporary object during
 // the pattern match of an address expression.
 SWPointer::SWPointer(SWPointer* p) :
   _mem(p->_mem), _slp(p->_slp),  _base(NULL),  _adr(NULL),
-  _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
+  _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+  _nstack(p->_nstack), _analyze_only(p->_analyze_only),  // temporary uses the same node stack pointer and analyze-only flag as p
+  _stack_idx(p->_stack_idx) {}                           // starts at p's current stack index; the index itself is copied by value
 
 //------------------------scaled_iv_plus_offset--------------------
 // Match: k*iv + offset
 // where: k is a constant that maybe zero, and
 //        offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional

@@ -2627,10 +2783,13 @@
   }
   if (n == iv()) {
     _scale = 1;
     return true;
   }
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   int opc = n->Opcode();
   if (opc == Op_MulI) {
     if (n->in(1) == iv() && n->in(2)->is_Con()) {
       _scale = n->in(2)->get_int();
       return true;

@@ -2684,10 +2843,13 @@
       return true;
     }
     return false;
   }
   if (_invar != NULL) return false; // already have an invariant
+  if (_analyze_only && (invariant(n) == false)) {
+    _nstack->push(n, _stack_idx++);
+  }
   if (opc == Op_AddI) {
     if (n->in(2)->is_Con() && invariant(n->in(1))) {
       _negate_invar = negate;
       _invar = n->in(1);
       _offset += negate ? -(n->in(2)->get_int()) : n->in(2)->get_int();

< prev index next >