< prev index next >

src/share/vm/opto/superword.cpp

Print this page

        

@@ -49,10 +49,11 @@
   _igvn(phase->_igvn),
   _arena(phase->C->comp_arena()),
   _packset(arena(), 8,  0, NULL),         // packs for the current block
   _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
   _block(arena(), 8,  0, NULL),           // nodes in current block
+  _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
   _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning

@@ -97,14 +98,34 @@
   assert(lpt->_head->is_CountedLoop(), "must be");
   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 
   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
 
-  if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
+  if (post_loop_allowed) {
+    if (cl->is_reduction_loop()) return; // no predication mapping
+    Node *limit = cl->limit();
+    if (limit->is_Con()) return; // non constant limits only
+    // Now check the limit for expressions we do not handle
+    if (limit->is_Add()) {
+      Node *in2 = limit->in(2);
+      if (in2->is_Con()) {
+        int val = in2->get_int();
+        // should not try to program these cases
+        if (val < 0) return;
+      }
+    }
+  }
+
+  // skip any loop that has not been assigned max unroll by analysis
+  if (do_optimization) {
+    if (cl->slp_max_unroll() == 0) return;
+  }
+
   // Check for no control flow in body (other than exit)
   Node *cl_exit = cl->loopexit();
-  if (cl_exit->in(0) != lpt->_head) {
+  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
     #ifndef PRODUCT
       if (TraceSuperWord) {
         tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
         tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
         tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();

@@ -118,19 +139,20 @@
   // Make sure there are no extra control users of the loop backedge
   if (cl->back_control()->outcnt() != 1) {
     return;
   }
 
-  // We only re-enter slp when we vector mapped a queried loop and we want to
-  // continue unrolling, in this case, slp is not subsequently done.
-  if (cl->do_unroll_only()) return;
+  // Skip any loops already optimized by slp
+  if (cl->is_vectorized_loop()) return;
 
+  if (cl->is_main_loop()) {
   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
   if (pre_end == NULL) return;
   Node *pre_opaq1 = pre_end->limit();
   if (pre_opaq1->Opcode() != Op_Opaque1) return;
+  }
 
   init(); // initialize data structures
 
   set_lpt(lpt);
   set_lp(cl);

@@ -139,10 +161,23 @@
   set_bb(cl);
 
   if (do_optimization) {
     assert(_packset.length() == 0, "packset must be empty");
     SLP_extract();
+    if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
+      if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
+        IdealLoopTree *lpt_next = lpt->_next;
+        CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
+        _phase->has_range_checks(lpt_next);
+        if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
+          if (!cl_next->is_vectorized_loop()) {
+            int slp_max_unroll_factor = cl->slp_max_unroll();
+            cl_next->set_slp_max_unroll(slp_max_unroll_factor);
+          }
+        }
+      }
+    }
   }
 }
 
 //------------------------------early unrolling analysis------------------------------
 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {

@@ -151,17 +186,21 @@
   size_t ignored_size = lpt()->_body.size();
   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
   Node_Stack nstack((int)ignored_size);
   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   Node *cl_exit = cl->loopexit();
+  int rpo_idx = _post_block.length();
+
+  assert(rpo_idx == 0, "post loop block is empty");
 
   // First clear the entries
   for (uint i = 0; i < lpt()->_body.size(); i++) {
     ignored_loop_nodes[i] = -1;
   }
 
   int max_vector = Matcher::max_vector_size(T_INT);
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 
   // Process the loop, some/all of the stack entries will not be in order, ergo
   // need to preprocess the ignored initial state before we process the loop
   for (uint i = 0; i < lpt()->_body.size(); i++) {
     Node* n = lpt()->_body.at(i);

@@ -256,20 +295,39 @@
   }
 
   if (is_slp) {
     // Now we try to find the maximum supported consistent vector which the machine
     // description can use
+    bool small_basic_type = false;
     for (uint i = 0; i < lpt()->_body.size(); i++) {
       if (ignored_loop_nodes[i] != -1) continue;
 
       BasicType bt;
       Node* n = lpt()->_body.at(i);
       if (n->is_Mem()) {
         bt = n->as_Mem()->memory_type();
       } else {
         bt = n->bottom_type()->basic_type();
       }
+
+      if (post_loop_allowed) {
+        if (!small_basic_type) {
+          switch (bt) {
+          case T_CHAR:
+          case T_BYTE:
+          case T_SHORT:
+            small_basic_type = true;
+            break;
+          case T_LONG:
+            // TODO: Remove when support completed for mask context with LONG.
+            //       The ad files need to be augmented.
+            small_basic_type = true;
+            break;
+          }
+        }
+      }
+
       if (is_java_primitive(bt) == false) continue;
 
       int cur_max_vector = Matcher::max_vector_size(bt);
 
       // If a max vector exists which is not larger than _local_loop_unroll_factor

@@ -285,19 +343,32 @@
       // Map the maximal common vector
       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
         if (cur_max_vector < max_vector) {
           max_vector = cur_max_vector;
         }
+
+        // We only process post loops on predicated targets where we want to
+        // mask map the loop to a single iteration
+        if (post_loop_allowed) {
+          _post_block.at_put_grow(rpo_idx++, n);
+        }
       }
     }
     if (is_slp) {
       local_loop_unroll_factor = max_vector;
       cl->mark_passed_slp();
     }
     cl->mark_was_slp();
+    if (cl->is_main_loop()) {
+      cl->set_slp_max_unroll(local_loop_unroll_factor);
+    } else if (post_loop_allowed) {
+      if (!small_basic_type) {
+        // avoid replication context for small basic types in programmable masked loops
     cl->set_slp_max_unroll(local_loop_unroll_factor);
   }
+    }
+  }
 }
 
 //------------------------------SLP_extract---------------------------
 // Extract the superword level parallelism
 //

@@ -347,16 +418,20 @@
 #endif
   // Ready the block
   if (!construct_bb()) {
     return; // Exit if no interesting nodes or complex graph.
   }
+
   // build    _dg, _disjoint_ptrs
   dependence_graph();
 
   // compute function depth(Node*)
   compute_max_depth();
 
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
+  if (cl->is_main_loop()) {
   if (_do_vector_loop) {
     if (mark_generations() != -1) {
       hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
 
       if (!construct_bb()) {

@@ -407,10 +482,43 @@
   }
 
   filter_packs();
 
   schedule();
+  } else if (post_loop_allowed) {
+    int saved_mapped_unroll_factor = cl->slp_max_unroll();
+    if (saved_mapped_unroll_factor) {
+      int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
+
+      // now reset the slp_unroll_factor so that we can check that the analysis
+      // maps the post loop to what the vector loop was mapped to
+      cl->set_slp_max_unroll(0);
+
+      // do the analysis on the post loop
+      unrolling_analysis(vector_mapped_unroll_factor);
+
+      // if our analyzed loop is a canonical fit, start processing it
+      if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
+        // now add the vector nodes to packsets
+        for (int i = 0; i < _post_block.length(); i++) {
+          Node* n = _post_block.at(i);
+          Node_List* singleton = new Node_List();
+          singleton->push(n);
+          _packset.append(singleton);
+          set_my_pack(n, singleton);
+        }
+
+        // map base types for vector usage
+        compute_vector_element_type();
+      } else {
+        return;
+      }
+    } else {
+      // for some reason we could not map the slp analysis state of the vectorized loop
+      return;
+    }
+  }
 
   output();
 }
 
 //------------------------------find_adjacent_refs---------------------------

@@ -808,10 +916,11 @@
 //---------------------------dependence_graph---------------------------
 // Construct dependency graph.
 // Add dependence edges to load/store nodes for memory dependence
 //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
 void SuperWord::dependence_graph() {
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   // First, assign a dependence node to each memory node
   for (int i = 0; i < _block.length(); i++ ) {
     Node *n = _block.at(i);
     if (n->is_Mem() || n->is_Phi() && n->bottom_type() == Type::MEMORY) {
       _dg.make_node(n);

@@ -822,11 +931,13 @@
   for (int i = 0; i < _mem_slice_head.length(); i++) {
     Node* n      = _mem_slice_head.at(i);
     Node* n_tail = _mem_slice_tail.at(i);
 
     // Get slice in predecessor order (last is first)
+    if (cl->is_main_loop()) {
     mem_slice_preds(n_tail, n, _nlist);
+    }
 
 #ifndef PRODUCT
     if(TraceSuperWord && Verbose) {
       tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
       for (int j = _nlist.length() - 1; j >= 0 ; j--) {

@@ -2026,24 +2137,27 @@
     tty->print("SuperWord::output    ");
     lpt()->dump_head();
   }
 #endif
 
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+  if (cl->is_main_loop()) {
   // MUST ENSURE main loop's initial value is properly aligned:
   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
 
   align_initial_loop_index(align_to_ref());
 
   // Insert extract (unpack) operations for scalar uses
   for (int i = 0; i < _packset.length(); i++) {
     insert_extracts(_packset.at(i));
   }
+  }
 
   Compile* C = _phase->C;
-  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   uint max_vlen_in_bytes = 0;
   uint max_vlen = 0;
+  bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
 
   NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
 
   CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
 

@@ -2061,10 +2175,14 @@
       uint vlen = p->size();
       uint vlen_in_bytes = 0;
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
+      if (can_process_post_loop) {
+        // override vlen with the main loop's vector length
+        vlen = cl->slp_max_unroll();
+      }
       NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
       int   opc = n->Opcode();
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);

@@ -2150,10 +2268,14 @@
         // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
         Node* in = vector_opd(p, 1);
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else if (is_cmov_pack(p)) {
+        if (can_process_post_loop) {
+          // do not refactor flow in post loop context
+          return;
+        }
         if (!n->is_CMove()) {
           continue;
         }
         // place here CMoveVDNode
         NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})

@@ -2214,18 +2336,27 @@
           return; //and reverse to backup IG
         }
         ShouldNotReachHere();
       }
 
+      _block.at_put(i, vn);
       _igvn.register_new_node_with_optimizer(vn);
       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
       for (uint j = 0; j < p->size(); j++) {
         Node* pm = p->at(j);
         _igvn.replace_node(pm, vn);
       }
       _igvn._worklist.push(vn);
 
+      if (can_process_post_loop) {
+        // first check if the vector size is the maximum vector size which we can use on the machine,
+        // other vector sizes have reduced values for predicated data mapping.
+        if (vlen_in_bytes != (uint)MaxVectorSize) {
+          return;
+        }
+      }
+
       if (vlen_in_bytes > max_vlen_in_bytes) {
         max_vlen = vlen;
         max_vlen_in_bytes = vlen_in_bytes;
       }
 #ifdef ASSERT

@@ -2244,19 +2375,41 @@
       uint slp_max_unroll_factor = cl->slp_max_unroll();
       if (slp_max_unroll_factor == max_vlen) {
         if (TraceSuperWordLoopUnrollAnalysis) {
           tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
         }
-        // For atomic unrolled loops which are vector mapped, instigate more unrolling.
+
+        // For atomic unrolled loops which are vector mapped, instigate more unrolling
         cl->set_notpassed_slp();
-        // if vector resources are limited, do not allow additional unrolling
+        if (cl->is_main_loop()) {
+          // if vector resources are limited, do not allow additional unrolling, also
+          // do not unroll more on pure vector loops which were not reduced so that we can
+          // program the post loop to single iteration execution.
         if (FLOATPRESSURE > 8) {
           C->set_major_progress();
-        }
         cl->mark_do_unroll_only();
+          }
+        }
+
         if (do_reserve_copy()) {
           cl->mark_loop_vectorized();
+          if (can_process_post_loop) {
+            // Now create the difference of trip and limit and use it as our mask index.
+            // Note: We limited the unroll of the vectorized loop so that
+            //       only vlen-1 size iterations can remain to be mask programmed.
+            Node *incr = cl->incr();
+            SubINode *index = new SubINode(cl->limit(), cl->init_trip());
+            _igvn.register_new_node_with_optimizer(index);
+            MaskCreateINode *mask = new MaskCreateINode(_phase->get_ctrl(cl->init_trip()), index);
+            _igvn.register_new_node_with_optimizer(mask);
+            // make this a single iteration loop
+            AddINode *new_incr = new AddINode(incr->in(1), mask);
+            _igvn.register_new_node_with_optimizer(new_incr);
+            _phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
+            _igvn.replace_node(incr, new_incr);
+            cl->mark_is_multiversioned();
+          }
         }
       }
     }
   }
 

@@ -2271,10 +2424,16 @@
 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
   Node* p0 = p->at(0);
   uint vlen = p->size();
   Node* opd = p0->in(opd_idx);
+  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
+
+  if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
+    // override vlen with the main loop's vector length
+    vlen = cl->slp_max_unroll();
+  }
 
   if (same_inputs(p, opd_idx)) {
     if (opd->is_Vector() || opd->is_LoadVector()) {
       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
       if (opd_idx == 2 && VectorNode::is_shift(p0)) {

@@ -3087,17 +3246,17 @@
   CountedLoopNode* loop_node = pre_end->loopnode();
   if (loop_node == NULL || !loop_node->is_pre_loop()) return NULL;
   return pre_end;
 }
 
-
 //------------------------------init---------------------------
 void SuperWord::init() {
   _dg.init();
   _packset.clear();
   _disjoint_ptrs.clear();
   _block.clear();
+  _post_block.clear();
   _data_entry.clear();
   _mem_slice_head.clear();
   _mem_slice_tail.clear();
   _iteration_first.clear();
   _iteration_last.clear();

@@ -3117,10 +3276,11 @@
 // Reset the SuperWord working state: re-initialize the dependence graph (_dg)
 // and empty the collections listed below. Unlike init(), this does not clear
 // _iteration_first / _iteration_last.
 void SuperWord::restart() {
   _dg.init();                 // dependence graph built by dependence_graph()
   _packset.clear();           // packs for the current block
   _disjoint_ptrs.clear();
   _block.clear();             // nodes in current block
+  _post_block.clear();        // nodes marked as post loop vectorizable
   _data_entry.clear();        // nodes with all inputs from outside
   _mem_slice_head.clear();    // memory slice heads
   _mem_slice_tail.clear();    // memory slice tails
   _node_info.clear();         // info needed per node
 }
< prev index next >