< prev index next >

src/share/vm/opto/superword.cpp

Print this page




  83   assert(UseSuperWord, "should be");
  84   // Do vectors exist on this architecture?
  85   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  86 
  87   assert(lpt->_head->is_CountedLoop(), "must be");
  88   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
  89 
  90   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
  91 
  92   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
  93 
  94   // Check for no control flow in body (other than exit)
  95   Node *cl_exit = cl->loopexit();
  96   if (cl_exit->in(0) != lpt->_head) return;
  97 
  98   // Make sure the are no extra control users of the loop backedge
  99   if (cl->back_control()->outcnt() != 1) {
 100     return;
 101   }
 102 




 103   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 104   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 105   if (pre_end == NULL) return;
 106   Node *pre_opaq1 = pre_end->limit();
 107   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 108 
 109   init(); // initialize data structures
 110 
 111   set_lpt(lpt);
 112   set_lp(cl);
 113 
 114   // For now, define one block which is the entire loop body
 115   set_bb(cl);
 116 
 117   if (do_optimization) {
 118     assert(_packset.length() == 0, "packset must be empty");
 119     SLP_extract();
 120   }
 121 }
 122 
 123 //------------------------------early unrolling analysis------------------------------
 124 void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
 125   bool is_slp = true;
 126   ResourceMark rm;
 127   size_t ignored_size = lpt()->_body.size();
 128   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 129   Node_Stack nstack((int)ignored_size);

 130   Node *cl_exit = cl->loopexit();
 131 
 132   // First clear the entries
 133   for (uint i = 0; i < lpt()->_body.size(); i++) {
 134     ignored_loop_nodes[i] = -1;
 135   }
 136 
 137   int max_vector = Matcher::max_vector_size(T_INT);
 138 
 139   // Process the loop, some/all of the stack entries will not be in order, ergo
 140   // need to preprocess the ignored initial state before we process the loop
 141   for (uint i = 0; i < lpt()->_body.size(); i++) {
 142     Node* n = lpt()->_body.at(i);
 143     if (n == cl->incr() ||
 144       n->is_reduction() ||
 145       n->is_AddP() ||
 146       n->is_Cmp() ||
 147       n->is_IfTrue() ||
 148       n->is_CountedLoop() ||
 149       (n == cl_exit)) {


 224   }
 225 
 226   if (is_slp) {
 227     // Now we try to find the maximum supported consistent vector which the machine
 228     // description can use
 229     for (uint i = 0; i < lpt()->_body.size(); i++) {
 230       if (ignored_loop_nodes[i] != -1) continue;
 231 
 232       BasicType bt;
 233       Node* n = lpt()->_body.at(i);
 234       if (n->is_Store()) {
 235         bt = n->as_Mem()->memory_type();
 236       } else {
 237         bt = n->bottom_type()->basic_type();
 238       }
 239 
 240       int cur_max_vector = Matcher::max_vector_size(bt);
 241 
 242       // If a max vector exists which is not larger than _local_loop_unroll_factor
 243       // stop looking, we already have the max vector to map to.
 244       if (cur_max_vector <= local_loop_unroll_factor) {
 245         is_slp = false;
 246 #ifndef PRODUCT
 247         if (TraceSuperWordLoopUnrollAnalysis) {
 248           tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
 249         }
 250 #endif
 251         break;
 252       }
 253 
 254       // Map the maximal common vector
 255       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 256         if (cur_max_vector < max_vector) {
 257           max_vector = cur_max_vector;
 258         }
 259       }
 260     }
 261     if (is_slp) {
 262       local_loop_unroll_factor = max_vector;
 263     }
 264     cl->mark_passed_slp();


 265     cl->set_slp_max_unroll(local_loop_unroll_factor);
 266   }
 267 }
 268 
 269 //------------------------------SLP_extract---------------------------
 270 // Extract the superword level parallelism
 271 //
 272 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 273 //    this list from first to last, all definitions are visited before their uses.
 274 //
 275 // 2) A point-to-point dependence graph is constructed between memory references.
 276 //    This simplies the upcoming "independence" checker.
 277 //
 278 // 3) The maximum depth in the node graph from the beginning of the block
 279 //    to each node is computed.  This is used to prune the graph search
 280 //    in the independence checker.
 281 //
 282 // 4) For integer types, the necessary bit width is propagated backwards
 283 //    from stores to allow packed operations on byte, char, and short
 284 //    integers.  This reverses the promotion to type "int" that javac


1733   if (_packset.length() == 0) return;
1734 
1735 #ifndef PRODUCT
1736   if (TraceLoopOpts) {
1737     tty->print("SuperWord    ");
1738     lpt()->dump_head();
1739   }
1740 #endif
1741 
1742   // MUST ENSURE main loop's initial value is properly aligned:
1743   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
1744 
1745   align_initial_loop_index(align_to_ref());
1746 
1747   // Insert extract (unpack) operations for scalar uses
1748   for (int i = 0; i < _packset.length(); i++) {
1749     insert_extracts(_packset.at(i));
1750   }
1751 
1752   Compile* C = _phase->C;

1753   uint max_vlen_in_bytes = 0;

1754   for (int i = 0; i < _block.length(); i++) {
1755     Node* n = _block.at(i);
1756     Node_List* p = my_pack(n);
1757     if (p && n == executed_last(p)) {
1758       uint vlen = p->size();
1759       uint vlen_in_bytes = 0;
1760       Node* vn = NULL;
1761       Node* low_adr = p->at(0);
1762       Node* first   = executed_first(p);
1763       int   opc = n->Opcode();
1764       if (n->is_Load()) {
1765         Node* ctl = n->in(MemNode::Control);
1766         Node* mem = first->in(MemNode::Memory);
1767         SWPointer p1(n->as_Mem(), this, NULL, false);
1768         // Identify the memory dependency for the new loadVector node by
1769         // walking up through memory chain.
1770         // This is done to give flexibility to the new loadVector node so that
1771         // it can move above independent storeVector nodes.
1772         while (mem->is_StoreVector()) {
1773           SWPointer p2(mem->as_Mem(), this, NULL, false);


1816           } else {
1817             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
1818           }
1819         } else {
1820           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
1821           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
1822         }
1823       } else {
1824         ShouldNotReachHere();
1825       }
1826       assert(vn != NULL, "sanity");
1827       _igvn.register_new_node_with_optimizer(vn);
1828       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1829       for (uint j = 0; j < p->size(); j++) {
1830         Node* pm = p->at(j);
1831         _igvn.replace_node(pm, vn);
1832       }
1833       _igvn._worklist.push(vn);
1834 
1835       if (vlen_in_bytes > max_vlen_in_bytes) {

1836         max_vlen_in_bytes = vlen_in_bytes;
1837       }
1838 #ifdef ASSERT
1839       if (TraceNewVectors) {
1840         tty->print("new Vector node: ");
1841         vn->dump();
1842       }
1843 #endif
1844     }
1845   }
1846   C->set_max_vector_size(max_vlen_in_bytes);












1847 }
1848 
1849 //------------------------------vector_opd---------------------------
1850 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
1851 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
1852   Node* p0 = p->at(0);
1853   uint vlen = p->size();
1854   Node* opd = p0->in(opd_idx);
1855 
1856   if (same_inputs(p, opd_idx)) {
1857     if (opd->is_Vector() || opd->is_LoadVector()) {
1858       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
1859       return opd; // input is matching vector
1860     }
1861     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
1862       Compile* C = _phase->C;
1863       Node* cnt = opd;
1864       // Vector instructions do not mask shift count, do it here.
1865       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
1866       const TypeInt* t = opd->find_int_type();




  83   assert(UseSuperWord, "should be");
  84   // Do vectors exist on this architecture?
  85   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
  86 
  87   assert(lpt->_head->is_CountedLoop(), "must be");
  88   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
  89 
  90   if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
  91 
  92   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
  93 
  94   // Check for no control flow in body (other than exit)
  95   Node *cl_exit = cl->loopexit();
  96   if (cl_exit->in(0) != lpt->_head) return;
  97 
  98   // Make sure the are no extra control users of the loop backedge
  99   if (cl->back_control()->outcnt() != 1) {
 100     return;
 101   }
 102 
 103   // We only re-enter slp when we vector mapped a queried loop and we want to
 104   // continue unrolling, in this case, slp is not subsequently done.
 105   if (cl->ignore_slp()) return;
 106 
 107   // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
 108   CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
 109   if (pre_end == NULL) return;
 110   Node *pre_opaq1 = pre_end->limit();
 111   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 112 
 113   init(); // initialize data structures
 114 
 115   set_lpt(lpt);
 116   set_lp(cl);
 117 
 118   // For now, define one block which is the entire loop body
 119   set_bb(cl);
 120 
 121   if (do_optimization) {
 122     assert(_packset.length() == 0, "packset must be empty");
 123     SLP_extract();
 124   }
 125 }
 126 
 127 //------------------------------early unrolling analysis------------------------------
 128 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
 129   bool is_slp = true;
 130   ResourceMark rm;
 131   size_t ignored_size = lpt()->_body.size();
 132   int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
 133   Node_Stack nstack((int)ignored_size);
 134   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
 135   Node *cl_exit = cl->loopexit();
 136 
 137   // First clear the entries
 138   for (uint i = 0; i < lpt()->_body.size(); i++) {
 139     ignored_loop_nodes[i] = -1;
 140   }
 141 
 142   int max_vector = Matcher::max_vector_size(T_INT);
 143 
 144   // Process the loop, some/all of the stack entries will not be in order, ergo
 145   // need to preprocess the ignored initial state before we process the loop
 146   for (uint i = 0; i < lpt()->_body.size(); i++) {
 147     Node* n = lpt()->_body.at(i);
 148     if (n == cl->incr() ||
 149       n->is_reduction() ||
 150       n->is_AddP() ||
 151       n->is_Cmp() ||
 152       n->is_IfTrue() ||
 153       n->is_CountedLoop() ||
 154       (n == cl_exit)) {


 229   }
 230 
 231   if (is_slp) {
 232     // Now we try to find the maximum supported consistent vector which the machine
 233     // description can use
 234     for (uint i = 0; i < lpt()->_body.size(); i++) {
 235       if (ignored_loop_nodes[i] != -1) continue;
 236 
 237       BasicType bt;
 238       Node* n = lpt()->_body.at(i);
 239       if (n->is_Store()) {
 240         bt = n->as_Mem()->memory_type();
 241       } else {
 242         bt = n->bottom_type()->basic_type();
 243       }
 244 
 245       int cur_max_vector = Matcher::max_vector_size(bt);
 246 
 247       // If a max vector exists which is not larger than _local_loop_unroll_factor
 248       // stop looking, we already have the max vector to map to.
 249       if (cur_max_vector < local_loop_unroll_factor) {
 250         is_slp = false;
 251         NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis fails: unroll limit greater than max vector\n"));




 252         break;
 253       }
 254 
 255       // Map the maximal common vector
 256       if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
 257         if (cur_max_vector < max_vector) {
 258           max_vector = cur_max_vector;
 259         }
 260       }
 261     }
 262     if (is_slp) {
 263       local_loop_unroll_factor = max_vector;

 264       cl->mark_passed_slp();
 265     }
 266     cl->mark_was_slp();
 267     cl->set_slp_max_unroll(local_loop_unroll_factor);
 268   }
 269 }
 270 
 271 //------------------------------SLP_extract---------------------------
 272 // Extract the superword level parallelism
 273 //
 274 // 1) A reverse post-order of nodes in the block is constructed.  By scanning
 275 //    this list from first to last, all definitions are visited before their uses.
 276 //
 277 // 2) A point-to-point dependence graph is constructed between memory references.
 278 //    This simplies the upcoming "independence" checker.
 279 //
 280 // 3) The maximum depth in the node graph from the beginning of the block
 281 //    to each node is computed.  This is used to prune the graph search
 282 //    in the independence checker.
 283 //
 284 // 4) For integer types, the necessary bit width is propagated backwards
 285 //    from stores to allow packed operations on byte, char, and short
 286 //    integers.  This reverses the promotion to type "int" that javac


1735   if (_packset.length() == 0) return;
1736 
1737 #ifndef PRODUCT
1738   if (TraceLoopOpts) {
1739     tty->print("SuperWord    ");
1740     lpt()->dump_head();
1741   }
1742 #endif
1743 
1744   // MUST ENSURE main loop's initial value is properly aligned:
1745   //  (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
1746 
1747   align_initial_loop_index(align_to_ref());
1748 
1749   // Insert extract (unpack) operations for scalar uses
1750   for (int i = 0; i < _packset.length(); i++) {
1751     insert_extracts(_packset.at(i));
1752   }
1753 
1754   Compile* C = _phase->C;
1755   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
1756   uint max_vlen_in_bytes = 0;
1757   uint max_vlen = 0;
1758   for (int i = 0; i < _block.length(); i++) {
1759     Node* n = _block.at(i);
1760     Node_List* p = my_pack(n);
1761     if (p && n == executed_last(p)) {
1762       uint vlen = p->size();
1763       uint vlen_in_bytes = 0;
1764       Node* vn = NULL;
1765       Node* low_adr = p->at(0);
1766       Node* first   = executed_first(p);
1767       int   opc = n->Opcode();
1768       if (n->is_Load()) {
1769         Node* ctl = n->in(MemNode::Control);
1770         Node* mem = first->in(MemNode::Memory);
1771         SWPointer p1(n->as_Mem(), this, NULL, false);
1772         // Identify the memory dependency for the new loadVector node by
1773         // walking up through memory chain.
1774         // This is done to give flexibility to the new loadVector node so that
1775         // it can move above independent storeVector nodes.
1776         while (mem->is_StoreVector()) {
1777           SWPointer p2(mem->as_Mem(), this, NULL, false);


1820           } else {
1821             vlen_in_bytes = in2->as_Vector()->length_in_bytes();
1822           }
1823         } else {
1824           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
1825           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
1826         }
1827       } else {
1828         ShouldNotReachHere();
1829       }
1830       assert(vn != NULL, "sanity");
1831       _igvn.register_new_node_with_optimizer(vn);
1832       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1833       for (uint j = 0; j < p->size(); j++) {
1834         Node* pm = p->at(j);
1835         _igvn.replace_node(pm, vn);
1836       }
1837       _igvn._worklist.push(vn);
1838 
1839       if (vlen_in_bytes > max_vlen_in_bytes) {
1840         max_vlen = vlen;
1841         max_vlen_in_bytes = vlen_in_bytes;
1842       }
1843 #ifdef ASSERT
1844       if (TraceNewVectors) {
1845         tty->print("new Vector node: ");
1846         vn->dump();
1847       }
1848 #endif
1849     }
1850   }
1851   C->set_max_vector_size(max_vlen_in_bytes);
1852   if (SuperWordLoopUnrollAnalysis) {
1853     if (cl->has_passed_slp()) {
1854       int slp_max_unroll_factor = cl->slp_max_unroll();
1855       if (slp_max_unroll_factor == max_vlen) {
1856         NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
1857         // For atomic unrolled loops which are vector mapped, instigate more unrolling.
1858         cl->set_notpassed_slp();
1859         C->set_major_progress();
1860         cl->mark_no_slp();
1861       }
1862     }
1863   }
1864 }
1865 
1866 //------------------------------vector_opd---------------------------
1867 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
1868 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
1869   Node* p0 = p->at(0);
1870   uint vlen = p->size();
1871   Node* opd = p0->in(opd_idx);
1872 
1873   if (same_inputs(p, opd_idx)) {
1874     if (opd->is_Vector() || opd->is_LoadVector()) {
1875       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
1876       return opd; // input is matching vector
1877     }
1878     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
1879       Compile* C = _phase->C;
1880       Node* cnt = opd;
1881       // Vector instructions do not mask shift count, do it here.
1882       juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
1883       const TypeInt* t = opd->find_int_type();


< prev index next >