83 assert(UseSuperWord, "should be");
84 // Do vectors exist on this architecture?
85 if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
86
87 assert(lpt->_head->is_CountedLoop(), "must be");
88 CountedLoopNode *cl = lpt->_head->as_CountedLoop();
89
90 if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
91
92 if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
93
94 // Check for no control flow in body (other than exit)
95 Node *cl_exit = cl->loopexit();
96 if (cl_exit->in(0) != lpt->_head) return;
97
98 // Make sure the are no extra control users of the loop backedge
99 if (cl->back_control()->outcnt() != 1) {
100 return;
101 }
102
103 // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
104 CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
105 if (pre_end == NULL) return;
106 Node *pre_opaq1 = pre_end->limit();
107 if (pre_opaq1->Opcode() != Op_Opaque1) return;
108
109 init(); // initialize data structures
110
111 set_lpt(lpt);
112 set_lp(cl);
113
114 // For now, define one block which is the entire loop body
115 set_bb(cl);
116
117 if (do_optimization) {
118 assert(_packset.length() == 0, "packset must be empty");
119 SLP_extract();
120 }
121 }
122
123 //------------------------------early unrolling analysis------------------------------
124 void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
125 bool is_slp = true;
126 ResourceMark rm;
127 size_t ignored_size = lpt()->_body.size();
128 int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
129 Node_Stack nstack((int)ignored_size);
130 Node *cl_exit = cl->loopexit();
131
132 // First clear the entries
133 for (uint i = 0; i < lpt()->_body.size(); i++) {
134 ignored_loop_nodes[i] = -1;
135 }
136
137 int max_vector = Matcher::max_vector_size(T_INT);
138
139 // Process the loop, some/all of the stack entries will not be in order, ergo
140 // need to preprocess the ignored initial state before we process the loop
141 for (uint i = 0; i < lpt()->_body.size(); i++) {
142 Node* n = lpt()->_body.at(i);
143 if (n == cl->incr() ||
144 n->is_reduction() ||
145 n->is_AddP() ||
146 n->is_Cmp() ||
147 n->is_IfTrue() ||
148 n->is_CountedLoop() ||
149 (n == cl_exit)) {
224 }
225
226 if (is_slp) {
227 // Now we try to find the maximum supported consistent vector which the machine
228 // description can use
229 for (uint i = 0; i < lpt()->_body.size(); i++) {
230 if (ignored_loop_nodes[i] != -1) continue;
231
232 BasicType bt;
233 Node* n = lpt()->_body.at(i);
234 if (n->is_Store()) {
235 bt = n->as_Mem()->memory_type();
236 } else {
237 bt = n->bottom_type()->basic_type();
238 }
239
240 int cur_max_vector = Matcher::max_vector_size(bt);
241
242 // If a max vector exists which is not larger than _local_loop_unroll_factor
243 // stop looking, we already have the max vector to map to.
244 if (cur_max_vector <= local_loop_unroll_factor) {
245 is_slp = false;
246 #ifndef PRODUCT
247 if (TraceSuperWordLoopUnrollAnalysis) {
248 tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
249 }
250 #endif
251 break;
252 }
253
254 // Map the maximal common vector
255 if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
256 if (cur_max_vector < max_vector) {
257 max_vector = cur_max_vector;
258 }
259 }
260 }
261 if (is_slp) {
262 local_loop_unroll_factor = max_vector;
263 }
264 cl->mark_passed_slp();
265 cl->set_slp_max_unroll(local_loop_unroll_factor);
266 }
267 }
268
269 //------------------------------SLP_extract---------------------------
270 // Extract the superword level parallelism
271 //
272 // 1) A reverse post-order of nodes in the block is constructed. By scanning
273 // this list from first to last, all definitions are visited before their uses.
274 //
275 // 2) A point-to-point dependence graph is constructed between memory references.
276 // This simplies the upcoming "independence" checker.
277 //
278 // 3) The maximum depth in the node graph from the beginning of the block
279 // to each node is computed. This is used to prune the graph search
280 // in the independence checker.
281 //
282 // 4) For integer types, the necessary bit width is propagated backwards
283 // from stores to allow packed operations on byte, char, and short
284 // integers. This reverses the promotion to type "int" that javac
1733 if (_packset.length() == 0) return;
1734
1735 #ifndef PRODUCT
1736 if (TraceLoopOpts) {
1737 tty->print("SuperWord ");
1738 lpt()->dump_head();
1739 }
1740 #endif
1741
1742 // MUST ENSURE main loop's initial value is properly aligned:
1743 // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
1744
1745 align_initial_loop_index(align_to_ref());
1746
1747 // Insert extract (unpack) operations for scalar uses
1748 for (int i = 0; i < _packset.length(); i++) {
1749 insert_extracts(_packset.at(i));
1750 }
1751
1752 Compile* C = _phase->C;
1753 uint max_vlen_in_bytes = 0;
1754 for (int i = 0; i < _block.length(); i++) {
1755 Node* n = _block.at(i);
1756 Node_List* p = my_pack(n);
1757 if (p && n == executed_last(p)) {
1758 uint vlen = p->size();
1759 uint vlen_in_bytes = 0;
1760 Node* vn = NULL;
1761 Node* low_adr = p->at(0);
1762 Node* first = executed_first(p);
1763 int opc = n->Opcode();
1764 if (n->is_Load()) {
1765 Node* ctl = n->in(MemNode::Control);
1766 Node* mem = first->in(MemNode::Memory);
1767 SWPointer p1(n->as_Mem(), this, NULL, false);
1768 // Identify the memory dependency for the new loadVector node by
1769 // walking up through memory chain.
1770 // This is done to give flexibility to the new loadVector node so that
1771 // it can move above independent storeVector nodes.
1772 while (mem->is_StoreVector()) {
1773 SWPointer p2(mem->as_Mem(), this, NULL, false);
1816 } else {
1817 vlen_in_bytes = in2->as_Vector()->length_in_bytes();
1818 }
1819 } else {
1820 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
1821 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
1822 }
1823 } else {
1824 ShouldNotReachHere();
1825 }
1826 assert(vn != NULL, "sanity");
1827 _igvn.register_new_node_with_optimizer(vn);
1828 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1829 for (uint j = 0; j < p->size(); j++) {
1830 Node* pm = p->at(j);
1831 _igvn.replace_node(pm, vn);
1832 }
1833 _igvn._worklist.push(vn);
1834
1835 if (vlen_in_bytes > max_vlen_in_bytes) {
1836 max_vlen_in_bytes = vlen_in_bytes;
1837 }
1838 #ifdef ASSERT
1839 if (TraceNewVectors) {
1840 tty->print("new Vector node: ");
1841 vn->dump();
1842 }
1843 #endif
1844 }
1845 }
1846 C->set_max_vector_size(max_vlen_in_bytes);
1847 }
1848
1849 //------------------------------vector_opd---------------------------
1850 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
1851 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
1852 Node* p0 = p->at(0);
1853 uint vlen = p->size();
1854 Node* opd = p0->in(opd_idx);
1855
1856 if (same_inputs(p, opd_idx)) {
1857 if (opd->is_Vector() || opd->is_LoadVector()) {
1858 assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
1859 return opd; // input is matching vector
1860 }
1861 if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
1862 Compile* C = _phase->C;
1863 Node* cnt = opd;
1864 // Vector instructions do not mask shift count, do it here.
1865 juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
1866 const TypeInt* t = opd->find_int_type();
|
83 assert(UseSuperWord, "should be");
84 // Do vectors exist on this architecture?
85 if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
86
87 assert(lpt->_head->is_CountedLoop(), "must be");
88 CountedLoopNode *cl = lpt->_head->as_CountedLoop();
89
90 if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
91
92 if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
93
94 // Check for no control flow in body (other than exit)
95 Node *cl_exit = cl->loopexit();
96 if (cl_exit->in(0) != lpt->_head) return;
97
98 // Make sure the are no extra control users of the loop backedge
99 if (cl->back_control()->outcnt() != 1) {
100 return;
101 }
102
103 // We only re-enter slp when we vector mapped a queried loop and we want to
104 // continue unrolling, in this case, slp is not subsequently done.
105 if (cl->ignore_slp()) return;
106
107 // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
108 CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
109 if (pre_end == NULL) return;
110 Node *pre_opaq1 = pre_end->limit();
111 if (pre_opaq1->Opcode() != Op_Opaque1) return;
112
113 init(); // initialize data structures
114
115 set_lpt(lpt);
116 set_lp(cl);
117
118 // For now, define one block which is the entire loop body
119 set_bb(cl);
120
121 if (do_optimization) {
122 assert(_packset.length() == 0, "packset must be empty");
123 SLP_extract();
124 }
125 }
126
127 //------------------------------early unrolling analysis------------------------------
128 void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
129 bool is_slp = true;
130 ResourceMark rm;
131 size_t ignored_size = lpt()->_body.size();
132 int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
133 Node_Stack nstack((int)ignored_size);
134 CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
135 Node *cl_exit = cl->loopexit();
136
137 // First clear the entries
138 for (uint i = 0; i < lpt()->_body.size(); i++) {
139 ignored_loop_nodes[i] = -1;
140 }
141
142 int max_vector = Matcher::max_vector_size(T_INT);
143
144 // Process the loop, some/all of the stack entries will not be in order, ergo
145 // need to preprocess the ignored initial state before we process the loop
146 for (uint i = 0; i < lpt()->_body.size(); i++) {
147 Node* n = lpt()->_body.at(i);
148 if (n == cl->incr() ||
149 n->is_reduction() ||
150 n->is_AddP() ||
151 n->is_Cmp() ||
152 n->is_IfTrue() ||
153 n->is_CountedLoop() ||
154 (n == cl_exit)) {
229 }
230
231 if (is_slp) {
232 // Now we try to find the maximum supported consistent vector which the machine
233 // description can use
234 for (uint i = 0; i < lpt()->_body.size(); i++) {
235 if (ignored_loop_nodes[i] != -1) continue;
236
237 BasicType bt;
238 Node* n = lpt()->_body.at(i);
239 if (n->is_Store()) {
240 bt = n->as_Mem()->memory_type();
241 } else {
242 bt = n->bottom_type()->basic_type();
243 }
244
245 int cur_max_vector = Matcher::max_vector_size(bt);
246
247 // If a max vector exists which is not larger than _local_loop_unroll_factor
248 // stop looking, we already have the max vector to map to.
249 if (cur_max_vector < local_loop_unroll_factor) {
250 is_slp = false;
251 NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("slp analysis fails: unroll limit greater than max vector\n"));
252 break;
253 }
254
255 // Map the maximal common vector
256 if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
257 if (cur_max_vector < max_vector) {
258 max_vector = cur_max_vector;
259 }
260 }
261 }
262 if (is_slp) {
263 local_loop_unroll_factor = max_vector;
264 cl->mark_passed_slp();
265 }
266 cl->mark_was_slp();
267 cl->set_slp_max_unroll(local_loop_unroll_factor);
268 }
269 }
270
271 //------------------------------SLP_extract---------------------------
272 // Extract the superword level parallelism
273 //
274 // 1) A reverse post-order of nodes in the block is constructed. By scanning
275 // this list from first to last, all definitions are visited before their uses.
276 //
277 // 2) A point-to-point dependence graph is constructed between memory references.
278 // This simplies the upcoming "independence" checker.
279 //
280 // 3) The maximum depth in the node graph from the beginning of the block
281 // to each node is computed. This is used to prune the graph search
282 // in the independence checker.
283 //
284 // 4) For integer types, the necessary bit width is propagated backwards
285 // from stores to allow packed operations on byte, char, and short
286 // integers. This reverses the promotion to type "int" that javac
1735 if (_packset.length() == 0) return;
1736
1737 #ifndef PRODUCT
1738 if (TraceLoopOpts) {
1739 tty->print("SuperWord ");
1740 lpt()->dump_head();
1741 }
1742 #endif
1743
1744 // MUST ENSURE main loop's initial value is properly aligned:
1745 // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
1746
1747 align_initial_loop_index(align_to_ref());
1748
1749 // Insert extract (unpack) operations for scalar uses
1750 for (int i = 0; i < _packset.length(); i++) {
1751 insert_extracts(_packset.at(i));
1752 }
1753
1754 Compile* C = _phase->C;
1755 CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
1756 uint max_vlen_in_bytes = 0;
1757 uint max_vlen = 0;
1758 for (int i = 0; i < _block.length(); i++) {
1759 Node* n = _block.at(i);
1760 Node_List* p = my_pack(n);
1761 if (p && n == executed_last(p)) {
1762 uint vlen = p->size();
1763 uint vlen_in_bytes = 0;
1764 Node* vn = NULL;
1765 Node* low_adr = p->at(0);
1766 Node* first = executed_first(p);
1767 int opc = n->Opcode();
1768 if (n->is_Load()) {
1769 Node* ctl = n->in(MemNode::Control);
1770 Node* mem = first->in(MemNode::Memory);
1771 SWPointer p1(n->as_Mem(), this, NULL, false);
1772 // Identify the memory dependency for the new loadVector node by
1773 // walking up through memory chain.
1774 // This is done to give flexibility to the new loadVector node so that
1775 // it can move above independent storeVector nodes.
1776 while (mem->is_StoreVector()) {
1777 SWPointer p2(mem->as_Mem(), this, NULL, false);
1820 } else {
1821 vlen_in_bytes = in2->as_Vector()->length_in_bytes();
1822 }
1823 } else {
1824 vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
1825 vlen_in_bytes = vn->as_Vector()->length_in_bytes();
1826 }
1827 } else {
1828 ShouldNotReachHere();
1829 }
1830 assert(vn != NULL, "sanity");
1831 _igvn.register_new_node_with_optimizer(vn);
1832 _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
1833 for (uint j = 0; j < p->size(); j++) {
1834 Node* pm = p->at(j);
1835 _igvn.replace_node(pm, vn);
1836 }
1837 _igvn._worklist.push(vn);
1838
1839 if (vlen_in_bytes > max_vlen_in_bytes) {
1840 max_vlen = vlen;
1841 max_vlen_in_bytes = vlen_in_bytes;
1842 }
1843 #ifdef ASSERT
1844 if (TraceNewVectors) {
1845 tty->print("new Vector node: ");
1846 vn->dump();
1847 }
1848 #endif
1849 }
1850 }
1851 C->set_max_vector_size(max_vlen_in_bytes);
1852 if (SuperWordLoopUnrollAnalysis) {
1853 if (cl->has_passed_slp()) {
1854 int slp_max_unroll_factor = cl->slp_max_unroll();
1855 if (slp_max_unroll_factor == max_vlen) {
1856 NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
1857 // For atomic unrolled loops which are vector mapped, instigate more unrolling.
1858 cl->set_notpassed_slp();
1859 C->set_major_progress();
1860 cl->mark_no_slp();
1861 }
1862 }
1863 }
1864 }
1865
1866 //------------------------------vector_opd---------------------------
1867 // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
1868 Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
1869 Node* p0 = p->at(0);
1870 uint vlen = p->size();
1871 Node* opd = p0->in(opd_idx);
1872
1873 if (same_inputs(p, opd_idx)) {
1874 if (opd->is_Vector() || opd->is_LoadVector()) {
1875 assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
1876 return opd; // input is matching vector
1877 }
1878 if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
1879 Compile* C = _phase->C;
1880 Node* cnt = opd;
1881 // Vector instructions do not mask shift count, do it here.
1882 juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1);
1883 const TypeInt* t = opd->find_int_type();
|