rev 4505 : 8014189: JVM crash with SEGV in ConnectionGraph::record_for_escape_analysis()
Summary: Add NULL checks and asserts for Type::make_ptr() returned value.
Reviewed-by: kvn
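
The crash in 8014189 comes from Type::make_ptr() returning NULL for types that have no pointer variant; the old code below (hunk at line 934) dereferenced the result unchecked. The fix routes the lookup through a get_ptr_type() accessor instead. The helper itself lives outside this file and is not shown in this webrev, so the following is only a sketch of its likely defensive shape, inferred from the summary line:

    // Illustrative sketch, not the committed helper. The point of the fix:
    // make_ptr() may return NULL (e.g. for a non-pointer type), so assert
    // before any caller dereferences the result.
    const TypePtr* Node::get_ptr_type() const {
      const TypePtr* tp = this->bottom_type()->make_ptr();
      assert(tp != NULL, "make_ptr() returned NULL");
      return tp;
    }

With a helper of that shape, callers such as Process_OopMap_Node() below keep their product-build behavior while debug builds catch a bad type before the tp->is_oopptr() chain dereferences it.
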
--- old/src/share/vm/opto/output.cpp
+++ new/src/share/vm/opto/output.cpp
1 1 /*
2 2 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 4 *
5 5 * This code is free software; you can redistribute it and/or modify it
6 6 * under the terms of the GNU General Public License version 2 only, as
7 7 * published by the Free Software Foundation.
8 8 *
9 9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 12 * version 2 for more details (a copy is included in the LICENSE file that
13 13 * accompanied this code).
14 14 *
15 15 * You should have received a copy of the GNU General Public License version
16 16 * 2 along with this work; if not, write to the Free Software Foundation,
17 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 18 *
19 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 20 * or visit www.oracle.com if you need additional information or have any
21 21 * questions.
22 22 *
23 23 */
24 24
25 25 #include "precompiled.hpp"
26 26 #include "asm/assembler.inline.hpp"
27 27 #include "code/debugInfo.hpp"
28 28 #include "code/debugInfoRec.hpp"
29 29 #include "compiler/compileBroker.hpp"
30 30 #include "compiler/oopMap.hpp"
31 31 #include "memory/allocation.inline.hpp"
32 32 #include "opto/callnode.hpp"
33 33 #include "opto/cfgnode.hpp"
34 34 #include "opto/locknode.hpp"
35 35 #include "opto/machnode.hpp"
36 36 #include "opto/output.hpp"
37 37 #include "opto/regalloc.hpp"
38 38 #include "opto/runtime.hpp"
39 39 #include "opto/subnode.hpp"
40 40 #include "opto/type.hpp"
41 41 #include "runtime/handles.inline.hpp"
42 42 #include "utilities/xmlstream.hpp"
43 43
44 44 extern uint size_java_to_interp();
45 45 extern uint reloc_java_to_interp();
46 46 extern uint size_exception_handler();
47 47 extern uint size_deopt_handler();
48 48
49 49 #ifndef PRODUCT
50 50 #define DEBUG_ARG(x) , x
51 51 #else
52 52 #define DEBUG_ARG(x)
53 53 #endif
54 54
55 55 extern int emit_exception_handler(CodeBuffer &cbuf);
56 56 extern int emit_deopt_handler(CodeBuffer &cbuf);
57 57
58 58 //------------------------------Output-----------------------------------------
59 59 // Convert Nodes to instruction bits and pass off to the VM
60 60 void Compile::Output() {
61 61 // The RootNode's block must be empty
62 62 assert( _cfg->_broot->_nodes.size() == 0, "" );
63 63
64 64 // The number of new nodes (mostly MachNop) is proportional to
65 65 // the number of java calls and inner loops which are aligned.
66 66 if ( C->check_node_count((NodeLimitFudgeFactor + C->java_calls()*3 +
67 67 C->inner_loops()*(OptoLoopAlignment-1)),
68 68 "out of nodes before code generation" ) ) {
69 69 return;
70 70 }
71 71 // Make sure I can find the Start Node
72 72 Block_Array& bbs = _cfg->_bbs;
73 73 Block *entry = _cfg->_blocks[1];
74 74 Block *broot = _cfg->_broot;
75 75
76 76 const StartNode *start = entry->_nodes[0]->as_Start();
77 77
78 78 // Replace StartNode with prolog
79 79 MachPrologNode *prolog = new (this) MachPrologNode();
80 80 entry->_nodes.map( 0, prolog );
81 81 bbs.map( prolog->_idx, entry );
82 82 bbs.map( start->_idx, NULL ); // start is no longer in any block
83 83
84 84 // Virtual methods need an unverified entry point
85 85
86 86 if( is_osr_compilation() ) {
87 87 if( PoisonOSREntry ) {
88 88 // TODO: Should use a ShouldNotReachHereNode...
89 89 _cfg->insert( broot, 0, new (this) MachBreakpointNode() );
90 90 }
91 91 } else {
92 92 if( _method && !_method->flags().is_static() ) {
93 93 // Insert unvalidated entry point
94 94 _cfg->insert( broot, 0, new (this) MachUEPNode() );
95 95 }
96 96
97 97 }
98 98
99 99
100 100 // Break before main entry point
101 101 if( (_method && _method->break_at_execute())
102 102 #ifndef PRODUCT
103 103 ||(OptoBreakpoint && is_method_compilation())
104 104 ||(OptoBreakpointOSR && is_osr_compilation())
105 105 ||(OptoBreakpointC2R && !_method)
106 106 #endif
107 107 ) {
108 108 // checking for _method means that OptoBreakpoint does not apply to
109 109 // runtime stubs or frame converters
110 110 _cfg->insert( entry, 1, new (this) MachBreakpointNode() );
111 111 }
112 112
113 113 // Insert epilogs before every return
114 114 for( uint i=0; i<_cfg->_num_blocks; i++ ) {
115 115 Block *b = _cfg->_blocks[i];
116 116 if( !b->is_connector() && b->non_connector_successor(0) == _cfg->_broot ) { // Found a program exit point?
117 117 Node *m = b->end();
118 118 if( m->is_Mach() && m->as_Mach()->ideal_Opcode() != Op_Halt ) {
119 119 MachEpilogNode *epilog = new (this) MachEpilogNode(m->as_Mach()->ideal_Opcode() == Op_Return);
120 120 b->add_inst( epilog );
121 121 bbs.map(epilog->_idx, b);
122 122 //_regalloc->set_bad(epilog->_idx); // Already initialized this way.
123 123 }
124 124 }
125 125 }
126 126
127 127 # ifdef ENABLE_ZAP_DEAD_LOCALS
128 128 if ( ZapDeadCompiledLocals ) Insert_zap_nodes();
129 129 # endif
130 130
131 131 uint* blk_starts = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks+1);
132 132 blk_starts[0] = 0;
133 133
134 134 // Initialize code buffer and process short branches.
135 135 CodeBuffer* cb = init_buffer(blk_starts);
136 136
137 137 if (cb == NULL || failing()) return;
138 138
139 139 ScheduleAndBundle();
140 140
141 141 #ifndef PRODUCT
142 142 if (trace_opto_output()) {
143 143 tty->print("\n---- After ScheduleAndBundle ----\n");
144 144 for (uint i = 0; i < _cfg->_num_blocks; i++) {
145 145 tty->print("\nBB#%03d:\n", i);
146 146 Block *bb = _cfg->_blocks[i];
147 147 for (uint j = 0; j < bb->_nodes.size(); j++) {
148 148 Node *n = bb->_nodes[j];
149 149 OptoReg::Name reg = _regalloc->get_reg_first(n);
150 150 tty->print(" %-6s ", reg >= 0 && reg < REG_COUNT ? Matcher::regName[reg] : "");
151 151 n->dump();
152 152 }
153 153 }
154 154 }
155 155 #endif
156 156
157 157 if (failing()) return;
158 158
159 159 BuildOopMaps();
160 160
161 161 if (failing()) return;
162 162
163 163 fill_buffer(cb, blk_starts);
164 164 }
165 165
166 166 bool Compile::need_stack_bang(int frame_size_in_bytes) const {
167 167 // Determine if we need to generate a stack overflow check.
168 168 // Do it if the method is not a stub function and
169 169 // has java calls or has frame size > vm_page_size/8.
170 170 return (UseStackBanging && stub_function() == NULL &&
171 171 (has_java_calls() || frame_size_in_bytes > os::vm_page_size()>>3));
172 172 }
173 173
174 174 bool Compile::need_register_stack_bang() const {
175 175 // Determine if we need to generate a register stack overflow check.
176 176 // This is only used on architectures which have split register
177 177 // and memory stacks (ie. IA64).
178 178 // Bang if the method is not a stub function and has java calls
179 179 return (stub_function() == NULL && has_java_calls());
180 180 }
181 181
182 182 # ifdef ENABLE_ZAP_DEAD_LOCALS
183 183
184 184
185 185 // In order to catch compiler oop-map bugs, we have implemented
186 186 // a debugging mode called ZapDeadCompilerLocals.
187 187 // This mode causes the compiler to insert a call to a runtime routine,
188 188 // "zap_dead_locals", right before each place in compiled code
189 189 // that could potentially be a gc-point (i.e., a safepoint or oop map point).
190 190 // The runtime routine checks that locations mapped as oops are really
191 191 // oops, that locations mapped as values do not look like oops,
192 192 // and that locations mapped as dead are not used later
193 193 // (by zapping them to an invalid address).
194 194
195 195 int Compile::_CompiledZap_count = 0;
196 196
197 197 void Compile::Insert_zap_nodes() {
198 198 bool skip = false;
199 199
200 200
201 201 // Dink with static counts because code without the extra
202 202 // runtime calls is MUCH faster for debugging purposes
203 203
204 204 if ( CompileZapFirst == 0 ) ; // nothing special
205 205 else if ( CompileZapFirst > CompiledZap_count() ) skip = true;
206 206 else if ( CompileZapFirst == CompiledZap_count() )
207 207 warning("starting zap compilation after skipping");
208 208
209 209 if ( CompileZapLast == -1 ) ; // nothing special
210 210 else if ( CompileZapLast < CompiledZap_count() ) skip = true;
211 211 else if ( CompileZapLast == CompiledZap_count() )
212 212 warning("about to compile last zap");
213 213
214 214 ++_CompiledZap_count; // counts skipped zaps, too
215 215
216 216 if ( skip ) return;
217 217
218 218
219 219 if ( _method == NULL )
220 220 return; // no safepoints/oopmaps emitted for calls in stubs, so we don't care
221 221
222 222 // Insert call to zap runtime stub before every node with an oop map
223 223 for( uint i=0; i<_cfg->_num_blocks; i++ ) {
224 224 Block *b = _cfg->_blocks[i];
225 225 for ( uint j = 0; j < b->_nodes.size(); ++j ) {
226 226 Node *n = b->_nodes[j];
227 227
228 228 // Determine if we should insert a zap-a-lot node in output.
229 229 // We do that for all nodes that have oopmap info, except for calls
230 230 // to allocation. Calls to allocation pass in the old top-of-eden pointer
231 231 // and expect the C code to reset it. Hence, there can be no safepoints between
232 232 // the inlined-allocation and the call to new_Java, etc.
233 233 // We also cannot zap monitor calls, as they must hold the microlock
234 234 // during the call to Zap, which also wants to grab the microlock.
235 235 bool insert = n->is_MachSafePoint() && (n->as_MachSafePoint()->oop_map() != NULL);
236 236 if ( insert ) { // it is MachSafePoint
237 237 if ( !n->is_MachCall() ) {
238 238 insert = false;
239 239 } else if ( n->is_MachCall() ) {
240 240 MachCallNode* call = n->as_MachCall();
241 241 if (call->entry_point() == OptoRuntime::new_instance_Java() ||
242 242 call->entry_point() == OptoRuntime::new_array_Java() ||
243 243 call->entry_point() == OptoRuntime::multianewarray2_Java() ||
244 244 call->entry_point() == OptoRuntime::multianewarray3_Java() ||
245 245 call->entry_point() == OptoRuntime::multianewarray4_Java() ||
246 246 call->entry_point() == OptoRuntime::multianewarray5_Java() ||
247 247 call->entry_point() == OptoRuntime::slow_arraycopy_Java() ||
248 248 call->entry_point() == OptoRuntime::complete_monitor_locking_Java()
249 249 ) {
250 250 insert = false;
251 251 }
252 252 }
253 253 if (insert) {
254 254 Node *zap = call_zap_node(n->as_MachSafePoint(), i);
255 255 b->_nodes.insert( j, zap );
256 256 _cfg->_bbs.map( zap->_idx, b );
257 257 ++j;
258 258 }
259 259 }
260 260 }
261 261 }
262 262 }
263 263
264 264
265 265 Node* Compile::call_zap_node(MachSafePointNode* node_to_check, int block_no) {
266 266 const TypeFunc *tf = OptoRuntime::zap_dead_locals_Type();
267 267 CallStaticJavaNode* ideal_node =
268 268 new (this) CallStaticJavaNode( tf,
269 269 OptoRuntime::zap_dead_locals_stub(_method->flags().is_native()),
270 270 "call zap dead locals stub", 0, TypePtr::BOTTOM);
271 271 // We need to copy the OopMap from the site we're zapping at.
272 272 // We have to make a copy, because the zap site might not be
273 273 // a call site, and zap_dead is a call site.
274 274 OopMap* clone = node_to_check->oop_map()->deep_copy();
275 275
276 276 // Add the cloned OopMap to the zap node
277 277 ideal_node->set_oop_map(clone);
278 278 return _matcher->match_sfpt(ideal_node);
279 279 }
280 280
281 281 //------------------------------is_node_getting_a_safepoint--------------------
282 282 bool Compile::is_node_getting_a_safepoint( Node* n) {
283 283 // This code duplicates the logic prior to the call of add_safepoint
284 284 // below in this file.
285 285 if( n->is_MachSafePoint() ) return true;
286 286 return false;
287 287 }
288 288
289 289 # endif // ENABLE_ZAP_DEAD_LOCALS
290 290
291 291 //------------------------------compute_loop_first_inst_sizes------------------
292 292 // Compute the size of first NumberOfLoopInstrToAlign instructions at the top
293 293 // of a loop. When aligning a loop we need to provide enough instructions
294 294 // in the cpu's fetch buffer to feed the decoders. The loop alignment can be
295 295 // avoided if we have enough instructions in the fetch buffer at the head of a loop.
296 296 // By default, the size is set to 999999 by Block's constructor so that
297 297 // a loop will be aligned if the size is not reset here.
298 298 //
299 299 // Note: Mach instructions could contain several HW instructions
300 300 // so the size is estimated only.
301 301 //
302 302 void Compile::compute_loop_first_inst_sizes() {
303 303 // The next condition is used to gate the loop alignment optimization.
304 304 // Don't align a loop if there are enough instructions at the head of a loop
305 305 // or alignment padding is larger than MaxLoopPad. By default, MaxLoopPad
306 306 // is equal to OptoLoopAlignment-1 except on new Intel cpus, where it is
307 307 // equal to 11 bytes which is the largest address NOP instruction.
308 308 if( MaxLoopPad < OptoLoopAlignment-1 ) {
309 309 uint last_block = _cfg->_num_blocks-1;
310 310 for( uint i=1; i <= last_block; i++ ) {
311 311 Block *b = _cfg->_blocks[i];
312 312 // Check the first loop's block which requires an alignment.
313 313 if( b->loop_alignment() > (uint)relocInfo::addr_unit() ) {
314 314 uint sum_size = 0;
315 315 uint inst_cnt = NumberOfLoopInstrToAlign;
316 316 inst_cnt = b->compute_first_inst_size(sum_size, inst_cnt, _regalloc);
317 317
318 318 // Check subsequent fallthrough blocks if the loop's first
319 319 // block(s) does not have enough instructions.
320 320 Block *nb = b;
321 321 while( inst_cnt > 0 &&
322 322 i < last_block &&
323 323 !_cfg->_blocks[i+1]->has_loop_alignment() &&
324 324 !nb->has_successor(b) ) {
325 325 i++;
326 326 nb = _cfg->_blocks[i];
327 327 inst_cnt = nb->compute_first_inst_size(sum_size, inst_cnt, _regalloc);
328 328 } // while( inst_cnt > 0 && i < last_block )
329 329
330 330 b->set_first_inst_size(sum_size);
331 331 } // if( b->head()->is_Loop() )
332 332 } // for( i <= last_block )
333 333 } // if( MaxLoopPad < OptoLoopAlignment-1 )
334 334 }
335 335
336 336 //----------------------shorten_branches---------------------------------------
337 337 // The architecture description provides short branch variants for some long
338 338 // branch instructions. Replace eligible long branches with short branches.
339 339 void Compile::shorten_branches(uint* blk_starts, int& code_size, int& reloc_size, int& stub_size) {
340 340
341 341 // ------------------
342 342 // Compute size of each block, method size, and relocation information size
343 343 uint nblocks = _cfg->_num_blocks;
344 344
345 345 uint* jmp_offset = NEW_RESOURCE_ARRAY(uint,nblocks);
346 346 uint* jmp_size = NEW_RESOURCE_ARRAY(uint,nblocks);
347 347 int* jmp_nidx = NEW_RESOURCE_ARRAY(int ,nblocks);
348 348 DEBUG_ONLY( uint *jmp_target = NEW_RESOURCE_ARRAY(uint,nblocks); )
349 349 DEBUG_ONLY( uint *jmp_rule = NEW_RESOURCE_ARRAY(uint,nblocks); )
350 350
351 351 bool has_short_branch_candidate = false;
352 352
353 353 // Initialize the sizes to 0
354 354 code_size = 0; // Size in bytes of generated code
355 355 stub_size = 0; // Size in bytes of all stub entries
356 356 // Size in bytes of all relocation entries, including those in local stubs.
357 357 // Start with 2-bytes of reloc info for the unvalidated entry point
358 358 reloc_size = 1; // Number of relocation entries
359 359
360 360 // Make three passes. The first computes pessimistic blk_starts,
361 361 // relative jmp_offset and reloc_size information. The second performs
362 362 // short branch substitution using the pessimistic sizing. The
363 363 // third inserts nops where needed.
364 364
365 365 // Step one, perform a pessimistic sizing pass.
366 366 uint last_call_adr = max_uint;
367 367 uint last_avoid_back_to_back_adr = max_uint;
368 368 uint nop_size = (new (this) MachNopNode())->size(_regalloc);
369 369 for (uint i = 0; i < nblocks; i++) { // For all blocks
370 370 Block *b = _cfg->_blocks[i];
371 371
372 372 // During short branch replacement, we store the relative (to blk_starts)
373 373 // offset of jump in jmp_offset, rather than the absolute offset of jump.
374 374 // This is so that we do not need to recompute sizes of all nodes when
375 375 // we compute correct blk_starts in our next sizing pass.
376 376 jmp_offset[i] = 0;
377 377 jmp_size[i] = 0;
378 378 jmp_nidx[i] = -1;
379 379 DEBUG_ONLY( jmp_target[i] = 0; )
380 380 DEBUG_ONLY( jmp_rule[i] = 0; )
381 381
382 382 // Sum all instruction sizes to compute block size
383 383 uint last_inst = b->_nodes.size();
384 384 uint blk_size = 0;
385 385 for (uint j = 0; j < last_inst; j++) {
386 386 Node* nj = b->_nodes[j];
387 387 // Handle machine instruction nodes
388 388 if (nj->is_Mach()) {
389 389 MachNode *mach = nj->as_Mach();
390 390 blk_size += (mach->alignment_required() - 1) * relocInfo::addr_unit(); // assume worst case padding
391 391 reloc_size += mach->reloc();
392 392 if( mach->is_MachCall() ) {
393 393 MachCallNode *mcall = mach->as_MachCall();
394 394 // This destination address is NOT PC-relative
395 395
396 396 mcall->method_set((intptr_t)mcall->entry_point());
397 397
398 398 if( mcall->is_MachCallJava() && mcall->as_MachCallJava()->_method ) {
399 399 stub_size += size_java_to_interp();
400 400 reloc_size += reloc_java_to_interp();
401 401 }
402 402 } else if (mach->is_MachSafePoint()) {
403 403 // If call/safepoint are adjacent, account for possible
404 404 // nop to disambiguate the two safepoints.
405 405 // ScheduleAndBundle() can rearrange nodes in a block,
406 406 // check for all offsets inside this block.
407 407 if (last_call_adr >= blk_starts[i]) {
408 408 blk_size += nop_size;
409 409 }
410 410 }
411 411 if (mach->avoid_back_to_back()) {
412 412 // Nop is inserted between "avoid back to back" instructions.
413 413 // ScheduleAndBundle() can rearrange nodes in a block,
414 414 // check for all offsets inside this block.
415 415 if (last_avoid_back_to_back_adr >= blk_starts[i]) {
416 416 blk_size += nop_size;
417 417 }
418 418 }
419 419 if (mach->may_be_short_branch()) {
420 420 if (!nj->is_MachBranch()) {
421 421 #ifndef PRODUCT
422 422 nj->dump(3);
423 423 #endif
424 424 Unimplemented();
425 425 }
426 426 assert(jmp_nidx[i] == -1, "block should have only one branch");
427 427 jmp_offset[i] = blk_size;
428 428 jmp_size[i] = nj->size(_regalloc);
429 429 jmp_nidx[i] = j;
430 430 has_short_branch_candidate = true;
431 431 }
432 432 }
433 433 blk_size += nj->size(_regalloc);
434 434 // Remember end of call offset
435 435 if (nj->is_MachCall() && !nj->is_MachCallLeaf()) {
436 436 last_call_adr = blk_starts[i]+blk_size;
437 437 }
438 438 // Remember end of avoid_back_to_back offset
439 439 if (nj->is_Mach() && nj->as_Mach()->avoid_back_to_back()) {
440 440 last_avoid_back_to_back_adr = blk_starts[i]+blk_size;
441 441 }
442 442 }
443 443
444 444 // When the next block starts a loop, we may insert pad NOP
445 445 // instructions. Since we cannot know our future alignment,
446 446 // assume the worst.
447 447 if (i< nblocks-1) {
448 448 Block *nb = _cfg->_blocks[i+1];
449 449 int max_loop_pad = nb->code_alignment()-relocInfo::addr_unit();
450 450 if (max_loop_pad > 0) {
451 451 assert(is_power_of_2(max_loop_pad+relocInfo::addr_unit()), "");
452 452 // Adjust last_call_adr and/or last_avoid_back_to_back_adr.
453 453 // If either is the last instruction in this block, bump by
454 454 // max_loop_pad in lock-step with blk_size, so sizing
455 455 // calculations in subsequent blocks still can conservatively
456 456 // detect that it may be the last instruction in this block.
457 457 if (last_call_adr == blk_starts[i]+blk_size) {
458 458 last_call_adr += max_loop_pad;
459 459 }
460 460 if (last_avoid_back_to_back_adr == blk_starts[i]+blk_size) {
461 461 last_avoid_back_to_back_adr += max_loop_pad;
462 462 }
463 463 blk_size += max_loop_pad;
464 464 }
465 465 }
466 466
467 467 // Save block size; update total method size
468 468 blk_starts[i+1] = blk_starts[i]+blk_size;
469 469 }
470 470
471 471 // Step two, replace eligible long jumps.
472 472 bool progress = true;
473 473 uint last_may_be_short_branch_adr = max_uint;
474 474 while (has_short_branch_candidate && progress) {
475 475 progress = false;
476 476 has_short_branch_candidate = false;
477 477 int adjust_block_start = 0;
478 478 for (uint i = 0; i < nblocks; i++) {
479 479 Block *b = _cfg->_blocks[i];
480 480 int idx = jmp_nidx[i];
481 481 MachNode* mach = (idx == -1) ? NULL: b->_nodes[idx]->as_Mach();
482 482 if (mach != NULL && mach->may_be_short_branch()) {
483 483 #ifdef ASSERT
484 484 assert(jmp_size[i] > 0 && mach->is_MachBranch(), "sanity");
485 485 int j;
486 486 // Find the branch; ignore trailing NOPs.
487 487 for (j = b->_nodes.size()-1; j>=0; j--) {
488 488 Node* n = b->_nodes[j];
489 489 if (!n->is_Mach() || n->as_Mach()->ideal_Opcode() != Op_Con)
490 490 break;
491 491 }
492 492 assert(j >= 0 && j == idx && b->_nodes[j] == (Node*)mach, "sanity");
493 493 #endif
494 494 int br_size = jmp_size[i];
495 495 int br_offs = blk_starts[i] + jmp_offset[i];
496 496
497 497 // This requires the TRUE branch target be in succs[0]
498 498 uint bnum = b->non_connector_successor(0)->_pre_order;
499 499 int offset = blk_starts[bnum] - br_offs;
500 500 if (bnum > i) { // adjust following block's offset
501 501 offset -= adjust_block_start;
502 502 }
503 503 // In the following code a nop could be inserted before
504 504 // the branch which will increase the backward distance.
505 505 bool needs_padding = ((uint)br_offs == last_may_be_short_branch_adr);
506 506 if (needs_padding && offset <= 0)
507 507 offset -= nop_size;
508 508
509 509 if (_matcher->is_short_branch_offset(mach->rule(), br_size, offset)) {
510 510 // We've got a winner. Replace this branch.
511 511 MachNode* replacement = mach->as_MachBranch()->short_branch_version(this);
512 512
513 513 // Update the jmp_size.
514 514 int new_size = replacement->size(_regalloc);
515 515 int diff = br_size - new_size;
516 516 assert(diff >= (int)nop_size, "short_branch size should be smaller");
517 517 // Conservatively take into account padding between
518 518 // avoid_back_to_back branches. The previous branch could be
519 519 // converted into an avoid_back_to_back branch during later
520 520 // rounds.
521 521 if (needs_padding && replacement->avoid_back_to_back()) {
522 522 jmp_offset[i] += nop_size;
523 523 diff -= nop_size;
524 524 }
525 525 adjust_block_start += diff;
526 526 b->_nodes.map(idx, replacement);
527 527 mach->subsume_by(replacement, C);
528 528 mach = replacement;
529 529 progress = true;
530 530
531 531 jmp_size[i] = new_size;
532 532 DEBUG_ONLY( jmp_target[i] = bnum; );
533 533 DEBUG_ONLY( jmp_rule[i] = mach->rule(); );
534 534 } else {
535 535 // The jump distance is not short, try again during next iteration.
536 536 has_short_branch_candidate = true;
537 537 }
538 538 } // (mach->may_be_short_branch())
539 539 if (mach != NULL && (mach->may_be_short_branch() ||
540 540 mach->avoid_back_to_back())) {
541 541 last_may_be_short_branch_adr = blk_starts[i] + jmp_offset[i] + jmp_size[i];
542 542 }
543 543 blk_starts[i+1] -= adjust_block_start;
544 544 }
545 545 }
546 546
547 547 #ifdef ASSERT
548 548 for (uint i = 0; i < nblocks; i++) { // For all blocks
549 549 if (jmp_target[i] != 0) {
550 550 int br_size = jmp_size[i];
551 551 int offset = blk_starts[jmp_target[i]]-(blk_starts[i] + jmp_offset[i]);
552 552 if (!_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset)) {
553 553 tty->print_cr("target (%d) - jmp_offset(%d) = offset (%d), jump_size(%d), jmp_block B%d, target_block B%d", blk_starts[jmp_target[i]], blk_starts[i] + jmp_offset[i], offset, br_size, i, jmp_target[i]);
554 554 }
555 555 assert(_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset), "Displacement too large for short jmp");
556 556 }
557 557 }
558 558 #endif
559 559
560 560 // Step 3, compute the offsets of all blocks, will be done in fill_buffer()
561 561 // after ScheduleAndBundle().
562 562
563 563 // ------------------
564 564 // Compute size for code buffer
565 565 code_size = blk_starts[nblocks];
566 566
567 567 // Relocation records
568 568 reloc_size += 1; // Relo entry for exception handler
569 569
570 570 // Adjust reloc_size to the number of relocation info records
571 571 // Min is 2 bytes, max is probably 6 or 8, with a tax up to 25% for
572 572 // a relocation index.
573 573 // The CodeBuffer will expand the locs array if this estimate is too low.
574 574 reloc_size *= 10 / sizeof(relocInfo);
575 575 }
576 576
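
Step two of shorten_branches() above is a fixed-point iteration: shrinking one branch slides all later blocks up, which can pull other branches into short range, so the pass repeats until nothing changes. The toy program below models just that sliding logic; the block layout, branch offsets, +/-128 short range, and 5-byte/2-byte encodings are invented for illustration, and the real pass's nop/padding handling is omitted:

    #include <stdio.h>

    struct Branch { int target; int offset; int size; bool is_long; };

    static bool fits_short(int dist) { return dist >= -128 && dist <= 127; }

    int main() {
      enum { NBLOCKS = 3, SHORT_SIZE = 2 };
      int blk_starts[NBLOCKS + 1] = { 0, 60, 120, 180 };
      Branch br[NBLOCKS] = {
        { 2, 50, 5, true },   // B0 -> B2: forward branch, fits immediately
        { -1, 0, 0, false },  // B1: no branch candidate
        { 0, 10, 5, true },   // B2 -> B0: fits only after B0 shrinks
      };
      bool progress = true;
      while (progress) {
        progress = false;
        int adjust = 0;                             // bytes saved so far this pass
        for (int i = 0; i < NBLOCKS; i++) {
          if (br[i].target >= 0 && br[i].is_long) {
            int br_offs = blk_starts[i] + br[i].offset;
            int dist = blk_starts[br[i].target] - br_offs;
            if (br[i].target > i) dist -= adjust;   // forward target slides up too
            if (fits_short(dist)) {
              adjust += br[i].size - SHORT_SIZE;    // shrink long form to short
              br[i].size = SHORT_SIZE;
              br[i].is_long = false;
              progress = true;                      // may enable further shrinking
            }
          }
          blk_starts[i + 1] -= adjust;              // later blocks slide up
        }
      }
      for (int i = 0; i < NBLOCKS; i++)
        printf("B%d @ %d branch=%s\n", i, blk_starts[i],
               br[i].target < 0 ? "none" : (br[i].is_long ? "long" : "short"));
      return 0;
    }

Running it shows B2's backward branch coming into range only after B0's forward branch has been shrunk, which is why the real pass loops until progress stops.
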
577 577 //------------------------------FillLocArray-----------------------------------
578 578 // Create a bit of debug info and append it to the array. The mapping is from
579 579 // Java local or expression stack to constant, register or stack-slot. For
580 580 // doubles, insert 2 mappings and return 1 (to tell the caller that the next
581 581 // entry has been taken care of and caller should skip it).
582 582 static LocationValue *new_loc_value( PhaseRegAlloc *ra, OptoReg::Name regnum, Location::Type l_type ) {
583 583 // This should never have accepted Bad before
584 584 assert(OptoReg::is_valid(regnum), "location must be valid");
585 585 return (OptoReg::is_reg(regnum))
586 586 ? new LocationValue(Location::new_reg_loc(l_type, OptoReg::as_VMReg(regnum)) )
587 587 : new LocationValue(Location::new_stk_loc(l_type, ra->reg2offset(regnum)));
588 588 }
589 589
590 590
591 591 ObjectValue*
592 592 Compile::sv_for_node_id(GrowableArray<ScopeValue*> *objs, int id) {
593 593 for (int i = 0; i < objs->length(); i++) {
594 594 assert(objs->at(i)->is_object(), "corrupt object cache");
595 595 ObjectValue* sv = (ObjectValue*) objs->at(i);
596 596 if (sv->id() == id) {
597 597 return sv;
598 598 }
599 599 }
600 600 // Otherwise..
601 601 return NULL;
602 602 }
603 603
604 604 void Compile::set_sv_for_object_node(GrowableArray<ScopeValue*> *objs,
605 605 ObjectValue* sv ) {
606 606 assert(sv_for_node_id(objs, sv->id()) == NULL, "Precondition");
607 607 objs->append(sv);
608 608 }
609 609
610 610
611 611 void Compile::FillLocArray( int idx, MachSafePointNode* sfpt, Node *local,
612 612 GrowableArray<ScopeValue*> *array,
613 613 GrowableArray<ScopeValue*> *objs ) {
614 614 assert( local, "use _top instead of null" );
615 615 if (array->length() != idx) {
616 616 assert(array->length() == idx + 1, "Unexpected array count");
617 617 // Old functionality:
618 618 // return
619 619 // New functionality:
620 620 // Assert if the local is not top. In product mode let the new node
621 621 // override the old entry.
622 622 assert(local == top(), "LocArray collision");
623 623 if (local == top()) {
624 624 return;
625 625 }
626 626 array->pop();
627 627 }
628 628 const Type *t = local->bottom_type();
629 629
630 630 // Is it a safepoint scalar object node?
631 631 if (local->is_SafePointScalarObject()) {
632 632 SafePointScalarObjectNode* spobj = local->as_SafePointScalarObject();
633 633
634 634 ObjectValue* sv = Compile::sv_for_node_id(objs, spobj->_idx);
635 635 if (sv == NULL) {
636 636 ciKlass* cik = t->is_oopptr()->klass();
637 637 assert(cik->is_instance_klass() ||
638 638 cik->is_array_klass(), "Not supported allocation.");
639 639 sv = new ObjectValue(spobj->_idx,
640 640 new ConstantOopWriteValue(cik->constant_encoding()));
641 641 Compile::set_sv_for_object_node(objs, sv);
642 642
643 643 uint first_ind = spobj->first_index();
644 644 for (uint i = 0; i < spobj->n_fields(); i++) {
645 645 Node* fld_node = sfpt->in(first_ind+i);
646 646 (void)FillLocArray(sv->field_values()->length(), sfpt, fld_node, sv->field_values(), objs);
647 647 }
648 648 }
649 649 array->append(sv);
650 650 return;
651 651 }
652 652
653 653 // Grab the register number for the local
654 654 OptoReg::Name regnum = _regalloc->get_reg_first(local);
655 655 if( OptoReg::is_valid(regnum) ) {// Got a register/stack?
656 656 // Record the double as two float registers.
657 657 // The register mask for such a value always specifies two adjacent
658 658 // float registers, with the lower register number even.
659 659 // Normally, the allocation of high and low words to these registers
660 660 // is irrelevant, because nearly all operations on register pairs
661 661 // (e.g., StoreD) treat them as a single unit.
662 662 // Here, we assume in addition that the words in these two registers are
663 663 // stored "naturally" (by operations like StoreD and double stores
664 664 // within the interpreter) such that the lower-numbered register
665 665 // is written to the lower memory address. This may seem like
666 666 // a machine dependency, but it is not--it is a requirement on
667 667 // the author of the <arch>.ad file to ensure that, for every
668 668 // even/odd double-register pair to which a double may be allocated,
669 669 // the word in the even single-register is stored to the first
670 670 // memory word. (Note that register numbers are completely
671 671 // arbitrary, and are not tied to any machine-level encodings.)
672 672 #ifdef _LP64
673 673 if( t->base() == Type::DoubleBot || t->base() == Type::DoubleCon ) {
674 674 array->append(new ConstantIntValue(0));
675 675 array->append(new_loc_value( _regalloc, regnum, Location::dbl ));
676 676 } else if ( t->base() == Type::Long ) {
677 677 array->append(new ConstantIntValue(0));
678 678 array->append(new_loc_value( _regalloc, regnum, Location::lng ));
679 679 } else if ( t->base() == Type::RawPtr ) {
680 680 // jsr/ret return address which must be restored into the full
681 681 // width 64-bit stack slot.
682 682 array->append(new_loc_value( _regalloc, regnum, Location::lng ));
683 683 }
684 684 #else //_LP64
685 685 #ifdef SPARC
686 686 if (t->base() == Type::Long && OptoReg::is_reg(regnum)) {
687 687 // For SPARC we have to swap high and low words for
688 688 // long values stored in a single-register (g0-g7).
689 689 array->append(new_loc_value( _regalloc, regnum , Location::normal ));
690 690 array->append(new_loc_value( _regalloc, OptoReg::add(regnum,1), Location::normal ));
691 691 } else
692 692 #endif //SPARC
693 693 if( t->base() == Type::DoubleBot || t->base() == Type::DoubleCon || t->base() == Type::Long ) {
694 694 // Repack the double/long as two jints.
695 695 // The convention the interpreter uses is that the second local
696 696 // holds the first raw word of the native double representation.
697 697 // This is actually reasonable, since locals and stack arrays
698 698 // grow downwards in all implementations.
699 699 // (If, on some machine, the interpreter's Java locals or stack
700 700 // were to grow upwards, the embedded doubles would be word-swapped.)
701 701 array->append(new_loc_value( _regalloc, OptoReg::add(regnum,1), Location::normal ));
702 702 array->append(new_loc_value( _regalloc, regnum , Location::normal ));
703 703 }
704 704 #endif //_LP64
705 705 else if( (t->base() == Type::FloatBot || t->base() == Type::FloatCon) &&
706 706 OptoReg::is_reg(regnum) ) {
707 707 array->append(new_loc_value( _regalloc, regnum, Matcher::float_in_double()
708 708 ? Location::float_in_dbl : Location::normal ));
709 709 } else if( t->base() == Type::Int && OptoReg::is_reg(regnum) ) {
710 710 array->append(new_loc_value( _regalloc, regnum, Matcher::int_in_long
711 711 ? Location::int_in_long : Location::normal ));
712 712 } else if( t->base() == Type::NarrowOop ) {
713 713 array->append(new_loc_value( _regalloc, regnum, Location::narrowoop ));
714 714 } else {
715 715 array->append(new_loc_value( _regalloc, regnum, _regalloc->is_oop(local) ? Location::oop : Location::normal ));
716 716 }
717 717 return;
718 718 }
719 719
720 720 // No register. It must be constant data.
721 721 switch (t->base()) {
722 722 case Type::Half: // Second half of a double
723 723 ShouldNotReachHere(); // Caller should skip 2nd halves
724 724 break;
725 725 case Type::AnyPtr:
726 726 array->append(new ConstantOopWriteValue(NULL));
727 727 break;
728 728 case Type::AryPtr:
729 729 case Type::InstPtr:
730 730 case Type::KlassPtr: // fall through
731 731 array->append(new ConstantOopWriteValue(t->isa_oopptr()->const_oop()->constant_encoding()));
732 732 break;
733 733 case Type::NarrowOop:
734 734 if (t == TypeNarrowOop::NULL_PTR) {
735 735 array->append(new ConstantOopWriteValue(NULL));
736 736 } else {
737 737 array->append(new ConstantOopWriteValue(t->make_ptr()->isa_oopptr()->const_oop()->constant_encoding()));
738 738 }
739 739 break;
740 740 case Type::Int:
741 741 array->append(new ConstantIntValue(t->is_int()->get_con()));
742 742 break;
743 743 case Type::RawPtr:
744 744 // A return address (T_ADDRESS).
745 745 assert((intptr_t)t->is_ptr()->get_con() < (intptr_t)0x10000, "must be a valid BCI");
746 746 #ifdef _LP64
747 747 // Must be restored to the full-width 64-bit stack slot.
748 748 array->append(new ConstantLongValue(t->is_ptr()->get_con()));
749 749 #else
750 750 array->append(new ConstantIntValue(t->is_ptr()->get_con()));
751 751 #endif
752 752 break;
753 753 case Type::FloatCon: {
754 754 float f = t->is_float_constant()->getf();
755 755 array->append(new ConstantIntValue(jint_cast(f)));
756 756 break;
757 757 }
758 758 case Type::DoubleCon: {
759 759 jdouble d = t->is_double_constant()->getd();
760 760 #ifdef _LP64
761 761 array->append(new ConstantIntValue(0));
762 762 array->append(new ConstantDoubleValue(d));
763 763 #else
764 764 // Repack the double as two jints.
765 765 // The convention the interpreter uses is that the second local
766 766 // holds the first raw word of the native double representation.
767 767 // This is actually reasonable, since locals and stack arrays
768 768 // grow downwards in all implementations.
769 769 // (If, on some machine, the interpreter's Java locals or stack
770 770 // were to grow upwards, the embedded doubles would be word-swapped.)
771 771 jint *dp = (jint*)&d;
772 772 array->append(new ConstantIntValue(dp[1]));
773 773 array->append(new ConstantIntValue(dp[0]));
774 774 #endif
775 775 break;
776 776 }
777 777 case Type::Long: {
778 778 jlong d = t->is_long()->get_con();
779 779 #ifdef _LP64
780 780 array->append(new ConstantIntValue(0));
781 781 array->append(new ConstantLongValue(d));
782 782 #else
783 783 // Repack the long as two jints.
784 784 // The convention the interpreter uses is that the second local
785 785 // holds the first raw word of the native long representation.
786 786 // This is actually reasonable, since locals and stack arrays
787 787 // grow downwards in all implementations.
788 788 // (If, on some machine, the interpreter's Java locals or stack
789 789 // were to grow upwards, the embedded doubles would be word-swapped.)
790 790 jint *dp = (jint*)&d;
791 791 array->append(new ConstantIntValue(dp[1]));
792 792 array->append(new ConstantIntValue(dp[0]));
793 793 #endif
794 794 break;
795 795 }
796 796 case Type::Top: // Add an illegal value here
797 797 array->append(new LocationValue(Location()));
798 798 break;
799 799 default:
800 800 ShouldNotReachHere();
801 801 break;
802 802 }
803 803 }
804 804
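
On 32-bit targets, FillLocArray() above records longs and doubles as two jints, appending the second raw word first so that the second local slot holds the first raw word, per the interpreter convention described in the comments. A tiny standalone snippet (value arbitrary) that mirrors the jint repacking makes the split visible:

    #include <stdio.h>
    #include <stdint.h>

    int main() {
      double d = 2.5;
      // Same trick as 'jint *dp = (jint*)&d' in FillLocArray: view the
      // 64-bit double as two raw 32-bit words.
      int32_t* dp = (int32_t*)&d;
      printf("appended first : 0x%08x (dp[1])\n", (unsigned)dp[1]);
      printf("appended second: 0x%08x (dp[0])\n", (unsigned)dp[0]);
      return 0;
    }
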
805 805 // Determine if this node starts a bundle
806 806 bool Compile::starts_bundle(const Node *n) const {
807 807 return (_node_bundling_limit > n->_idx &&
808 808 _node_bundling_base[n->_idx].starts_bundle());
809 809 }
810 810
811 811 //--------------------------Process_OopMap_Node--------------------------------
812 812 void Compile::Process_OopMap_Node(MachNode *mach, int current_offset) {
813 813
814 814 // Handle special safepoint nodes for synchronization
815 815 MachSafePointNode *sfn = mach->as_MachSafePoint();
816 816 MachCallNode *mcall;
817 817
818 818 #ifdef ENABLE_ZAP_DEAD_LOCALS
819 819 assert( is_node_getting_a_safepoint(mach), "logic does not match; false negative");
820 820 #endif
821 821
822 822 int safepoint_pc_offset = current_offset;
823 823 bool is_method_handle_invoke = false;
824 824 bool return_oop = false;
825 825
826 826 // Add the safepoint in the DebugInfoRecorder
827 827 if( !mach->is_MachCall() ) {
828 828 mcall = NULL;
829 829 debug_info()->add_safepoint(safepoint_pc_offset, sfn->_oop_map);
830 830 } else {
831 831 mcall = mach->as_MachCall();
832 832
833 833 // Is the call a MethodHandle call?
834 834 if (mcall->is_MachCallJava()) {
835 835 if (mcall->as_MachCallJava()->_method_handle_invoke) {
836 836 assert(has_method_handle_invokes(), "must have been set during call generation");
837 837 is_method_handle_invoke = true;
838 838 }
839 839 }
840 840
841 841 // Check if a call returns an object.
842 842 if (mcall->return_value_is_used() &&
843 843 mcall->tf()->range()->field_at(TypeFunc::Parms)->isa_ptr()) {
844 844 return_oop = true;
845 845 }
846 846 safepoint_pc_offset += mcall->ret_addr_offset();
847 847 debug_info()->add_safepoint(safepoint_pc_offset, mcall->_oop_map);
848 848 }
849 849
850 850 // Loop over the JVMState list to add scope information
851 851 // Do not skip safepoints with a NULL method, they need monitor info
852 852 JVMState* youngest_jvms = sfn->jvms();
853 853 int max_depth = youngest_jvms->depth();
854 854
855 855 // Allocate the object pool for scalar-replaced objects -- the map from
856 856 // small-integer keys (which can be recorded in the local and ostack
857 857 // arrays) to descriptions of the object state.
858 858 GrowableArray<ScopeValue*> *objs = new GrowableArray<ScopeValue*>();
859 859
860 860 // Visit scopes from oldest to youngest.
861 861 for (int depth = 1; depth <= max_depth; depth++) {
862 862 JVMState* jvms = youngest_jvms->of_depth(depth);
863 863 int idx;
864 864 ciMethod* method = jvms->has_method() ? jvms->method() : NULL;
865 865 // Safepoints that do not have method() set only provide oop-map and monitor info
866 866 // to support GC; these do not support deoptimization.
867 867 int num_locs = (method == NULL) ? 0 : jvms->loc_size();
868 868 int num_exps = (method == NULL) ? 0 : jvms->stk_size();
869 869 int num_mon = jvms->nof_monitors();
870 870 assert(method == NULL || jvms->bci() < 0 || num_locs == method->max_locals(),
871 871 "JVMS local count must match that of the method");
872 872
873 873 // Add Local and Expression Stack Information
874 874
875 875 // Insert locals into the locarray
876 876 GrowableArray<ScopeValue*> *locarray = new GrowableArray<ScopeValue*>(num_locs);
877 877 for( idx = 0; idx < num_locs; idx++ ) {
878 878 FillLocArray( idx, sfn, sfn->local(jvms, idx), locarray, objs );
879 879 }
880 880
881 881 // Insert expression stack entries into the exparray
882 882 GrowableArray<ScopeValue*> *exparray = new GrowableArray<ScopeValue*>(num_exps);
883 883 for( idx = 0; idx < num_exps; idx++ ) {
884 884 FillLocArray( idx, sfn, sfn->stack(jvms, idx), exparray, objs );
885 885 }
886 886
887 887 // Add in mappings of the monitors
888 888 assert( !method ||
889 889 !method->is_synchronized() ||
890 890 method->is_native() ||
891 891 num_mon > 0 ||
892 892 !GenerateSynchronizationCode,
893 893 "monitors must always exist for synchronized methods");
894 894
895 895 // Build the growable array of MonitorValues for the monitors
896 896 GrowableArray<MonitorValue*> *monarray = new GrowableArray<MonitorValue*>(num_mon);
897 897
898 898 // Loop over monitors and insert into array
899 899 for(idx = 0; idx < num_mon; idx++) {
900 900 // Grab the node that defines this monitor
901 901 Node* box_node = sfn->monitor_box(jvms, idx);
902 902 Node* obj_node = sfn->monitor_obj(jvms, idx);
903 903
904 904 // Create ScopeValue for object
905 905 ScopeValue *scval = NULL;
906 906
907 907 if( obj_node->is_SafePointScalarObject() ) {
908 908 SafePointScalarObjectNode* spobj = obj_node->as_SafePointScalarObject();
909 909 scval = Compile::sv_for_node_id(objs, spobj->_idx);
910 910 if (scval == NULL) {
911 911 const Type *t = obj_node->bottom_type();
912 912 ciKlass* cik = t->is_oopptr()->klass();
913 913 assert(cik->is_instance_klass() ||
914 914 cik->is_array_klass(), "Not supported allocation.");
915 915 ObjectValue* sv = new ObjectValue(spobj->_idx,
916 916 new ConstantOopWriteValue(cik->constant_encoding()));
917 917 Compile::set_sv_for_object_node(objs, sv);
918 918
919 919 uint first_ind = spobj->first_index();
920 920 for (uint i = 0; i < spobj->n_fields(); i++) {
921 921 Node* fld_node = sfn->in(first_ind+i);
922 922 (void)FillLocArray(sv->field_values()->length(), sfn, fld_node, sv->field_values(), objs);
923 923 }
923 923 }
924 924 scval = sv;
925 925 }
926 926 } else if( !obj_node->is_Con() ) {
927 927 OptoReg::Name obj_reg = _regalloc->get_reg_first(obj_node);
928 928 if( obj_node->bottom_type()->base() == Type::NarrowOop ) {
929 929 scval = new_loc_value( _regalloc, obj_reg, Location::narrowoop );
930 930 } else {
931 931 scval = new_loc_value( _regalloc, obj_reg, Location::oop );
932 932 }
933 933 } else {
934 - const TypePtr *tp = obj_node->bottom_type()->make_ptr();
934 + const TypePtr *tp = obj_node->get_ptr_type();
935 935 scval = new ConstantOopWriteValue(tp->is_oopptr()->const_oop()->constant_encoding());
936 936 }
937 937
938 938 OptoReg::Name box_reg = BoxLockNode::reg(box_node);
939 939 Location basic_lock = Location::new_stk_loc(Location::normal,_regalloc->reg2offset(box_reg));
940 940 bool eliminated = (box_node->is_BoxLock() && box_node->as_BoxLock()->is_eliminated());
941 941 monarray->append(new MonitorValue(scval, basic_lock, eliminated));
942 942 }
943 943
944 944 // We dump the object pool first, since deoptimization reads it in first.
945 945 debug_info()->dump_object_pool(objs);
946 946
947 947 // Build first class objects to pass to scope
948 948 DebugToken *locvals = debug_info()->create_scope_values(locarray);
949 949 DebugToken *expvals = debug_info()->create_scope_values(exparray);
950 950 DebugToken *monvals = debug_info()->create_monitor_values(monarray);
951 951
952 952 // Make method available for all Safepoints
953 953 ciMethod* scope_method = method ? method : _method;
954 954 // Describe the scope here
955 955 assert(jvms->bci() >= InvocationEntryBci && jvms->bci() <= 0x10000, "must be a valid or entry BCI");
956 956 assert(!jvms->should_reexecute() || depth == max_depth, "reexecute allowed only for the youngest");
957 957 // Now we can describe the scope.
958 958 debug_info()->describe_scope(safepoint_pc_offset, scope_method, jvms->bci(), jvms->should_reexecute(), is_method_handle_invoke, return_oop, locvals, expvals, monvals);
959 959 } // End jvms loop
960 960
961 961 // Mark the end of the scope set.
962 962 debug_info()->end_safepoint(safepoint_pc_offset);
963 963 }
964 964
965 965
966 966
967 967 // A simplified version of Process_OopMap_Node, to handle non-safepoints.
968 968 class NonSafepointEmitter {
969 969 Compile* C;
970 970 JVMState* _pending_jvms;
971 971 int _pending_offset;
972 972
973 973 void emit_non_safepoint();
974 974
975 975 public:
976 976 NonSafepointEmitter(Compile* compile) {
977 977 this->C = compile;
978 978 _pending_jvms = NULL;
979 979 _pending_offset = 0;
980 980 }
981 981
982 982 void observe_instruction(Node* n, int pc_offset) {
983 983 if (!C->debug_info()->recording_non_safepoints()) return;
984 984
985 985 Node_Notes* nn = C->node_notes_at(n->_idx);
986 986 if (nn == NULL || nn->jvms() == NULL) return;
987 987 if (_pending_jvms != NULL &&
988 988 _pending_jvms->same_calls_as(nn->jvms())) {
989 989 // Repeated JVMS? Stretch it up here.
990 990 _pending_offset = pc_offset;
991 991 } else {
992 992 if (_pending_jvms != NULL &&
993 993 _pending_offset < pc_offset) {
994 994 emit_non_safepoint();
995 995 }
996 996 _pending_jvms = NULL;
997 997 if (pc_offset > C->debug_info()->last_pc_offset()) {
998 998 // This is the only way _pending_jvms can become non-NULL:
999 999 _pending_jvms = nn->jvms();
1000 1000 _pending_offset = pc_offset;
1001 1001 }
1002 1002 }
1003 1003 }
1004 1004
1005 1005 // Stay out of the way of real safepoints:
1006 1006 void observe_safepoint(JVMState* jvms, int pc_offset) {
1007 1007 if (_pending_jvms != NULL &&
1008 1008 !_pending_jvms->same_calls_as(jvms) &&
1009 1009 _pending_offset < pc_offset) {
1010 1010 emit_non_safepoint();
1011 1011 }
1012 1012 _pending_jvms = NULL;
1013 1013 }
1014 1014
1015 1015 void flush_at_end() {
1016 1016 if (_pending_jvms != NULL) {
1017 1017 emit_non_safepoint();
1018 1018 }
1019 1019 _pending_jvms = NULL;
1020 1020 }
1021 1021 };
1022 1022
1023 1023 void NonSafepointEmitter::emit_non_safepoint() {
1024 1024 JVMState* youngest_jvms = _pending_jvms;
1025 1025 int pc_offset = _pending_offset;
1026 1026
1027 1027 // Clear it now:
1028 1028 _pending_jvms = NULL;
1029 1029
1030 1030 DebugInformationRecorder* debug_info = C->debug_info();
1031 1031 assert(debug_info->recording_non_safepoints(), "sanity");
1032 1032
1033 1033 debug_info->add_non_safepoint(pc_offset);
1034 1034 int max_depth = youngest_jvms->depth();
1035 1035
1036 1036 // Visit scopes from oldest to youngest.
1037 1037 for (int depth = 1; depth <= max_depth; depth++) {
1038 1038 JVMState* jvms = youngest_jvms->of_depth(depth);
1039 1039 ciMethod* method = jvms->has_method() ? jvms->method() : NULL;
1040 1040 assert(!jvms->should_reexecute() || depth==max_depth, "reexecute allowed only for the youngest");
1041 1041 debug_info->describe_scope(pc_offset, method, jvms->bci(), jvms->should_reexecute());
1042 1042 }
1043 1043
1044 1044 // Mark the end of the scope set.
1045 1045 debug_info->end_non_safepoint(pc_offset);
1046 1046 }
1047 1047
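
NonSafepointEmitter above coalesces consecutive instructions that share one JVMState into a single non-safepoint record. A hedged standalone model of its observe/flush protocol, with integer states and pc offsets standing in for JVMState and DebugInformationRecorder (the real class additionally checks last_pc_offset and compares the pending state against a safepoint's own JVMState, both omitted here):

    #include <stdio.h>

    struct Emitter {
      int pending_state;   // -1 means nothing pending
      int pending_offset;
      void emit() { printf("non-safepoint: state %d ends at pc %d\n",
                           pending_state, pending_offset); }
      void observe_instruction(int state, int pc) {
        if (state == pending_state) { pending_offset = pc; return; } // stretch
        if (pending_state >= 0 && pending_offset < pc) emit();       // flush old
        pending_state = state; pending_offset = pc;                  // start new
      }
      void observe_safepoint(int pc) {
        if (pending_state >= 0 && pending_offset < pc) emit();
        pending_state = -1;   // the real safepoint record takes over from here
      }
      void flush_at_end() { if (pending_state >= 0) emit(); pending_state = -1; }
    };

    int main() {
      Emitter e = { -1, 0 };
      e.observe_instruction(7, 4);   // opens a pending record for state 7
      e.observe_instruction(7, 12);  // same state: stretch it to pc 12
      e.observe_safepoint(16);       // flushes "state 7 ends at pc 12"
      e.observe_instruction(9, 20);
      e.flush_at_end();              // flushes "state 9 ends at pc 20"
      return 0;
    }
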
1048 1048
1049 1049
1050 1050 // helper for fill_buffer bailout logic
1051 1051 static void turn_off_compiler(Compile* C) {
1052 1052 if (CodeCache::largest_free_block() >= CodeCacheMinimumFreeSpace*10) {
1053 1053 // Do not turn off compilation if a single giant method has
1054 1054 // blown the code cache size.
1055 1055 C->record_failure("excessive request to CodeCache");
1056 1056 } else {
1057 1057 // Let CompileBroker disable further compilations.
1058 1058 C->record_failure("CodeCache is full");
1059 1059 }
1060 1060 }
1061 1061
1062 1062
1063 1063 //------------------------------init_buffer------------------------------------
1064 1064 CodeBuffer* Compile::init_buffer(uint* blk_starts) {
1065 1065
1066 1066 // Set the initially allocated size
1067 1067 int code_req = initial_code_capacity;
1068 1068 int locs_req = initial_locs_capacity;
1069 1069 int stub_req = TraceJumps ? initial_stub_capacity * 10 : initial_stub_capacity;
1070 1070 int const_req = initial_const_capacity;
1071 1071
1072 1072 int pad_req = NativeCall::instruction_size;
1073 1073 // The extra spacing after the code is necessary on some platforms.
1074 1074 // Sometimes we need to patch in a jump after the last instruction,
1075 1075 // if the nmethod has been deoptimized. (See 4932387, 4894843.)
1076 1076
1077 1077 // Compute the byte offset where we can store the deopt pc.
1078 1078 if (fixed_slots() != 0) {
1079 1079 _orig_pc_slot_offset_in_bytes = _regalloc->reg2offset(OptoReg::stack2reg(_orig_pc_slot));
1080 1080 }
1081 1081
1082 1082 // Compute prolog code size
1083 1083 _method_size = 0;
1084 1084 _frame_slots = OptoReg::reg2stack(_matcher->_old_SP)+_regalloc->_framesize;
1085 1085 #ifdef IA64
1086 1086 if (save_argument_registers()) {
1087 1087 // 4815101: this is a stub with implicit and unknown precision fp args.
1088 1088 // The usual spill mechanism can only generate stfd's in this case, which
1089 1089 // doesn't work if the fp reg to spill contains a single-precision denorm.
1090 1090 // Instead, we hack around the normal spill mechanism using stfspill's and
1091 1091 // ldffill's in the MachProlog and MachEpilog emit methods. We allocate
1092 1092 // space here for the fp arg regs (f8-f15) we're going to thusly spill.
1093 1093 //
1094 1094 // If we ever implement 16-byte 'registers' == stack slots, we can
1095 1095 // get rid of this hack and have SpillCopy generate stfspill/ldffill
1096 1096 // instead of stfd/stfs/ldfd/ldfs.
1097 1097 _frame_slots += 8*(16/BytesPerInt);
1098 1098 }
1099 1099 #endif
1100 1100 assert(_frame_slots >= 0 && _frame_slots < 1000000, "sanity check");
1101 1101
1102 1102 if (has_mach_constant_base_node()) {
1103 1103 // Fill the constant table.
1104 1104 // Note: This must happen before shorten_branches.
1105 1105 for (uint i = 0; i < _cfg->_num_blocks; i++) {
1106 1106 Block* b = _cfg->_blocks[i];
1107 1107
1108 1108 for (uint j = 0; j < b->_nodes.size(); j++) {
1109 1109 Node* n = b->_nodes[j];
1110 1110
1111 1111 // If the node is a MachConstantNode evaluate the constant
1112 1112 // value section.
1113 1113 if (n->is_MachConstant()) {
1114 1114 MachConstantNode* machcon = n->as_MachConstant();
1115 1115 machcon->eval_constant(C);
1116 1116 }
1117 1117 }
1118 1118 }
1119 1119
1120 1120 // Calculate the offsets of the constants and the size of the
1121 1121 // constant table (including the padding to the next section).
1122 1122 constant_table().calculate_offsets_and_size();
1123 1123 const_req = constant_table().size();
1124 1124 }
1125 1125
1126 1126 // Initialize the space for the BufferBlob used to find and verify
1127 1127 // instruction size in MachNode::emit_size()
1128 1128 init_scratch_buffer_blob(const_req);
1129 1129 if (failing()) return NULL; // Out of memory
1130 1130
1131 1131 // Pre-compute the length of blocks and replace
1132 1132 // long branches with short if machine supports it.
1133 1133 shorten_branches(blk_starts, code_req, locs_req, stub_req);
1134 1134
1135 1135 // nmethod and CodeBuffer count stubs & constants as part of method's code.
1136 1136 int exception_handler_req = size_exception_handler();
1137 1137 int deopt_handler_req = size_deopt_handler();
1138 1138 exception_handler_req += MAX_stubs_size; // add marginal slop for handler
1139 1139 deopt_handler_req += MAX_stubs_size; // add marginal slop for handler
1140 1140 stub_req += MAX_stubs_size; // ensure per-stub margin
1141 1141 code_req += MAX_inst_size; // ensure per-instruction margin
1142 1142
1143 1143 if (StressCodeBuffers)
1144 1144 code_req = const_req = stub_req = exception_handler_req = deopt_handler_req = 0x10; // force expansion
1145 1145
1146 1146 int total_req =
1147 1147 const_req +
1148 1148 code_req +
1149 1149 pad_req +
1150 1150 stub_req +
1151 1151 exception_handler_req +
1152 1152 deopt_handler_req; // deopt handler
1153 1153
1154 1154 if (has_method_handle_invokes())
1155 1155 total_req += deopt_handler_req; // deopt MH handler
1156 1156
1157 1157 CodeBuffer* cb = code_buffer();
1158 1158 cb->initialize(total_req, locs_req);
1159 1159
1160 1160 // Have we run out of code space?
1161 1161 if ((cb->blob() == NULL) || (!CompileBroker::should_compile_new_jobs())) {
1162 1162 turn_off_compiler(this);
1163 1163 return NULL;
1164 1164 }
1165 1165 // Configure the code buffer.
1166 1166 cb->initialize_consts_size(const_req);
1167 1167 cb->initialize_stubs_size(stub_req);
1168 1168 cb->initialize_oop_recorder(env()->oop_recorder());
1169 1169
1170 1170 // fill in the nop array for bundling computations
1171 1171 MachNode *_nop_list[Bundle::_nop_count];
1172 1172 Bundle::initialize_nops(_nop_list, this);
1173 1173
1174 1174 return cb;
1175 1175 }
1176 1176
1177 1177 //------------------------------fill_buffer------------------------------------
1178 1178 void Compile::fill_buffer(CodeBuffer* cb, uint* blk_starts) {
1179 1179 // blk_starts[] contains offsets calculated during short branches processing,
1180 1180 // offsets should not be increased during following steps.
1181 1181
1182 1182 // Compute the size of first NumberOfLoopInstrToAlign instructions at head
1183 1183 // of a loop. It is used to determine the padding for loop alignment.
1184 1184 compute_loop_first_inst_sizes();
1185 1185
1186 1186 // Create oopmap set.
1187 1187 _oop_map_set = new OopMapSet();
1188 1188
1189 1189 // !!!!! This preserves old handling of oopmaps for now
1190 1190 debug_info()->set_oopmaps(_oop_map_set);
1191 1191
1192 1192 uint nblocks = _cfg->_num_blocks;
1193 1193 // Count and start of implicit null check instructions
1194 1194 uint inct_cnt = 0;
1195 1195 uint *inct_starts = NEW_RESOURCE_ARRAY(uint, nblocks+1);
1196 1196
1197 1197 // Count and start of calls
1198 1198 uint *call_returns = NEW_RESOURCE_ARRAY(uint, nblocks+1);
1199 1199
1200 1200 uint return_offset = 0;
1201 1201 int nop_size = (new (this) MachNopNode())->size(_regalloc);
1202 1202
1203 1203 int previous_offset = 0;
1204 1204 int current_offset = 0;
1205 1205 int last_call_offset = -1;
1206 1206 int last_avoid_back_to_back_offset = -1;
1207 1207 #ifdef ASSERT
1208 1208 uint* jmp_target = NEW_RESOURCE_ARRAY(uint,nblocks);
1209 1209 uint* jmp_offset = NEW_RESOURCE_ARRAY(uint,nblocks);
1210 1210 uint* jmp_size = NEW_RESOURCE_ARRAY(uint,nblocks);
1211 1211 uint* jmp_rule = NEW_RESOURCE_ARRAY(uint,nblocks);
1212 1212 #endif
1213 1213
1214 1214 // Create an array of node offsets, one for each node, if printing is enabled
1215 1215 #ifndef PRODUCT
1216 1216 int *node_offsets = NULL;
1217 1217 uint node_offset_limit = unique();
1218 1218
1219 1219 if (print_assembly())
1220 1220 node_offsets = NEW_RESOURCE_ARRAY(int, node_offset_limit);
1221 1221 #endif
1222 1222
1223 1223 NonSafepointEmitter non_safepoints(this); // emit non-safepoints lazily
1224 1224
1225 1225 // Emit the constant table.
1226 1226 if (has_mach_constant_base_node()) {
1227 1227 constant_table().emit(*cb);
1228 1228 }
1229 1229
1230 1230 // Create an array of labels, one for each basic block
1231 1231 Label *blk_labels = NEW_RESOURCE_ARRAY(Label, nblocks+1);
1232 1232 for (uint i=0; i <= nblocks; i++) {
1233 1233 blk_labels[i].init();
1234 1234 }
1235 1235
1236 1236 // ------------------
1237 1237 // Now fill in the code buffer
1238 1238 Node *delay_slot = NULL;
1239 1239
1240 1240 for (uint i=0; i < nblocks; i++) {
1241 1241 Block *b = _cfg->_blocks[i];
1242 1242
1243 1243 Node *head = b->head();
1244 1244
1245 1245 // If this block needs to start aligned (i.e., can be reached other
1246 1246 // than by falling-thru from the previous block), then force the
1247 1247 // start of a new bundle.
1248 1248 if (Pipeline::requires_bundling() && starts_bundle(head))
1249 1249 cb->flush_bundle(true);
1250 1250
1251 1251 #ifdef ASSERT
1252 1252 if (!b->is_connector()) {
1253 1253 stringStream st;
1254 1254 b->dump_head(&_cfg->_bbs, &st);
1255 1255 MacroAssembler(cb).block_comment(st.as_string());
1256 1256 }
1257 1257 jmp_target[i] = 0;
1258 1258 jmp_offset[i] = 0;
1259 1259 jmp_size[i] = 0;
1260 1260 jmp_rule[i] = 0;
1261 1261 #endif
1262 1262 int blk_offset = current_offset;
1263 1263
1264 1264 // Define the label at the beginning of the basic block
1265 1265 MacroAssembler(cb).bind(blk_labels[b->_pre_order]);
1266 1266
1267 1267 uint last_inst = b->_nodes.size();
1268 1268
1269 1269 // Emit block normally, except for last instruction.
1270 1270 // Emit means "dump code bits into code buffer".
1271 1271 for (uint j = 0; j<last_inst; j++) {
1272 1272
1273 1273 // Get the node
1274 1274 Node* n = b->_nodes[j];
1275 1275
1276 1276 // See if delay slots are supported
1277 1277 if (valid_bundle_info(n) &&
1278 1278 node_bundling(n)->used_in_unconditional_delay()) {
1279 1279 assert(delay_slot == NULL, "no use of delay slot node");
1280 1280 assert(n->size(_regalloc) == Pipeline::instr_unit_size(), "delay slot instruction wrong size");
1281 1281
1282 1282 delay_slot = n;
1283 1283 continue;
1284 1284 }
1285 1285
1286 1286 // If this starts a new instruction group, then flush the current one
1287 1287 // (but allow split bundles)
1288 1288 if (Pipeline::requires_bundling() && starts_bundle(n))
1289 1289 cb->flush_bundle(false);
1290 1290
1291 1291 // The following logic is duplicated in the code ifdeffed for
1292 1292 // ENABLE_ZAP_DEAD_LOCALS which appears above in this file. It
1293 1293 // should be factored out. Or maybe dispersed to the nodes?
1294 1294
1295 1295 // Special handling for SafePoint/Call Nodes
1296 1296 bool is_mcall = false;
1297 1297 if (n->is_Mach()) {
1298 1298 MachNode *mach = n->as_Mach();
1299 1299 is_mcall = n->is_MachCall();
1300 1300 bool is_sfn = n->is_MachSafePoint();
1301 1301
1302 1302 // If this requires all previous instructions be flushed, then do so
1303 1303 if (is_sfn || is_mcall || mach->alignment_required() != 1) {
1304 1304 cb->flush_bundle(true);
1305 1305 current_offset = cb->insts_size();
1306 1306 }
1307 1307
1308 1308 // Padding may be needed again since a previous instruction
1309 1309 // could be moved into the delay slot.
1310 1310
1311 1311 // align the instruction if necessary
1312 1312 int padding = mach->compute_padding(current_offset);
1313 1313 // Make sure safepoint node for polling is distinct from a call's
1314 1314 // return by adding a nop if needed.
1315 1315 if (is_sfn && !is_mcall && padding == 0 && current_offset == last_call_offset) {
1316 1316 padding = nop_size;
1317 1317 }
1318 1318 if (padding == 0 && mach->avoid_back_to_back() &&
1319 1319 current_offset == last_avoid_back_to_back_offset) {
1320 1320         // Avoid back-to-back instructions.
1321 1321 padding = nop_size;
1322 1322 }
1323 1323
1324 1324       if (padding > 0) {
1325 1325 assert((padding % nop_size) == 0, "padding is not a multiple of NOP size");
1326 1326 int nops_cnt = padding / nop_size;
1327 1327 MachNode *nop = new (this) MachNopNode(nops_cnt);
1328 1328 b->_nodes.insert(j++, nop);
1329 1329 last_inst++;
1330 1330 _cfg->_bbs.map( nop->_idx, b );
1331 1331 nop->emit(*cb, _regalloc);
1332 1332 cb->flush_bundle(true);
1333 1333 current_offset = cb->insts_size();
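                  // (The nop was also inserted into b->_nodes, with j and last_inst
                  // bumped, so the block's node list matches the code just emitted.)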
1334 1334 }
1335 1335
1336 1336 // Remember the start of the last call in a basic block
1337 1337 if (is_mcall) {
1338 1338 MachCallNode *mcall = mach->as_MachCall();
1339 1339
1340 1340 // This destination address is NOT PC-relative
1341 1341 mcall->method_set((intptr_t)mcall->entry_point());
1342 1342
1343 1343 // Save the return address
1344 1344 call_returns[b->_pre_order] = current_offset + mcall->ret_addr_offset();
1345 1345
1346 1346 if (mcall->is_MachCallLeaf()) {
1347 1347 is_mcall = false;
1348 1348 is_sfn = false;
1349 1349 }
1350 1350 }
1351 1351
1352 1352       // is_sfn will be true whenever is_mcall is true because MachCallNode inherits from MachSafePointNode
1353 1353 if (is_sfn || is_mcall) {
1354 1354
1355 1355 // Handle special safepoint nodes for synchronization
1356 1356 if (!is_mcall) {
1357 1357 MachSafePointNode *sfn = mach->as_MachSafePoint();
1358 1358 // !!!!! Stubs only need an oopmap right now, so bail out
1359 1359 if (sfn->jvms()->method() == NULL) {
1360 1360 // Write the oopmap directly to the code blob??!!
1361 1361 # ifdef ENABLE_ZAP_DEAD_LOCALS
1362 1362 assert( !is_node_getting_a_safepoint(sfn), "logic does not match; false positive");
1363 1363 # endif
1364 1364 continue;
1365 1365 }
1366 1366 } // End synchronization
1367 1367
1368 1368 non_safepoints.observe_safepoint(mach->as_MachSafePoint()->jvms(),
1369 1369 current_offset);
1370 1370 Process_OopMap_Node(mach, current_offset);
1371 1371 } // End if safepoint
1372 1372
1373 1373 // If this is a null check, then add the start of the previous instruction to the list
1374 1374 else if( mach->is_MachNullCheck() ) {
1375 1375 inct_starts[inct_cnt++] = previous_offset;
1376 1376 }
1377 1377
1378 1378 // If this is a branch, then fill in the label with the target BB's label
1379 1379 else if (mach->is_MachBranch()) {
1380 1380 // This requires the TRUE branch target be in succs[0]
1381 1381 uint block_num = b->non_connector_successor(0)->_pre_order;
1382 1382
1383 1383           // Try to replace a long branch if the delay slot is not used;
1384 1384           // this mostly applies to back branches, since a forward branch's
1385 1385           // distance is not finalized yet.
1386 1386 bool delay_slot_is_used = valid_bundle_info(n) &&
1387 1387 node_bundling(n)->use_unconditional_delay();
1388 1388 if (!delay_slot_is_used && mach->may_be_short_branch()) {
1389 1389 assert(delay_slot == NULL, "not expecting delay slot node");
1390 1390 int br_size = n->size(_regalloc);
1391 1391 int offset = blk_starts[block_num] - current_offset;
1392 1392 if (block_num >= i) {
1393 1393               // The offsets of the current and following blocks are not
1394 1394               // finalized yet; adjust the distance by the difference
1395 1395               // between the calculated and final offsets of the current block.
1396 1396 offset -= (blk_starts[i] - blk_offset);
1397 1397 }
1398 1398           // In the following code a nop could be inserted before
1399 1399           // the branch, which would increase the backward distance.
1400 1400 bool needs_padding = (current_offset == last_avoid_back_to_back_offset);
1401 1401 if (needs_padding && offset <= 0)
1402 1402 offset -= nop_size;
1403 1403
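                      // (Illustrative example: with blk_starts[block_num] == 0x100 and
                      // current_offset == 0x140, offset is -0x40, a 64-byte backward
                      // branch; the matcher decides whether a short form can reach it.)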
1404 1404 if (_matcher->is_short_branch_offset(mach->rule(), br_size, offset)) {
1405 1405 // We've got a winner. Replace this branch.
1406 1406 MachNode* replacement = mach->as_MachBranch()->short_branch_version(this);
1407 1407
1408 1408 // Update the jmp_size.
1409 1409 int new_size = replacement->size(_regalloc);
1410 1410 assert((br_size - new_size) >= (int)nop_size, "short_branch size should be smaller");
1411 1411 // Insert padding between avoid_back_to_back branches.
1412 1412 if (needs_padding && replacement->avoid_back_to_back()) {
1413 1413 MachNode *nop = new (this) MachNopNode();
1414 1414 b->_nodes.insert(j++, nop);
1415 1415 _cfg->_bbs.map(nop->_idx, b);
1416 1416 last_inst++;
1417 1417 nop->emit(*cb, _regalloc);
1418 1418 cb->flush_bundle(true);
1419 1419 current_offset = cb->insts_size();
1420 1420 }
1421 1421 #ifdef ASSERT
1422 1422 jmp_target[i] = block_num;
1423 1423 jmp_offset[i] = current_offset - blk_offset;
1424 1424 jmp_size[i] = new_size;
1425 1425 jmp_rule[i] = mach->rule();
1426 1426 #endif
1427 1427 b->_nodes.map(j, replacement);
1428 1428 mach->subsume_by(replacement, C);
1429 1429 n = replacement;
1430 1430 mach = replacement;
1431 1431 }
1432 1432 }
1433 1433 mach->as_MachBranch()->label_set( &blk_labels[block_num], block_num );
1434 1434 } else if (mach->ideal_Opcode() == Op_Jump) {
1435 1435 for (uint h = 0; h < b->_num_succs; h++) {
1436 1436 Block* succs_block = b->_succs[h];
1437 1437 for (uint j = 1; j < succs_block->num_preds(); j++) {
1438 1438 Node* jpn = succs_block->pred(j);
1439 1439 if (jpn->is_JumpProj() && jpn->in(0) == mach) {
1440 1440 uint block_num = succs_block->non_connector()->_pre_order;
1441 1441 Label *blkLabel = &blk_labels[block_num];
1442 1442 mach->add_case_label(jpn->as_JumpProj()->proj_no(), blkLabel);
1443 1443 }
1444 1444 }
1445 1445 }
1446 1446 }
1447 1447
1448 1448 #ifdef ASSERT
1449 1449 // Check that oop-store precedes the card-mark
1450 1450 else if (mach->ideal_Opcode() == Op_StoreCM) {
1451 1451 uint storeCM_idx = j;
1452 1452 int count = 0;
1453 1453 for (uint prec = mach->req(); prec < mach->len(); prec++) {
1454 1454 Node *oop_store = mach->in(prec); // Precedence edge
1455 1455 if (oop_store == NULL) continue;
1456 1456 count++;
1457 1457 uint i4;
1458 1458 for( i4 = 0; i4 < last_inst; ++i4 ) {
1459 1459 if( b->_nodes[i4] == oop_store ) break;
1460 1460 }
1461 1461 // Note: This test can provide a false failure if other precedence
1462 1462 // edges have been added to the storeCMNode.
1463 1463 assert( i4 == last_inst || i4 < storeCM_idx, "CM card-mark executes before oop-store");
1464 1464 }
1465 1465 assert(count > 0, "storeCM expects at least one precedence edge");
1466 1466 }
1467 1467 #endif
1468 1468
1469 1469 else if (!n->is_Proj()) {
1470 1470 // Remember the beginning of the previous instruction, in case
1471 1471 // it's followed by a flag-kill and a null-check. Happens on
1472 1472 // Intel all the time, with add-to-memory kind of opcodes.
1473 1473 previous_offset = current_offset;
1474 1474 }
1475 1475 }
1476 1476
1477 1477 // Verify that there is sufficient space remaining
1478 1478 cb->insts()->maybe_expand_to_ensure_remaining(MAX_inst_size);
1479 1479 if ((cb->blob() == NULL) || (!CompileBroker::should_compile_new_jobs())) {
1480 1480 turn_off_compiler(this);
1481 1481 return;
1482 1482 }
1483 1483
1484 1484 // Save the offset for the listing
1485 1485 #ifndef PRODUCT
1486 1486 if (node_offsets && n->_idx < node_offset_limit)
1487 1487 node_offsets[n->_idx] = cb->insts_size();
1488 1488 #endif
1489 1489
1490 1490 // "Normal" instruction case
1491 1491 DEBUG_ONLY( uint instr_offset = cb->insts_size(); )
1492 1492 n->emit(*cb, _regalloc);
1493 1493 current_offset = cb->insts_size();
1494 1494
1495 1495 #ifdef ASSERT
1496 1496 if (n->size(_regalloc) < (current_offset-instr_offset)) {
1497 1497 n->dump();
1498 1498 assert(false, "wrong size of mach node");
1499 1499 }
1500 1500 #endif
1501 1501 non_safepoints.observe_instruction(n, current_offset);
1502 1502
1503 1503       // mcall is the last "call" that can be a safepoint.
1504 1504       // Record it so we can see if a poll will directly follow it,
1505 1505       // in which case we'll need a pad to make the PcDesc sites unique;
1506 1506       // see 5010568. This can be slightly inaccurate but conservative
1507 1507 // in the case that return address is not actually at current_offset.
1508 1508 // This is a small price to pay.
1509 1509
1510 1510 if (is_mcall) {
1511 1511 last_call_offset = current_offset;
1512 1512 }
1513 1513
1514 1514 if (n->is_Mach() && n->as_Mach()->avoid_back_to_back()) {
1515 1515         // Avoid back-to-back instructions.
1516 1516 last_avoid_back_to_back_offset = current_offset;
1517 1517 }
1518 1518
1519 1519 // See if this instruction has a delay slot
1520 1520 if (valid_bundle_info(n) && node_bundling(n)->use_unconditional_delay()) {
1521 1521 assert(delay_slot != NULL, "expecting delay slot node");
1522 1522
1523 1523 // Back up 1 instruction
1524 1524 cb->set_insts_end(cb->insts_end() - Pipeline::instr_unit_size());
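                  // (Rewinding one instruction unit positions the buffer so the
                  // saved delay_slot node, emitted below, lands in the slot the
                  // branch left open.)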
1525 1525
1526 1526 // Save the offset for the listing
1527 1527 #ifndef PRODUCT
1528 1528 if (node_offsets && delay_slot->_idx < node_offset_limit)
1529 1529 node_offsets[delay_slot->_idx] = cb->insts_size();
1530 1530 #endif
1531 1531
1532 1532 // Support a SafePoint in the delay slot
1533 1533 if (delay_slot->is_MachSafePoint()) {
1534 1534 MachNode *mach = delay_slot->as_Mach();
1535 1535 // !!!!! Stubs only need an oopmap right now, so bail out
1536 1536 if (!mach->is_MachCall() && mach->as_MachSafePoint()->jvms()->method() == NULL) {
1537 1537 // Write the oopmap directly to the code blob??!!
1538 1538 # ifdef ENABLE_ZAP_DEAD_LOCALS
1539 1539 assert( !is_node_getting_a_safepoint(mach), "logic does not match; false positive");
1540 1540 # endif
1541 1541 delay_slot = NULL;
1542 1542 continue;
1543 1543 }
1544 1544
1545 1545 int adjusted_offset = current_offset - Pipeline::instr_unit_size();
1546 1546 non_safepoints.observe_safepoint(mach->as_MachSafePoint()->jvms(),
1547 1547 adjusted_offset);
1548 1548 // Generate an OopMap entry
1549 1549 Process_OopMap_Node(mach, adjusted_offset);
1550 1550 }
1551 1551
1552 1552 // Insert the delay slot instruction
1553 1553 delay_slot->emit(*cb, _regalloc);
1554 1554
1555 1555 // Don't reuse it
1556 1556 delay_slot = NULL;
1557 1557 }
1558 1558
1559 1559 } // End for all instructions in block
1560 1560
1561 1561 // If the next block is the top of a loop, pad this block out to align
1562 1562 // the loop top a little. Helps prevent pipe stalls at loop back branches.
1563 1563 if (i < nblocks-1) {
1564 1564 Block *nb = _cfg->_blocks[i+1];
1565 1565 int padding = nb->alignment_padding(current_offset);
1566 1566 if( padding > 0 ) {
1567 1567 MachNode *nop = new (this) MachNopNode(padding / nop_size);
1568 1568 b->_nodes.insert( b->_nodes.size(), nop );
1569 1569 _cfg->_bbs.map( nop->_idx, b );
1570 1570 nop->emit(*cb, _regalloc);
1571 1571 current_offset = cb->insts_size();
1572 1572 }
1573 1573 }
1574 1574     // Verify that the distance computed earlier for forward
1575 1575     // short branches is still valid.
1576 1576 guarantee((int)(blk_starts[i+1] - blk_starts[i]) >= (current_offset - blk_offset), "shouldn't increase block size");
1577 1577
1578 1578 // Save new block start offset
1579 1579 blk_starts[i] = blk_offset;
1580 1580 } // End of for all blocks
1581 1581 blk_starts[nblocks] = current_offset;
1582 1582
1583 1583 non_safepoints.flush_at_end();
1584 1584
1585 1585 // Offset too large?
1586 1586 if (failing()) return;
1587 1587
1588 1588 // Define a pseudo-label at the end of the code
1589 1589 MacroAssembler(cb).bind( blk_labels[nblocks] );
1590 1590
1591 1591 // Compute the size of the first block
1592 1592 _first_block_size = blk_labels[1].loc_pos() - blk_labels[0].loc_pos();
1593 1593
1594 1594 assert(cb->insts_size() < 500000, "method is unreasonably large");
1595 1595
1596 1596 #ifdef ASSERT
1597 1597 for (uint i = 0; i < nblocks; i++) { // For all blocks
1598 1598 if (jmp_target[i] != 0) {
1599 1599 int br_size = jmp_size[i];
1600 1600 int offset = blk_starts[jmp_target[i]]-(blk_starts[i] + jmp_offset[i]);
1601 1601 if (!_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset)) {
1602 1602 tty->print_cr("target (%d) - jmp_offset(%d) = offset (%d), jump_size(%d), jmp_block B%d, target_block B%d", blk_starts[jmp_target[i]], blk_starts[i] + jmp_offset[i], offset, br_size, i, jmp_target[i]);
1603 1603 assert(false, "Displacement too large for short jmp");
1604 1604 }
1605 1605 }
1606 1606 }
1607 1607 #endif
1608 1608
1609 1609 // ------------------
1610 1610
1611 1611 #ifndef PRODUCT
1612 1612 // Information on the size of the method, without the extraneous code
1613 1613 Scheduling::increment_method_size(cb->insts_size());
1614 1614 #endif
1615 1615
1616 1616 // ------------------
1617 1617 // Fill in exception table entries.
1618 1618 FillExceptionTables(inct_cnt, call_returns, inct_starts, blk_labels);
1619 1619
1620 1620   // Only Java methods have exception handlers and deopt handlers
1621 1621 if (_method) {
1622 1622 // Emit the exception handler code.
1623 1623 _code_offsets.set_value(CodeOffsets::Exceptions, emit_exception_handler(*cb));
1624 1624 // Emit the deopt handler code.
1625 1625 _code_offsets.set_value(CodeOffsets::Deopt, emit_deopt_handler(*cb));
1626 1626
1627 1627 // Emit the MethodHandle deopt handler code (if required).
1628 1628 if (has_method_handle_invokes()) {
1629 1629 // We can use the same code as for the normal deopt handler, we
1630 1630 // just need a different entry point address.
1631 1631 _code_offsets.set_value(CodeOffsets::DeoptMH, emit_deopt_handler(*cb));
1632 1632 }
1633 1633 }
1634 1634
1635 1635 // One last check for failed CodeBuffer::expand:
1636 1636 if ((cb->blob() == NULL) || (!CompileBroker::should_compile_new_jobs())) {
1637 1637 turn_off_compiler(this);
1638 1638 return;
1639 1639 }
1640 1640
1641 1641 #ifndef PRODUCT
1642 1642 // Dump the assembly code, including basic-block numbers
1643 1643 if (print_assembly()) {
1644 1644 ttyLocker ttyl; // keep the following output all in one block
1645 1645 if (!VMThread::should_terminate()) { // test this under the tty lock
1646 1646 // This output goes directly to the tty, not the compiler log.
1647 1647 // To enable tools to match it up with the compilation activity,
1648 1648 // be sure to tag this tty output with the compile ID.
1649 1649 if (xtty != NULL) {
1650 1650 xtty->head("opto_assembly compile_id='%d'%s", compile_id(),
1651 1651 is_osr_compilation() ? " compile_kind='osr'" :
1652 1652 "");
1653 1653 }
1654 1654 if (method() != NULL) {
1655 1655 method()->print_oop();
1656 1656 print_codes();
1657 1657 }
1658 1658 dump_asm(node_offsets, node_offset_limit);
1659 1659 if (xtty != NULL) {
1660 1660 xtty->tail("opto_assembly");
1661 1661 }
1662 1662 }
1663 1663 }
1664 1664 #endif
1665 1665
1666 1666 }
1667 1667
1668 1668 void Compile::FillExceptionTables(uint cnt, uint *call_returns, uint *inct_starts, Label *blk_labels) {
1669 1669 _inc_table.set_size(cnt);
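            // (Two tables are filled here: for each block ending in a Catch, a
            // subtable maps the call's return PC to handler bci/PCO pairs; for
            // each MachNullCheck, an implicit-exception entry maps the faulting
            // PC to the handler block's PC.)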
1670 1670
1671 1671 uint inct_cnt = 0;
1672 1672 for( uint i=0; i<_cfg->_num_blocks; i++ ) {
1673 1673 Block *b = _cfg->_blocks[i];
1674 1674 Node *n = NULL;
1675 1675 int j;
1676 1676
1677 1677 // Find the branch; ignore trailing NOPs.
1678 1678 for( j = b->_nodes.size()-1; j>=0; j-- ) {
1679 1679 n = b->_nodes[j];
1680 1680 if( !n->is_Mach() || n->as_Mach()->ideal_Opcode() != Op_Con )
1681 1681 break;
1682 1682 }
1683 1683
1684 1684 // If we didn't find anything, continue
1685 1685 if( j < 0 ) continue;
1686 1686
1687 1687 // Compute ExceptionHandlerTable subtable entry and add it
1688 1688 // (skip empty blocks)
1689 1689 if( n->is_Catch() ) {
1690 1690
1691 1691 // Get the offset of the return from the call
1692 1692 uint call_return = call_returns[b->_pre_order];
1693 1693 #ifdef ASSERT
1694 1694 assert( call_return > 0, "no call seen for this basic block" );
1695 1695 while( b->_nodes[--j]->is_MachProj() ) ;
1696 1696 assert( b->_nodes[j]->is_MachCall(), "CatchProj must follow call" );
1697 1697 #endif
1698 1698       // The last instruction is a CatchNode; find its CatchProjNodes
1699 1699 int nof_succs = b->_num_succs;
1700 1700 // allocate space
1701 1701 GrowableArray<intptr_t> handler_bcis(nof_succs);
1702 1702 GrowableArray<intptr_t> handler_pcos(nof_succs);
1703 1703 // iterate through all successors
1704 1704 for (int j = 0; j < nof_succs; j++) {
1705 1705 Block* s = b->_succs[j];
1706 1706 bool found_p = false;
1707 1707 for( uint k = 1; k < s->num_preds(); k++ ) {
1708 1708 Node *pk = s->pred(k);
1709 1709 if( pk->is_CatchProj() && pk->in(0) == n ) {
1710 1710 const CatchProjNode* p = pk->as_CatchProj();
1711 1711 found_p = true;
1712 1712 // add the corresponding handler bci & pco information
1713 1713 if( p->_con != CatchProjNode::fall_through_index ) {
1714 1714 // p leads to an exception handler (and is not fall through)
1715 1715 assert(s == _cfg->_blocks[s->_pre_order],"bad numbering");
1716 1716 // no duplicates, please
1717 1717 if( !handler_bcis.contains(p->handler_bci()) ) {
1718 1718 uint block_num = s->non_connector()->_pre_order;
1719 1719 handler_bcis.append(p->handler_bci());
1720 1720 handler_pcos.append(blk_labels[block_num].loc_pos());
1721 1721 }
1722 1722 }
1723 1723 }
1724 1724 }
1725 1725 assert(found_p, "no matching predecessor found");
1726 1726 // Note: Due to empty block removal, one block may have
1727 1727 // several CatchProj inputs, from the same Catch.
1728 1728 }
1729 1729
1730 1730 // Set the offset of the return from the call
1731 1731 _handler_table.add_subtable(call_return, &handler_bcis, NULL, &handler_pcos);
1732 1732 continue;
1733 1733 }
1734 1734
1735 1735 // Handle implicit null exception table updates
1736 1736 if( n->is_MachNullCheck() ) {
1737 1737 uint block_num = b->non_connector_successor(0)->_pre_order;
1738 1738 _inc_table.append( inct_starts[inct_cnt++], blk_labels[block_num].loc_pos() );
1739 1739 continue;
1740 1740 }
1741 1741 } // End of for all blocks fill in exception table entries
1742 1742 }
1743 1743
1744 1744 // Static Variables
1745 1745 #ifndef PRODUCT
1746 1746 uint Scheduling::_total_nop_size = 0;
1747 1747 uint Scheduling::_total_method_size = 0;
1748 1748 uint Scheduling::_total_branches = 0;
1749 1749 uint Scheduling::_total_unconditional_delays = 0;
1750 1750 uint Scheduling::_total_instructions_per_bundle[Pipeline::_max_instrs_per_cycle+1];
1751 1751 #endif
1752 1752
1753 1753 // Initializer for class Scheduling
1754 1754
1755 1755 Scheduling::Scheduling(Arena *arena, Compile &compile)
1756 1756 : _arena(arena),
1757 1757 _cfg(compile.cfg()),
1758 1758 _bbs(compile.cfg()->_bbs),
1759 1759 _regalloc(compile.regalloc()),
1760 1760 _reg_node(arena),
1761 1761 _bundle_instr_count(0),
1762 1762 _bundle_cycle_number(0),
1763 1763 _scheduled(arena),
1764 1764 _available(arena),
1765 1765 _next_node(NULL),
1766 1766 _bundle_use(0, 0, resource_count, &_bundle_use_elements[0]),
1767 1767 _pinch_free_list(arena)
1768 1768 #ifndef PRODUCT
1769 1769 , _branches(0)
1770 1770 , _unconditional_delays(0)
1771 1771 #endif
1772 1772 {
1773 1773 // Create a MachNopNode
1774 1774 _nop = new (&compile) MachNopNode();
1775 1775
1776 1776 // Now that the nops are in the array, save the count
1777 1777 // (but allow entries for the nops)
1778 1778 _node_bundling_limit = compile.unique();
1779 1779 uint node_max = _regalloc->node_regs_max_index();
1780 1780
1781 1781 compile.set_node_bundling_limit(_node_bundling_limit);
1782 1782
1783 1783 // This one is persistent within the Compile class
1784 1784 _node_bundling_base = NEW_ARENA_ARRAY(compile.comp_arena(), Bundle, node_max);
1785 1785
1786 1786 // Allocate space for fixed-size arrays
1787 1787 _node_latency = NEW_ARENA_ARRAY(arena, unsigned short, node_max);
1788 1788 _uses = NEW_ARENA_ARRAY(arena, short, node_max);
1789 1789 _current_latency = NEW_ARENA_ARRAY(arena, unsigned short, node_max);
1790 1790
1791 1791 // Clear the arrays
1792 1792 memset(_node_bundling_base, 0, node_max * sizeof(Bundle));
1793 1793 memset(_node_latency, 0, node_max * sizeof(unsigned short));
1794 1794 memset(_uses, 0, node_max * sizeof(short));
1795 1795 memset(_current_latency, 0, node_max * sizeof(unsigned short));
1796 1796
1797 1797 // Clear the bundling information
1798 1798 memcpy(_bundle_use_elements,
1799 1799 Pipeline_Use::elaborated_elements,
1800 1800 sizeof(Pipeline_Use::elaborated_elements));
1801 1801
1802 1802 // Get the last node
1803 1803 Block *bb = _cfg->_blocks[_cfg->_blocks.size()-1];
1804 1804
1805 1805 _next_node = bb->_nodes[bb->_nodes.size()-1];
1806 1806 }
1807 1807
1808 1808 #ifndef PRODUCT
1809 1809 // Scheduling destructor
1810 1810 Scheduling::~Scheduling() {
1811 1811 _total_branches += _branches;
1812 1812 _total_unconditional_delays += _unconditional_delays;
1813 1813 }
1814 1814 #endif
1815 1815
1816 1816 // Step ahead "i" cycles
1817 1817 void Scheduling::step(uint i) {
1818 1818
1819 1819 Bundle *bundle = node_bundling(_next_node);
1820 1820 bundle->set_starts_bundle();
1821 1821
1822 1822 // Update the bundle record, but leave the flags information alone
1823 1823 if (_bundle_instr_count > 0) {
1824 1824 bundle->set_instr_count(_bundle_instr_count);
1825 1825 bundle->set_resources_used(_bundle_use.resourcesUsed());
1826 1826 }
1827 1827
1828 1828 // Update the state information
1829 1829 _bundle_instr_count = 0;
1830 1830 _bundle_cycle_number += i;
1831 1831 _bundle_use.step(i);
1832 1832 }
1833 1833
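          // step_and_clear() below ends the current bundle like step(), but also
          // resets the functional-unit usage table to its initial state; it is
          // invoked at block boundaries where bundling state cannot be carried
          // across.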
1834 1834 void Scheduling::step_and_clear() {
1835 1835 Bundle *bundle = node_bundling(_next_node);
1836 1836 bundle->set_starts_bundle();
1837 1837
1838 1838 // Update the bundle record
1839 1839 if (_bundle_instr_count > 0) {
1840 1840 bundle->set_instr_count(_bundle_instr_count);
1841 1841 bundle->set_resources_used(_bundle_use.resourcesUsed());
1842 1842
1843 1843 _bundle_cycle_number += 1;
1844 1844 }
1845 1845
1846 1846 // Clear the bundling information
1847 1847 _bundle_instr_count = 0;
1848 1848 _bundle_use.reset();
1849 1849
1850 1850 memcpy(_bundle_use_elements,
1851 1851 Pipeline_Use::elaborated_elements,
1852 1852 sizeof(Pipeline_Use::elaborated_elements));
1853 1853 }
1854 1854
1855 1855 //------------------------------ScheduleAndBundle------------------------------
1856 1856 // Perform instruction scheduling and bundling over the sequence of
1857 1857 // instructions in backwards order.
1858 1858 void Compile::ScheduleAndBundle() {
1859 1859
1860 1860 // Don't optimize this if it isn't a method
1861 1861 if (!_method)
1862 1862 return;
1863 1863
1864 1864 // Don't optimize this if scheduling is disabled
1865 1865 if (!do_scheduling())
1866 1866 return;
1867 1867
1868 1868 // Scheduling code works only with pairs (8 bytes) maximum.
1869 1869 if (max_vector_size() > 8)
1870 1870 return;
1871 1871
1872 1872 NOT_PRODUCT( TracePhase t2("isched", &_t_instrSched, TimeCompiler); )
1873 1873
1874 1874 // Create a data structure for all the scheduling information
1875 1875 Scheduling scheduling(Thread::current()->resource_area(), *this);
1876 1876
1877 1877   // Walk backwards over each basic block, computing the needed alignment
1878 1878   // and scheduling the instructions in each one
1879 1879 scheduling.DoScheduling();
1880 1880 }
1881 1881
1882 1882 //------------------------------ComputeLocalLatenciesForward-------------------
1883 1883 // Compute the latency of all the instructions. This is fairly simple,
1884 1884 // because we already have a legal ordering. Walk over the instructions
1885 1885 // from first to last, and compute the latency of the instruction based
1886 1886 // on the latency of the preceding instruction(s).
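          // (For example, if an input def has latency 3 and the use edge contributes
          // latency 2, the node's latency becomes at least 5; the maximum over all
          // inputs is taken, with a floor of 1.)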
1887 1887 void Scheduling::ComputeLocalLatenciesForward(const Block *bb) {
1888 1888 #ifndef PRODUCT
1889 1889 if (_cfg->C->trace_opto_output())
1890 1890 tty->print("# -> ComputeLocalLatenciesForward\n");
1891 1891 #endif
1892 1892
1893 1893 // Walk over all the schedulable instructions
1894 1894 for( uint j=_bb_start; j < _bb_end; j++ ) {
1895 1895
1896 1896 // This is a kludge, forcing all latency calculations to start at 1.
1897 1897 // Used to allow latency 0 to force an instruction to the beginning
1898 1898 // of the bb
1899 1899 uint latency = 1;
1900 1900 Node *use = bb->_nodes[j];
1901 1901 uint nlen = use->len();
1902 1902
1903 1903 // Walk over all the inputs
1904 1904 for ( uint k=0; k < nlen; k++ ) {
1905 1905 Node *def = use->in(k);
1906 1906 if (!def)
1907 1907 continue;
1908 1908
1909 1909 uint l = _node_latency[def->_idx] + use->latency(k);
1910 1910 if (latency < l)
1911 1911 latency = l;
1912 1912 }
1913 1913
1914 1914 _node_latency[use->_idx] = latency;
1915 1915
1916 1916 #ifndef PRODUCT
1917 1917 if (_cfg->C->trace_opto_output()) {
1918 1918 tty->print("# latency %4d: ", latency);
1919 1919 use->dump();
1920 1920 }
1921 1921 #endif
1922 1922 }
1923 1923
1924 1924 #ifndef PRODUCT
1925 1925 if (_cfg->C->trace_opto_output())
1926 1926 tty->print("# <- ComputeLocalLatenciesForward\n");
1927 1927 #endif
1928 1928
1929 1929 } // end ComputeLocalLatenciesForward
1930 1930
1931 1931 // See if this node fits into the present instruction bundle
1932 1932 bool Scheduling::NodeFitsInBundle(Node *n) {
1933 1933 uint n_idx = n->_idx;
1934 1934
1935 1935 // If this is the unconditional delay instruction, then it fits
1936 1936 if (n == _unconditional_delay_slot) {
1937 1937 #ifndef PRODUCT
1938 1938 if (_cfg->C->trace_opto_output())
1939 1939 tty->print("# NodeFitsInBundle [%4d]: TRUE; is in unconditional delay slot\n", n->_idx);
1940 1940 #endif
1941 1941 return (true);
1942 1942 }
1943 1943
1944 1944 // If the node cannot be scheduled this cycle, skip it
1945 1945 if (_current_latency[n_idx] > _bundle_cycle_number) {
1946 1946 #ifndef PRODUCT
1947 1947 if (_cfg->C->trace_opto_output())
1948 1948 tty->print("# NodeFitsInBundle [%4d]: FALSE; latency %4d > %d\n",
1949 1949 n->_idx, _current_latency[n_idx], _bundle_cycle_number);
1950 1950 #endif
1951 1951 return (false);
1952 1952 }
1953 1953
1954 1954 const Pipeline *node_pipeline = n->pipeline();
1955 1955
1956 1956 uint instruction_count = node_pipeline->instructionCount();
1957 1957 if (node_pipeline->mayHaveNoCode() && n->size(_regalloc) == 0)
1958 1958 instruction_count = 0;
1959 1959 else if (node_pipeline->hasBranchDelay() && !_unconditional_delay_slot)
1960 1960 instruction_count++;
1961 1961
1962 1962 if (_bundle_instr_count + instruction_count > Pipeline::_max_instrs_per_cycle) {
1963 1963 #ifndef PRODUCT
1964 1964 if (_cfg->C->trace_opto_output())
1965 1965 tty->print("# NodeFitsInBundle [%4d]: FALSE; too many instructions: %d > %d\n",
1966 1966 n->_idx, _bundle_instr_count + instruction_count, Pipeline::_max_instrs_per_cycle);
1967 1967 #endif
1968 1968 return (false);
1969 1969 }
1970 1970
1971 1971 // Don't allow non-machine nodes to be handled this way
1972 1972 if (!n->is_Mach() && instruction_count == 0)
1973 1973 return (false);
1974 1974
1975 1975 // See if there is any overlap
1976 1976 uint delay = _bundle_use.full_latency(0, node_pipeline->resourceUse());
1977 1977
1978 1978 if (delay > 0) {
1979 1979 #ifndef PRODUCT
1980 1980 if (_cfg->C->trace_opto_output())
1981 1981 tty->print("# NodeFitsInBundle [%4d]: FALSE; functional units overlap\n", n_idx);
1982 1982 #endif
1983 1983 return false;
1984 1984 }
1985 1985
1986 1986 #ifndef PRODUCT
1987 1987 if (_cfg->C->trace_opto_output())
1988 1988 tty->print("# NodeFitsInBundle [%4d]: TRUE\n", n_idx);
1989 1989 #endif
1990 1990
1991 1991 return true;
1992 1992 }
1993 1993
1994 1994 Node * Scheduling::ChooseNodeToBundle() {
1995 1995 uint siz = _available.size();
1996 1996
1997 1997 if (siz == 0) {
1998 1998
1999 1999 #ifndef PRODUCT
2000 2000 if (_cfg->C->trace_opto_output())
2001 2001 tty->print("# ChooseNodeToBundle: NULL\n");
2002 2002 #endif
2003 2003 return (NULL);
2004 2004 }
2005 2005
2006 2006 // Fast path, if only 1 instruction in the bundle
2007 2007 if (siz == 1) {
2008 2008 #ifndef PRODUCT
2009 2009 if (_cfg->C->trace_opto_output()) {
2010 2010 tty->print("# ChooseNodeToBundle (only 1): ");
2011 2011 _available[0]->dump();
2012 2012 }
2013 2013 #endif
2014 2014 return (_available[0]);
2015 2015 }
2016 2016
2017 2017   // Don't bother looking for a candidate if the bundle is already full
2018 2018 if (_bundle_instr_count < Pipeline::_max_instrs_per_cycle) {
2019 2019 for ( uint i = 0; i < siz; i++ ) {
2020 2020 Node *n = _available[i];
2021 2021
2022 2022 // Skip projections, we'll handle them another way
2023 2023 if (n->is_Proj())
2024 2024 continue;
2025 2025
2026 2026       // This presupposes that instructions are inserted into the
2027 2027       // available list in legality order; i.e., instructions that
2028 2028       // must come first are at the head of the list
2029 2029 if (NodeFitsInBundle(n)) {
2030 2030 #ifndef PRODUCT
2031 2031 if (_cfg->C->trace_opto_output()) {
2032 2032 tty->print("# ChooseNodeToBundle: ");
2033 2033 n->dump();
2034 2034 }
2035 2035 #endif
2036 2036 return (n);
2037 2037 }
2038 2038 }
2039 2039 }
2040 2040
2041 2041 // Nothing fits in this bundle, choose the highest priority
2042 2042 #ifndef PRODUCT
2043 2043 if (_cfg->C->trace_opto_output()) {
2044 2044 tty->print("# ChooseNodeToBundle: ");
2045 2045 _available[0]->dump();
2046 2046 }
2047 2047 #endif
2048 2048
2049 2049 return _available[0];
2050 2050 }
2051 2051
2052 2052 //------------------------------AddNodeToAvailableList-------------------------
2053 2053 void Scheduling::AddNodeToAvailableList(Node *n) {
2054 2054 assert( !n->is_Proj(), "projections never directly made available" );
2055 2055 #ifndef PRODUCT
2056 2056 if (_cfg->C->trace_opto_output()) {
2057 2057 tty->print("# AddNodeToAvailableList: ");
2058 2058 n->dump();
2059 2059 }
2060 2060 #endif
2061 2061
2062 2062 int latency = _current_latency[n->_idx];
2063 2063
2064 2064 // Insert in latency order (insertion sort)
2065 2065 uint i;
2066 2066 for ( i=0; i < _available.size(); i++ )
2067 2067 if (_current_latency[_available[i]->_idx] > latency)
2068 2068 break;
2069 2069
2070 2070   // Special check for compares following branches
2071 2071 if( n->is_Mach() && _scheduled.size() > 0 ) {
2072 2072 int op = n->as_Mach()->ideal_Opcode();
2073 2073 Node *last = _scheduled[0];
2074 2074 if( last->is_MachIf() && last->in(1) == n &&
2075 2075 ( op == Op_CmpI ||
2076 2076 op == Op_CmpU ||
2077 2077 op == Op_CmpP ||
2078 2078 op == Op_CmpF ||
2079 2079 op == Op_CmpD ||
2080 2080 op == Op_CmpL ) ) {
2081 2081
2082 2082 // Recalculate position, moving to front of same latency
2083 2083 for ( i=0 ; i < _available.size(); i++ )
2084 2084 if (_current_latency[_available[i]->_idx] >= latency)
2085 2085 break;
2086 2086 }
2087 2087 }
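            // (Moving the compare to the front of its latency class makes it be
            // chosen sooner in the backwards schedule, which keeps it close to
            // the already-scheduled branch in the final instruction order.)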
2088 2088
2089 2089 // Insert the node in the available list
2090 2090 _available.insert(i, n);
2091 2091
2092 2092 #ifndef PRODUCT
2093 2093 if (_cfg->C->trace_opto_output())
2094 2094 dump_available();
2095 2095 #endif
2096 2096 }
2097 2097
2098 2098 //------------------------------DecrementUseCounts-----------------------------
2099 2099 void Scheduling::DecrementUseCounts(Node *n, const Block *bb) {
2100 2100 for ( uint i=0; i < n->len(); i++ ) {
2101 2101 Node *def = n->in(i);
2102 2102 if (!def) continue;
2103 2103 if( def->is_Proj() ) // If this is a machine projection, then
2104 2104 def = def->in(0); // propagate usage thru to the base instruction
2105 2105
2106 2106 if( _bbs[def->_idx] != bb ) // Ignore if not block-local
2107 2107 continue;
2108 2108
2109 2109 // Compute the latency
2110 2110 uint l = _bundle_cycle_number + n->latency(i);
2111 2111 if (_current_latency[def->_idx] < l)
2112 2112 _current_latency[def->_idx] = l;
2113 2113
2114 2114     // If this def has no remaining uses, make it available
2115 2115 if ((--_uses[def->_idx]) == 0)
2116 2116 AddNodeToAvailableList(def);
2117 2117 }
2118 2118 }
2119 2119
2120 2120 //------------------------------AddNodeToBundle--------------------------------
2121 2121 void Scheduling::AddNodeToBundle(Node *n, const Block *bb) {
2122 2122 #ifndef PRODUCT
2123 2123 if (_cfg->C->trace_opto_output()) {
2124 2124 tty->print("# AddNodeToBundle: ");
2125 2125 n->dump();
2126 2126 }
2127 2127 #endif
2128 2128
2129 2129 // Remove this from the available list
2130 2130 uint i;
2131 2131 for (i = 0; i < _available.size(); i++)
2132 2132 if (_available[i] == n)
2133 2133 break;
2134 2134 assert(i < _available.size(), "entry in _available list not found");
2135 2135 _available.remove(i);
2136 2136
2137 2137 // See if this fits in the current bundle
2138 2138 const Pipeline *node_pipeline = n->pipeline();
2139 2139 const Pipeline_Use& node_usage = node_pipeline->resourceUse();
2140 2140
2141 2141 // Check for instructions to be placed in the delay slot. We
2142 2142 // do this before we actually schedule the current instruction,
2143 2143 // because the delay slot follows the current instruction.
2144 2144 if (Pipeline::_branch_has_delay_slot &&
2145 2145 node_pipeline->hasBranchDelay() &&
2146 2146 !_unconditional_delay_slot) {
2147 2147
2148 2148 uint siz = _available.size();
2149 2149
2150 2150 // Conditional branches can support an instruction that
2151 2151     // is unconditionally executed and not dependent on the
2152 2152 // branch, OR a conditionally executed instruction if
2153 2153 // the branch is taken. In practice, this means that
2154 2154 // the first instruction at the branch target is
2155 2155 // copied to the delay slot, and the branch goes to
2156 2156 // the instruction after that at the branch target
2157 2157 if ( n->is_MachBranch() ) {
2158 2158
2159 2159 assert( !n->is_MachNullCheck(), "should not look for delay slot for Null Check" );
2160 2160 assert( !n->is_Catch(), "should not look for delay slot for Catch" );
2161 2161
2162 2162 #ifndef PRODUCT
2163 2163 _branches++;
2164 2164 #endif
2165 2165
2166 2166 // At least 1 instruction is on the available list
2167 2167 // that is not dependent on the branch
2168 2168 for (uint i = 0; i < siz; i++) {
2169 2169 Node *d = _available[i];
2170 2170 const Pipeline *avail_pipeline = d->pipeline();
2171 2171
2172 2172         // Don't allow safepoints in the branch shadow; that would
2173 2173         // cause a number of difficulties
2174 2174 if ( avail_pipeline->instructionCount() == 1 &&
2175 2175 !avail_pipeline->hasMultipleBundles() &&
2176 2176 !avail_pipeline->hasBranchDelay() &&
2177 2177 Pipeline::instr_has_unit_size() &&
2178 2178 d->size(_regalloc) == Pipeline::instr_unit_size() &&
2179 2179 NodeFitsInBundle(d) &&
2180 2180 !node_bundling(d)->used_in_delay()) {
2181 2181
2182 2182 if (d->is_Mach() && !d->is_MachSafePoint()) {
2183 2183 // A node that fits in the delay slot was found, so we need to
2184 2184 // set the appropriate bits in the bundle pipeline information so
2185 2185 // that it correctly indicates resource usage. Later, when we
2186 2186 // attempt to add this instruction to the bundle, we will skip
2187 2187 // setting the resource usage.
2188 2188 _unconditional_delay_slot = d;
2189 2189 node_bundling(n)->set_use_unconditional_delay();
2190 2190 node_bundling(d)->set_used_in_unconditional_delay();
2191 2191 _bundle_use.add_usage(avail_pipeline->resourceUse());
2192 2192 _current_latency[d->_idx] = _bundle_cycle_number;
2193 2193 _next_node = d;
2194 2194 ++_bundle_instr_count;
2195 2195 #ifndef PRODUCT
2196 2196 _unconditional_delays++;
2197 2197 #endif
2198 2198 break;
2199 2199 }
2200 2200 }
2201 2201 }
2202 2202 }
2203 2203
2204 2204 // No delay slot, add a nop to the usage
2205 2205 if (!_unconditional_delay_slot) {
2206 2206 // See if adding an instruction in the delay slot will overflow
2207 2207 // the bundle.
2208 2208 if (!NodeFitsInBundle(_nop)) {
2209 2209 #ifndef PRODUCT
2210 2210 if (_cfg->C->trace_opto_output())
2211 2211 tty->print("# *** STEP(1 instruction for delay slot) ***\n");
2212 2212 #endif
2213 2213 step(1);
2214 2214 }
2215 2215
2216 2216 _bundle_use.add_usage(_nop->pipeline()->resourceUse());
2217 2217 _next_node = _nop;
2218 2218 ++_bundle_instr_count;
2219 2219 }
2220 2220
2221 2221 // See if the instruction in the delay slot requires a
2222 2222 // step of the bundles
2223 2223 if (!NodeFitsInBundle(n)) {
2224 2224 #ifndef PRODUCT
2225 2225 if (_cfg->C->trace_opto_output())
2226 2226 tty->print("# *** STEP(branch won't fit) ***\n");
2227 2227 #endif
2228 2228 // Update the state information
2229 2229 _bundle_instr_count = 0;
2230 2230 _bundle_cycle_number += 1;
2231 2231 _bundle_use.step(1);
2232 2232 }
2233 2233 }
2234 2234
2235 2235 // Get the number of instructions
2236 2236 uint instruction_count = node_pipeline->instructionCount();
2237 2237 if (node_pipeline->mayHaveNoCode() && n->size(_regalloc) == 0)
2238 2238 instruction_count = 0;
2239 2239
2240 2240 // Compute the latency information
2241 2241 uint delay = 0;
2242 2242
2243 2243 if (instruction_count > 0 || !node_pipeline->mayHaveNoCode()) {
2244 2244 int relative_latency = _current_latency[n->_idx] - _bundle_cycle_number;
2245 2245 if (relative_latency < 0)
2246 2246 relative_latency = 0;
2247 2247
2248 2248 delay = _bundle_use.full_latency(relative_latency, node_usage);
2249 2249
2250 2250 // Does not fit in this bundle, start a new one
2251 2251 if (delay > 0) {
2252 2252 step(delay);
2253 2253
2254 2254 #ifndef PRODUCT
2255 2255 if (_cfg->C->trace_opto_output())
2256 2256 tty->print("# *** STEP(%d) ***\n", delay);
2257 2257 #endif
2258 2258 }
2259 2259 }
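            // (Example: if the node's earliest cycle is 7 and the bundle is at
            // cycle 5, relative_latency is 2; full_latency() may stretch the
            // delay further if a functional unit is still busy, and step(delay)
            // then opens a new bundle.)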
2260 2260
2261 2261 // If this was placed in the delay slot, ignore it
2262 2262 if (n != _unconditional_delay_slot) {
2263 2263
2264 2264 if (delay == 0) {
2265 2265 if (node_pipeline->hasMultipleBundles()) {
2266 2266 #ifndef PRODUCT
2267 2267 if (_cfg->C->trace_opto_output())
2268 2268 tty->print("# *** STEP(multiple instructions) ***\n");
2269 2269 #endif
2270 2270 step(1);
2271 2271 }
2272 2272
2273 2273 else if (instruction_count + _bundle_instr_count > Pipeline::_max_instrs_per_cycle) {
2274 2274 #ifndef PRODUCT
2275 2275 if (_cfg->C->trace_opto_output())
2276 2276 tty->print("# *** STEP(%d >= %d instructions) ***\n",
2277 2277 instruction_count + _bundle_instr_count,
2278 2278 Pipeline::_max_instrs_per_cycle);
2279 2279 #endif
2280 2280 step(1);
2281 2281 }
2282 2282 }
2283 2283
2284 2284 if (node_pipeline->hasBranchDelay() && !_unconditional_delay_slot)
2285 2285 _bundle_instr_count++;
2286 2286
2287 2287 // Set the node's latency
2288 2288 _current_latency[n->_idx] = _bundle_cycle_number;
2289 2289
2290 2290 // Now merge the functional unit information
2291 2291 if (instruction_count > 0 || !node_pipeline->mayHaveNoCode())
2292 2292 _bundle_use.add_usage(node_usage);
2293 2293
2294 2294 // Increment the number of instructions in this bundle
2295 2295 _bundle_instr_count += instruction_count;
2296 2296
2297 2297 // Remember this node for later
2298 2298 if (n->is_Mach())
2299 2299 _next_node = n;
2300 2300 }
2301 2301
2302 2302 // It's possible to have a BoxLock in the graph and in the _bbs mapping but
2303 2303 // not in the bb->_nodes array. This happens for debug-info-only BoxLocks.
2304 2304 // 'Schedule' them (basically ignore in the schedule) but do not insert them
2305 2305 // into the block. All other scheduled nodes get put in the schedule here.
2306 2306 int op = n->Opcode();
2307 2307 if( (op == Op_Node && n->req() == 0) || // anti-dependence node OR
2308 2308       (op != Op_Node &&         // Not an unused antidependence node and
2309 2309 // not an unallocated boxlock
2310 2310 (OptoReg::is_valid(_regalloc->get_reg_first(n)) || op != Op_BoxLock)) ) {
2311 2311
2312 2312 // Push any trailing projections
2313 2313 if( bb->_nodes[bb->_nodes.size()-1] != n ) {
2314 2314 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2315 2315 Node *foi = n->fast_out(i);
2316 2316 if( foi->is_Proj() )
2317 2317 _scheduled.push(foi);
2318 2318 }
2319 2319 }
2320 2320
2321 2321 // Put the instruction in the schedule list
2322 2322 _scheduled.push(n);
2323 2323 }
2324 2324
2325 2325 #ifndef PRODUCT
2326 2326 if (_cfg->C->trace_opto_output())
2327 2327 dump_available();
2328 2328 #endif
2329 2329
2330 2330 // Walk all the definitions, decrementing use counts, and
2331 2331 // if a definition has a 0 use count, place it in the available list.
2332 2332 DecrementUseCounts(n,bb);
2333 2333 }
2334 2334
2335 2335 //------------------------------ComputeUseCount--------------------------------
2336 2336 // This method sets the use count within a basic block. We will ignore all
2337 2337 // uses outside the current basic block. As we are doing a backwards walk,
2338 2338 // any node we reach that has a use count of 0 may be scheduled. This also
2339 2339 // avoids the problem of cyclic references from phi nodes, as long as phi
2340 2340 // nodes are at the front of the basic block. This method also initializes
2341 2341 // the available list to the set of instructions that have no uses within this
2342 2342 // basic block.
2343 2343 void Scheduling::ComputeUseCount(const Block *bb) {
2344 2344 #ifndef PRODUCT
2345 2345 if (_cfg->C->trace_opto_output())
2346 2346 tty->print("# -> ComputeUseCount\n");
2347 2347 #endif
2348 2348
2349 2349 // Clear the list of available and scheduled instructions, just in case
2350 2350 _available.clear();
2351 2351 _scheduled.clear();
2352 2352
2353 2353 // No delay slot specified
2354 2354 _unconditional_delay_slot = NULL;
2355 2355
2356 2356 #ifdef ASSERT
2357 2357 for( uint i=0; i < bb->_nodes.size(); i++ )
2358 2358 assert( _uses[bb->_nodes[i]->_idx] == 0, "_use array not clean" );
2359 2359 #endif
2360 2360
2361 2361   // Force the _uses count to never go to zero for unschedulable pieces
2362 2362 // of the block
2363 2363 for( uint k = 0; k < _bb_start; k++ )
2364 2364 _uses[bb->_nodes[k]->_idx] = 1;
2365 2365 for( uint l = _bb_end; l < bb->_nodes.size(); l++ )
2366 2366 _uses[bb->_nodes[l]->_idx] = 1;
2367 2367
2368 2368 // Iterate backwards over the instructions in the block. Don't count the
2369 2369 // branch projections at end or the block header instructions.
2370 2370 for( uint j = _bb_end-1; j >= _bb_start; j-- ) {
2371 2371 Node *n = bb->_nodes[j];
2372 2372 if( n->is_Proj() ) continue; // Projections handled another way
2373 2373
2374 2374 // Account for all uses
2375 2375 for ( uint k = 0; k < n->len(); k++ ) {
2376 2376 Node *inp = n->in(k);
2377 2377 if (!inp) continue;
2378 2378 assert(inp != n, "no cycles allowed" );
2379 2379 if( _bbs[inp->_idx] == bb ) { // Block-local use?
2380 2380 if( inp->is_Proj() ) // Skip through Proj's
2381 2381 inp = inp->in(0);
2382 2382 ++_uses[inp->_idx]; // Count 1 block-local use
2383 2383 }
2384 2384 }
2385 2385
2386 2386 // If this instruction has a 0 use count, then it is available
2387 2387 if (!_uses[n->_idx]) {
2388 2388 _current_latency[n->_idx] = _bundle_cycle_number;
2389 2389 AddNodeToAvailableList(n);
2390 2390 }
2391 2391
2392 2392 #ifndef PRODUCT
2393 2393 if (_cfg->C->trace_opto_output()) {
2394 2394 tty->print("# uses: %3d: ", _uses[n->_idx]);
2395 2395 n->dump();
2396 2396 }
2397 2397 #endif
2398 2398 }
2399 2399
2400 2400 #ifndef PRODUCT
2401 2401 if (_cfg->C->trace_opto_output())
2402 2402 tty->print("# <- ComputeUseCount\n");
2403 2403 #endif
2404 2404 }
2405 2405
2406 2406 // This routine performs scheduling on each basic block in reverse order,
2407 2407 // using instruction latencies and taking into account functional unit
2408 2408 // availability.
2409 2409 void Scheduling::DoScheduling() {
2410 2410 #ifndef PRODUCT
2411 2411 if (_cfg->C->trace_opto_output())
2412 2412 tty->print("# -> DoScheduling\n");
2413 2413 #endif
2414 2414
2415 2415 Block *succ_bb = NULL;
2416 2416 Block *bb;
2417 2417
2418 2418 // Walk over all the basic blocks in reverse order
2419 2419 for( int i=_cfg->_num_blocks-1; i >= 0; succ_bb = bb, i-- ) {
2420 2420 bb = _cfg->_blocks[i];
2421 2421
2422 2422 #ifndef PRODUCT
2423 2423 if (_cfg->C->trace_opto_output()) {
2424 2424 tty->print("# Schedule BB#%03d (initial)\n", i);
2425 2425 for (uint j = 0; j < bb->_nodes.size(); j++)
2426 2426 bb->_nodes[j]->dump();
2427 2427 }
2428 2428 #endif
2429 2429
2430 2430 // On the head node, skip processing
2431 2431 if( bb == _cfg->_broot )
2432 2432 continue;
2433 2433
2434 2434 // Skip empty, connector blocks
2435 2435 if (bb->is_connector())
2436 2436 continue;
2437 2437
2438 2438 // If the following block is not the sole successor of
2439 2439 // this one, then reset the pipeline information
2440 2440 if (bb->_num_succs != 1 || bb->non_connector_successor(0) != succ_bb) {
2441 2441 #ifndef PRODUCT
2442 2442 if (_cfg->C->trace_opto_output()) {
2443 2443 tty->print("*** bundle start of next BB, node %d, for %d instructions\n",
2444 2444 _next_node->_idx, _bundle_instr_count);
2445 2445 }
2446 2446 #endif
2447 2447 step_and_clear();
2448 2448 }
2449 2449
2450 2450 // Leave untouched the starting instruction, any Phis, a CreateEx node
2451 2451 // or Top. bb->_nodes[_bb_start] is the first schedulable instruction.
2452 2452 _bb_end = bb->_nodes.size()-1;
2453 2453 for( _bb_start=1; _bb_start <= _bb_end; _bb_start++ ) {
2454 2454 Node *n = bb->_nodes[_bb_start];
2455 2455       // Things not matched, like PhiNodes and ProjNodes, don't get scheduled.
2456 2456 // Also, MachIdealNodes do not get scheduled
2457 2457 if( !n->is_Mach() ) continue; // Skip non-machine nodes
2458 2458 MachNode *mach = n->as_Mach();
2459 2459 int iop = mach->ideal_Opcode();
2460 2460 if( iop == Op_CreateEx ) continue; // CreateEx is pinned
2461 2461 if( iop == Op_Con ) continue; // Do not schedule Top
2462 2462 if( iop == Op_Node && // Do not schedule PhiNodes, ProjNodes
2463 2463 mach->pipeline() == MachNode::pipeline_class() &&
2464 2464 !n->is_SpillCopy() ) // Breakpoints, Prolog, etc
2465 2465 continue;
2466 2466 break; // Funny loop structure to be sure...
2467 2467 }
2468 2468 // Compute last "interesting" instruction in block - last instruction we
2469 2469 // might schedule. _bb_end points just after last schedulable inst. We
2470 2470 // normally schedule conditional branches (despite them being forced last
2471 2471 // in the block), because they have delay slots we can fill. Calls all
2472 2472 // have their delay slots filled in the template expansions, so we don't
2473 2473 // bother scheduling them.
2474 2474 Node *last = bb->_nodes[_bb_end];
2475 2475 // Ignore trailing NOPs.
2476 2476 while (_bb_end > 0 && last->is_Mach() &&
2477 2477 last->as_Mach()->ideal_Opcode() == Op_Con) {
2478 2478 last = bb->_nodes[--_bb_end];
2479 2479 }
2480 2480 assert(!last->is_Mach() || last->as_Mach()->ideal_Opcode() != Op_Con, "");
2481 2481 if( last->is_Catch() ||
2482 2482 // Exclude unreachable path case when Halt node is in a separate block.
2483 2483 (_bb_end > 1 && last->is_Mach() && last->as_Mach()->ideal_Opcode() == Op_Halt) ) {
2484 2484 // There must be a prior call. Skip it.
2485 2485 while( !bb->_nodes[--_bb_end]->is_MachCall() ) {
2486 2486 assert( bb->_nodes[_bb_end]->is_MachProj(), "skipping projections after expected call" );
2487 2487 }
2488 2488 } else if( last->is_MachNullCheck() ) {
2489 2489       // Back up so the last null-checked memory instruction is
2490 2490 // outside the schedulable range. Skip over the nullcheck,
2491 2491 // projection, and the memory nodes.
2492 2492 Node *mem = last->in(1);
2493 2493 do {
2494 2494 _bb_end--;
2495 2495 } while (mem != bb->_nodes[_bb_end]);
2496 2496 } else {
2497 2497 // Set _bb_end to point after last schedulable inst.
2498 2498 _bb_end++;
2499 2499 }
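              // At this point [_bb_start, _bb_end) brackets exactly the
              // instructions this pass is free to reorder.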
2500 2500
2501 2501 assert( _bb_start <= _bb_end, "inverted block ends" );
2502 2502
2503 2503 // Compute the register antidependencies for the basic block
2504 2504 ComputeRegisterAntidependencies(bb);
2505 2505 if (_cfg->C->failing()) return; // too many D-U pinch points
2506 2506
2507 2507 // Compute intra-bb latencies for the nodes
2508 2508 ComputeLocalLatenciesForward(bb);
2509 2509
2510 2510 // Compute the usage within the block, and set the list of all nodes
2511 2511 // in the block that have no uses within the block.
2512 2512 ComputeUseCount(bb);
2513 2513
2514 2514 // Schedule the remaining instructions in the block
2515 2515 while ( _available.size() > 0 ) {
2516 2516 Node *n = ChooseNodeToBundle();
2517 2517 AddNodeToBundle(n,bb);
2518 2518 }
2519 2519
2520 2520 assert( _scheduled.size() == _bb_end - _bb_start, "wrong number of instructions" );
2521 2521 #ifdef ASSERT
2522 2522 for( uint l = _bb_start; l < _bb_end; l++ ) {
2523 2523 Node *n = bb->_nodes[l];
2524 2524 uint m;
2525 2525 for( m = 0; m < _bb_end-_bb_start; m++ )
2526 2526 if( _scheduled[m] == n )
2527 2527 break;
2528 2528 assert( m < _bb_end-_bb_start, "instruction missing in schedule" );
2529 2529 }
2530 2530 #endif
2531 2531
2532 2532 // Now copy the instructions (in reverse order) back to the block
2533 2533 for ( uint k = _bb_start; k < _bb_end; k++ )
2534 2534 bb->_nodes.map(k, _scheduled[_bb_end-k-1]);
2535 2535
2536 2536 #ifndef PRODUCT
2537 2537 if (_cfg->C->trace_opto_output()) {
2538 2538 tty->print("# Schedule BB#%03d (final)\n", i);
2539 2539 uint current = 0;
2540 2540 for (uint j = 0; j < bb->_nodes.size(); j++) {
2541 2541 Node *n = bb->_nodes[j];
2542 2542 if( valid_bundle_info(n) ) {
2543 2543 Bundle *bundle = node_bundling(n);
2544 2544 if (bundle->instr_count() > 0 || bundle->flags() > 0) {
2545 2545 tty->print("*** Bundle: ");
2546 2546 bundle->dump();
2547 2547 }
2548 2548 n->dump();
2549 2549 }
2550 2550 }
2551 2551 }
2552 2552 #endif
2553 2553 #ifdef ASSERT
2554 2554 verify_good_schedule(bb,"after block local scheduling");
2555 2555 #endif
2556 2556 }
2557 2557
2558 2558 #ifndef PRODUCT
2559 2559 if (_cfg->C->trace_opto_output())
2560 2560 tty->print("# <- DoScheduling\n");
2561 2561 #endif
2562 2562
2563 2563 // Record final node-bundling array location
2564 2564 _regalloc->C->set_node_bundling_base(_node_bundling_base);
2565 2565
2566 2566 } // end DoScheduling
2567 2567
2568 2568 //------------------------------verify_good_schedule---------------------------
2569 2569 // Verify that no live-range used in the block is killed in the block by a
2570 2570 // wrong DEF. This doesn't verify live-ranges that span blocks.
2571 2571
2572 2572 // Check for edge existence. Used to avoid adding redundant precedence edges.
2573 2573 static bool edge_from_to( Node *from, Node *to ) {
2574 2574 for( uint i=0; i<from->len(); i++ )
2575 2575 if( from->in(i) == to )
2576 2576 return true;
2577 2577 return false;
2578 2578 }
2579 2579
2580 2580 #ifdef ASSERT
2581 2581 //------------------------------verify_do_def----------------------------------
2582 2582 void Scheduling::verify_do_def( Node *n, OptoReg::Name def, const char *msg ) {
2583 2583 // Check for bad kills
2584 2584 if( OptoReg::is_valid(def) ) { // Ignore stores & control flow
2585 2585 Node *prior_use = _reg_node[def];
2586 2586 if( prior_use && !edge_from_to(prior_use,n) ) {
2587 2587 tty->print("%s = ",OptoReg::as_VMReg(def)->name());
2588 2588 n->dump();
2589 2589 tty->print_cr("...");
2590 2590 prior_use->dump();
2591 2591 assert(edge_from_to(prior_use,n),msg);
2592 2592 }
2593 2593 _reg_node.map(def,NULL); // Kill live USEs
2594 2594 }
2595 2595 }
2596 2596
2597 2597 //------------------------------verify_good_schedule---------------------------
2598 2598 void Scheduling::verify_good_schedule( Block *b, const char *msg ) {
2599 2599
2600 2600 // Zap to something reasonable for the verify code
2601 2601 _reg_node.clear();
2602 2602
2603 2603 // Walk over the block backwards. Check to make sure each DEF doesn't
2604 2604 // kill a live value (other than the one it's supposed to). Add each
2605 2605 // USE to the live set.
2606 2606 for( uint i = b->_nodes.size()-1; i >= _bb_start; i-- ) {
2607 2607 Node *n = b->_nodes[i];
2608 2608 int n_op = n->Opcode();
2609 2609 if( n_op == Op_MachProj && n->ideal_reg() == MachProjNode::fat_proj ) {
2610 2610 // Fat-proj kills a slew of registers
2611 2611 RegMask rm = n->out_RegMask();// Make local copy
2612 2612 while( rm.is_NotEmpty() ) {
2613 2613 OptoReg::Name kill = rm.find_first_elem();
2614 2614 rm.Remove(kill);
2615 2615 verify_do_def( n, kill, msg );
2616 2616 }
2617 2617 } else if( n_op != Op_Node ) { // Avoid brand new antidependence nodes
2618 2618 // Get DEF'd registers the normal way
2619 2619 verify_do_def( n, _regalloc->get_reg_first(n), msg );
2620 2620 verify_do_def( n, _regalloc->get_reg_second(n), msg );
2621 2621 }
2622 2622
2623 2623 // Now make all USEs live
2624 2624 for( uint i=1; i<n->req(); i++ ) {
2625 2625 Node *def = n->in(i);
2626 2626 assert(def != 0, "input edge required");
2627 2627 OptoReg::Name reg_lo = _regalloc->get_reg_first(def);
2628 2628 OptoReg::Name reg_hi = _regalloc->get_reg_second(def);
2629 2629 if( OptoReg::is_valid(reg_lo) ) {
2630 2630 assert(!_reg_node[reg_lo] || edge_from_to(_reg_node[reg_lo],def), msg);
2631 2631 _reg_node.map(reg_lo,n);
2632 2632 }
2633 2633 if( OptoReg::is_valid(reg_hi) ) {
2634 2634 assert(!_reg_node[reg_hi] || edge_from_to(_reg_node[reg_hi],def), msg);
2635 2635 _reg_node.map(reg_hi,n);
2636 2636 }
2637 2637 }
2638 2638
2639 2639 }
2640 2640
2641 2641 // Zap to something reasonable for the Antidependence code
2642 2642 _reg_node.clear();
2643 2643 }
2644 2644 #endif
2645 2645
2646 2646 // Conditionally add precedence edges. Avoid putting edges on Projs.
2647 2647 static void add_prec_edge_from_to( Node *from, Node *to ) {
2648 2648 if( from->is_Proj() ) { // Put precedence edge on Proj's input
2649 2649 assert( from->req() == 1 && (from->len() == 1 || from->in(1)==0), "no precedence edges on projections" );
2650 2650 from = from->in(0);
2651 2651 }
2652 2652 if( from != to && // No cycles (for things like LD L0,[L0+4] )
2653 2653 !edge_from_to( from, to ) ) // Avoid duplicate edge
2654 2654 from->add_prec(to);
2655 2655 }
2656 2656
2657 2657 //------------------------------anti_do_def------------------------------------
2658 2658 void Scheduling::anti_do_def( Block *b, Node *def, OptoReg::Name def_reg, int is_def ) {
2659 2659 if( !OptoReg::is_valid(def_reg) ) // Ignore stores & control flow
2660 2660 return;
2661 2661
2662 2662 Node *pinch = _reg_node[def_reg]; // Get pinch point
2663 2663 if( !pinch || _bbs[pinch->_idx] != b || // No pinch-point yet?
2664 2664 is_def ) { // Check for a true def (not a kill)
2665 2665 _reg_node.map(def_reg,def); // Record def/kill as the optimistic pinch-point
2666 2666 return;
2667 2667 }
2668 2668
2669 2669 Node *kill = def; // Rename 'def' to more descriptive 'kill'
2670 2670 debug_only( def = (Node*)0xdeadbeef; )
2671 2671
2672 2672 // After some number of kills there _may_ be a later def
2673 2673 Node *later_def = NULL;
2674 2674
2675 2675 // Finding a kill requires a real pinch-point.
2676 2676 // Check for not already having a pinch-point.
2677 2677 // Pinch points are Op_Node's.
2678 2678 if( pinch->Opcode() != Op_Node ) { // Or later-def/kill as pinch-point?
2679 2679 later_def = pinch; // Must be def/kill as optimistic pinch-point
2680 2680 if ( _pinch_free_list.size() > 0) {
2681 2681 pinch = _pinch_free_list.pop();
2682 2682 } else {
2683 2683 pinch = new (_cfg->C) Node(1); // Pinch point to-be
2684 2684 }
2685 2685 if (pinch->_idx >= _regalloc->node_regs_max_index()) {
2686 2686 _cfg->C->record_method_not_compilable("too many D-U pinch points");
2687 2687 return;
2688 2688 }
2689 2689 _bbs.map(pinch->_idx,b); // Pretend it's valid in this block (lazy init)
2690 2690 _reg_node.map(def_reg,pinch); // Record pinch-point
2691 2691 //_regalloc->set_bad(pinch->_idx); // Already initialized this way.
2692 2692 if( later_def->outcnt() == 0 || later_def->ideal_reg() == MachProjNode::fat_proj ) { // Distinguish def from kill
2693 2693 pinch->init_req(0, _cfg->C->top()); // set not NULL for the next call
2694 2694 add_prec_edge_from_to(later_def,pinch); // Add edge from kill to pinch
2695 2695 later_def = NULL; // and no later def
2696 2696 }
2697 2697 pinch->set_req(0,later_def); // Hook later def so we can find it
2698 2698 } else { // Else have valid pinch point
2699 2699 if( pinch->in(0) ) // If there is a later-def
2700 2700 later_def = pinch->in(0); // Get it
2701 2701 }
2702 2702
2703 2703 // Add output-dependence edge from later def to kill
2704 2704 if( later_def ) // If there is some original def
2705 2705 add_prec_edge_from_to(later_def,kill); // Add edge from def to kill
2706 2706
2707 2707 // See if current kill is also a use, and so is forced to be the pinch-point.
2708 2708 if( pinch->Opcode() == Op_Node ) {
2709 2709 Node *uses = kill->is_Proj() ? kill->in(0) : kill;
2710 2710 for( uint i=1; i<uses->req(); i++ ) {
2711 2711 if( _regalloc->get_reg_first(uses->in(i)) == def_reg ||
2712 2712 _regalloc->get_reg_second(uses->in(i)) == def_reg ) {
2713 2713 // Yes, found a use/kill pinch-point
2714 2714 pinch->set_req(0,NULL); //
2715 2715 pinch->replace_by(kill); // Move anti-dep edges up
2716 2716 pinch = kill;
2717 2717 _reg_node.map(def_reg,pinch);
2718 2718 return;
2719 2719 }
2720 2720 }
2721 2721 }
2722 2722
2723 2723 // Add edge from kill to pinch-point
2724 2724 add_prec_edge_from_to(kill,pinch);
2725 2725 }
2726 2726
2727 2727 //------------------------------anti_do_use------------------------------------
2728 2728 void Scheduling::anti_do_use( Block *b, Node *use, OptoReg::Name use_reg ) {
2729 2729 if( !OptoReg::is_valid(use_reg) ) // Ignore stores & control flow
2730 2730 return;
2731 2731 Node *pinch = _reg_node[use_reg]; // Get pinch point
2732 2732 // Check for no later def_reg/kill in block
2733 2733 if( pinch && _bbs[pinch->_idx] == b &&
2734 2734 // Use has to be block-local as well
2735 2735 _bbs[use->_idx] == b ) {
2736 2736 if( pinch->Opcode() == Op_Node && // Real pinch-point (not optimistic?)
2737 2737 pinch->req() == 1 ) { // pinch not yet in block?
2738 2738 pinch->del_req(0); // yank pointer to later-def, also set flag
2739 2739 // Insert the pinch-point in the block just after the last use
2740 2740 b->_nodes.insert(b->find_node(use)+1,pinch);
2741 2741 _bb_end++; // Increase size scheduled region in block
2742 2742 }
2743 2743
2744 2744 add_prec_edge_from_to(pinch,use);
2745 2745 }
2746 2746 }
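// Note (illustrative): a pinch node starts life with req() == 1, its single
// input slot holding the later-def hook (or top).  anti_do_use() calls
// del_req(0), dropping req() to 0; this both discards the hook and doubles
// as the flag that the pinch has already been inserted into the block just
// after its last use.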
2747 2747
2748 2748 //------------------------------ComputeRegisterAntidependences-----------------
2749 2749 // We insert antidependences between the reads and following write of
2750 2750 // allocated registers to prevent illegal code motion.  The number of
2751 2751 // added edges should be fairly small, especially as we only add edges
2752 2752 // within the current basic block.
2753 2753 void Scheduling::ComputeRegisterAntidependencies(Block *b) {
2754 2754
2755 2755 #ifdef ASSERT
2756 2756 verify_good_schedule(b,"before block local scheduling");
2757 2757 #endif
2758 2758
2759 2759 // A valid schedule, for each register independently, is an endless cycle
2760 2760 // of: a def, then some uses (connected to the def by true dependencies),
2761 2761 // then some kills (defs with no uses), finally the cycle repeats with a new
2762 2762 // def. The uses are allowed to float relative to each other, as are the
2763 2763 // kills. No use is allowed to slide past a kill (or def). This requires
2764 2764 // antidependencies between all uses of a single def and all kills that
2765 2765 // follow, up to the next def. More edges are redundant, because later defs
2766 2766 // & kills are already serialized with true or antidependencies. To keep
2767 2767 // the edge count down, we add a 'pinch point' node if there's more than
2768 2768 // one use or more than one kill/def.
2769 2769
2770 2770 // We add dependencies in one bottom-up pass.
2771 2771
2772 2772 // For each instruction we handle its DEFs/KILLs, then its USEs.
2773 2773
2774 2774 // For each DEF/KILL, we check to see if there's a prior DEF/KILL for this
2775 2775 // register. If not, we record the DEF/KILL in _reg_node, the
2776 2776 // register-to-def mapping. If there is a prior DEF/KILL, we insert a
2777 2777 // "pinch point", a new Node that's in the graph but not in the block.
2778 2778 // We put edges from the prior and current DEF/KILLs to the pinch point.
2779 2779 // We put the pinch point in _reg_node. If there's already a pinch point
2780 2780 // we merely add an edge from the current DEF/KILL to the pinch point.
2781 2781
2782 2782 // After doing the DEF/KILLs, we handle USEs. For each used register, we
2783 2783 // put an edge from the pinch point to the USE.
2784 2784
2785 2785 // To be expedient, the _reg_node array is pre-allocated for the whole
2786 2786 // compilation. _reg_node is lazily initialized; it either contains a NULL,
2787 2787 // or a valid def/kill/pinch-point, or a leftover node from some prior
2788 2788 // block.  A leftover node from a prior block is treated like a NULL (no
2789 2789 // prior def, so no anti-dependence is needed).  A valid def is
2790 2790 // distinguished by being in the current block.
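// A sketch of the lazy-validity test this implies (assumed form, matching
// the checks in anti_do_def/anti_do_use above):
//
//   Node* pinch = _reg_node[def_reg];
//   bool valid = (pinch != NULL && _bbs[pinch->_idx] == b);
//
// A stale entry from an earlier block fails the _bbs test and is simply
// overwritten, so _reg_node never needs to be cleared between blocks.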
2791 2791 bool fat_proj_seen = false;
2792 2792 uint last_safept = _bb_end-1;
2793 2793 Node* end_node = (_bb_end-1 >= _bb_start) ? b->_nodes[last_safept] : NULL;
2794 2794 Node* last_safept_node = end_node;
2795 2795 for( uint i = _bb_end-1; i >= _bb_start; i-- ) {
2796 2796 Node *n = b->_nodes[i];
2797 2797 int is_def = n->outcnt(); // a def if it has uses, sampled before precedence edges are added
2798 2798 if( n->is_MachProj() && n->ideal_reg() == MachProjNode::fat_proj ) {
2799 2799 // Fat-proj kills a slew of registers
2800 2800 // This can add edges to 'n' and obscure whether or not it was a def,
2801 2801 // hence the is_def flag.
2802 2802 fat_proj_seen = true;
2803 2803 RegMask rm = n->out_RegMask();// Make local copy
2804 2804 while( rm.is_NotEmpty() ) {
2805 2805 OptoReg::Name kill = rm.find_first_elem();
2806 2806 rm.Remove(kill);
2807 2807 anti_do_def( b, n, kill, is_def );
2808 2808 }
2809 2809 } else {
2810 2810 // Get DEF'd registers the normal way
2811 2811 anti_do_def( b, n, _regalloc->get_reg_first(n), is_def );
2812 2812 anti_do_def( b, n, _regalloc->get_reg_second(n), is_def );
2813 2813 }
2814 2814
2815 2815 // Kill projections on a branch should appear to occur on the
2816 2816 // branch, not afterwards, so grab the masks from the projections
2817 2817 // and process them.
2818 2818 if (n->is_MachBranch() || (n->is_Mach() && n->as_Mach()->ideal_Opcode() == Op_Jump)) {
2819 2819 for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2820 2820 Node* use = n->fast_out(i);
2821 2821 if (use->is_Proj()) {
2822 2822 RegMask rm = use->out_RegMask();// Make local copy
2823 2823 while( rm.is_NotEmpty() ) {
2824 2824 OptoReg::Name kill = rm.find_first_elem();
2825 2825 rm.Remove(kill);
2826 2826 anti_do_def( b, n, kill, false );
2827 2827 }
2828 2828 }
2829 2829 }
2830 2830 }
2831 2831
2832 2832 // Check each register used by this instruction for a following DEF/KILL
2833 2833 // that must occur afterward and requires an anti-dependence edge.
2834 2834 for( uint j=0; j<n->req(); j++ ) {
2835 2835 Node *def = n->in(j);
2836 2836 if( def ) {
2837 2837 assert( !def->is_MachProj() || def->ideal_reg() != MachProjNode::fat_proj, "" );
2838 2838 anti_do_use( b, n, _regalloc->get_reg_first(def) );
2839 2839 anti_do_use( b, n, _regalloc->get_reg_second(def) );
2840 2840 }
2841 2841 }
2842 2842 // Do not allow defs of new derived values to float above GC
2843 2843 // points unless the base is definitely available at the GC point.
2844 2844
2845 2845 Node *m = b->_nodes[i];
2846 2846
2847 2847 // Add precedence edge from following safepoint to use of derived pointer
2848 2848 if( last_safept_node != end_node &&
2849 2849 m != last_safept_node) {
2850 2850 for (uint k = 1; k < m->req(); k++) {
2851 2851 const Type *t = m->in(k)->bottom_type();
2852 2852 if( t->isa_oop_ptr() &&
2853 2853 t->is_ptr()->offset() != 0 ) {
2854 2854 last_safept_node->add_prec( m );
2855 2855 break;
2856 2856 }
2857 2857 }
2858 2858 }
2859 2859
2860 2860 if( n->jvms() ) { // Precedence edge from derived to safept
2861 2861 // Check if last_safept_node was moved by pinch-point insertion in anti_do_use()
2862 2862 if( b->_nodes[last_safept] != last_safept_node ) {
2863 2863 last_safept = b->find_node(last_safept_node);
2864 2864 }
2865 2865 for( uint j=last_safept; j > i; j-- ) {
2866 2866 Node *mach = b->_nodes[j];
2867 2867 if( mach->is_Mach() && mach->as_Mach()->ideal_Opcode() == Op_AddP )
2868 2868 mach->add_prec( n );
2869 2869 }
2870 2870 last_safept = i;
2871 2871 last_safept_node = m;
2872 2872 }
2873 2873 }
2874 2874
2875 2875 if (fat_proj_seen) {
2876 2876 // Garbage collect pinch nodes that were not consumed.
2877 2877 // They are usually created by a fat kill MachProj for a call.
2878 2878 garbage_collect_pinch_nodes();
2879 2879 }
2880 2880 }
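// Worked example (hypothetical block, not part of the patch) for the
// derived-pointer guards above.  Given, in schedule order,
//
//   s1: SafePoint
//   a : p = AddP(base, off)    // derived oop, offset != 0
//   u : some use of p
//   s2: SafePoint              // not the block-ending node
//
// the bottom-up scan performs s2->add_prec(u) when it visits u, so the use
// of the derived pointer cannot sink below the following safepoint, and
// a->add_prec(s1) when it visits s1, so the derived-pointer def cannot
// float above the preceding GC point.  Base and derived pointer thus stay
// paired across each safepoint where the oop map must record them.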
2881 2881
2882 2882 //------------------------------garbage_collect_pinch_nodes-------------------------------
2883 2883
2884 2884 // Garbage collect pinch nodes for reuse by other blocks.
2885 2885 //
2886 2886 // The block scheduler's insertion of anti-dependence
2887 2887 // edges creates many pinch nodes when the block contains
2888 2888 // 2 or more Calls. A pinch node is used to prevent a
2889 2889 // combinatorial explosion of edges. If a set of kills for a
2890 2890 // register is anti-dependent on a set of uses (or defs), rather
2891 2891 // than adding an edge in the graph between each pair of kill
2892 2892 // and use (or def), a pinch is inserted between them:
2893 2893 //
2894 2894 // use1 use2 use3
2895 2895 // \ | /
2896 2896 // \ | /
2897 2897 // pinch
2898 2898 // / | \
2899 2899 // / | \
2900 2900 // kill1 kill2 kill3
2901 2901 //
2902 2902 // One pinch node is created per register killed when
2903 2903 // the second call is encountered during a backwards pass
2904 2904 // over the block. Most of these pinch nodes are never
2905 2905 // wired into the graph because the register is never
2906 2906 // used or def'ed in the block.
2907 2907 //
2908 2908 void Scheduling::garbage_collect_pinch_nodes() {
2909 2909 #ifndef PRODUCT
2910 2910 if (_cfg->C->trace_opto_output()) tty->print("Reclaimed pinch nodes:");
2911 2911 #endif
2912 2912 int trace_cnt = 0;
2913 2913 for (uint k = 0; k < _reg_node.Size(); k++) {
2914 2914 Node* pinch = _reg_node[k];
2915 2915 if (pinch != NULL && pinch->Opcode() == Op_Node &&
2916 2916 // no precedence input edges
2917 2917 (pinch->req() == pinch->len() || pinch->in(pinch->req()) == NULL) ) {
2918 2918 cleanup_pinch(pinch);
2919 2919 _pinch_free_list.push(pinch);
2920 2920 _reg_node.map(k, NULL);
2921 2921 #ifndef PRODUCT
2922 2922 if (_cfg->C->trace_opto_output()) {
2923 2923 trace_cnt++;
2924 2924 if (trace_cnt > 40) {
2925 2925 tty->print("\n");
2926 2926 trace_cnt = 0;
2927 2927 }
2928 2928 tty->print(" %d", pinch->_idx);
2929 2929 }
2930 2930 #endif
2931 2931 }
2932 2932 }
2933 2933 #ifndef PRODUCT
2934 2934 if (_cfg->C->trace_opto_output()) tty->print("\n");
2935 2935 #endif
2936 2936 }
2937 2937
2938 2938 // Clean up a pinch node for reuse.
2939 2939 void Scheduling::cleanup_pinch( Node *pinch ) {
2940 2940 assert(pinch && pinch->Opcode() == Op_Node && pinch->req() == 1, "just checking");
2941 2941
2942 2942 for (DUIterator_Last imin, i = pinch->last_outs(imin); i >= imin; ) {
2943 2943 Node* use = pinch->last_out(i);
2944 2944 uint uses_found = 0;
2945 2945 for (uint j = use->req(); j < use->len(); j++) {
2946 2946 if (use->in(j) == pinch) {
2947 2947 use->rm_prec(j);
2948 2948 uses_found++;
2949 2949 }
2950 2950 }
2951 2951 assert(uses_found > 0, "must be a precedence edge");
2952 2952 i -= uses_found; // we deleted 1 or more copies of this edge
2953 2953 }
2954 2954 // May have a later_def entry
2955 2955 pinch->set_req(0, NULL);
2956 2956 }
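// Note (illustrative): precedence edges occupy the input slots [req(), len())
// of a node, which is why the scan above starts at use->req().  The
// DUIterator_Last index is rewound by uses_found because rm_prec() may have
// removed several duplicate copies of the same out-edge from one user.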
2957 2957
2958 2958 //------------------------------print_statistics-------------------------------
2959 2959 #ifndef PRODUCT
2960 2960
2961 2961 void Scheduling::dump_available() const {
2962 2962 tty->print("#Availist ");
2963 2963 for (uint i = 0; i < _available.size(); i++)
2964 2964 tty->print(" N%d/l%d", _available[i]->_idx,_current_latency[_available[i]->_idx]);
2965 2965 tty->cr();
2966 2966 }
2967 2967
2968 2968 // Print Scheduling Statistics
2969 2969 void Scheduling::print_statistics() {
2970 2970 // Print the size added by nops for bundling
2971 2971 tty->print("Nops added %d bytes to total of %d bytes",
2972 2972 _total_nop_size, _total_method_size);
2973 2973 if (_total_method_size > 0)
2974 2974 tty->print(", for %.2f%%",
2975 2975 ((double)_total_nop_size) / ((double) _total_method_size) * 100.0);
2976 2976 tty->print("\n");
2977 2977
2978 2978 // Print the number of branch shadows filled
2979 2979 if (Pipeline::_branch_has_delay_slot) {
2980 2980 tty->print("Of %d branches, %d had unconditional delay slots filled",
2981 2981 _total_branches, _total_unconditional_delays);
2982 2982 if (_total_branches > 0)
2983 2983 tty->print(", for %.2f%%",
2984 2984 ((double)_total_unconditional_delays) / ((double)_total_branches) * 100.0);
2985 2985 tty->print("\n");
2986 2986 }
2987 2987
2988 2988 uint total_instructions = 0, total_bundles = 0;
2989 2989
2990 2990 for (uint i = 1; i <= Pipeline::_max_instrs_per_cycle; i++) {
2991 2991 uint bundle_count = _total_instructions_per_bundle[i];
2992 2992 total_instructions += bundle_count * i;
2993 2993 total_bundles += bundle_count;
2994 2994 }
2995 2995
2996 2996 if (total_bundles > 0)
2997 2997 tty->print("Average ILP (excluding nops) is %.2f\n",
2998 2998 ((double)total_instructions) / ((double)total_bundles));
2999 2999 }
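// Worked example (hypothetical counts): if the histogram recorded 10
// single-instruction bundles and 5 two-instruction bundles, then
//   total_instructions = 10*1 + 5*2 = 20,  total_bundles = 15,
// and the reported average ILP is 20/15 ~= 1.33.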
3000 3000 #endif