--- old/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ new/src/cpu/sparc/vm/stubGenerator_sparc.cpp
1 1 /*
2 - * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
2 + * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 4 *
5 5 * This code is free software; you can redistribute it and/or modify it
6 6 * under the terms of the GNU General Public License version 2 only, as
7 7 * published by the Free Software Foundation.
8 8 *
9 9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 12 * version 2 for more details (a copy is included in the LICENSE file that
13 13 * accompanied this code).
14 14 *
15 15 * You should have received a copy of the GNU General Public License version
16 16 * 2 along with this work; if not, write to the Free Software Foundation,
17 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 18 *
19 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 20 * or visit www.oracle.com if you need additional information or have any
21 21 * questions.
22 22 *
23 23 */
24 24
25 25 #include "precompiled.hpp"
26 26 #include "asm/assembler.hpp"
27 27 #include "assembler_sparc.inline.hpp"
28 28 #include "interpreter/interpreter.hpp"
29 29 #include "nativeInst_sparc.hpp"
30 30 #include "oops/instanceOop.hpp"
31 31 #include "oops/methodOop.hpp"
32 32 #include "oops/objArrayKlass.hpp"
33 33 #include "oops/oop.inline.hpp"
34 34 #include "prims/methodHandles.hpp"
35 35 #include "runtime/frame.inline.hpp"
36 36 #include "runtime/handles.inline.hpp"
37 37 #include "runtime/sharedRuntime.hpp"
38 38 #include "runtime/stubCodeGenerator.hpp"
39 39 #include "runtime/stubRoutines.hpp"
40 40 #include "utilities/top.hpp"
41 41 #ifdef TARGET_OS_FAMILY_linux
42 42 # include "thread_linux.inline.hpp"
43 43 #endif
44 44 #ifdef TARGET_OS_FAMILY_solaris
45 45 # include "thread_solaris.inline.hpp"
46 46 #endif
47 47 #ifdef COMPILER2
48 48 #include "opto/runtime.hpp"
49 49 #endif
50 50
51 51 // Declaration and definition of StubGenerator (no .hpp file).
52 52 // For a more detailed description of the stub routine structure
53 53 // see the comment in stubRoutines.hpp.
54 54
55 55 #define __ _masm->
56 56
57 57 #ifdef PRODUCT
58 58 #define BLOCK_COMMENT(str) /* nothing */
59 59 #else
60 60 #define BLOCK_COMMENT(str) __ block_comment(str)
61 61 #endif
62 62
63 63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
64 64
65 65 // Note: The register L7 is used as L7_thread_cache, and may not be used
66 66 // any other way within this module.
67 67
68 68
69 69 static const Register& Lstub_temp = L2;
70 70
71 71 // -------------------------------------------------------------------------------------------------------------------------
72 72 // Stub Code definitions
73 73
74 74 static address handle_unsafe_access() {
75 75 JavaThread* thread = JavaThread::current();
76 76 address pc = thread->saved_exception_pc();
77 77 address npc = thread->saved_exception_npc();
78 78 // pc is the instruction which we must emulate
79 79 // doing a no-op is fine: return garbage from the load
80 80
81 81 // request an async exception
82 82 thread->set_pending_unsafe_access_error();
83 83
84 84 // return address of next instruction to execute
85 85 return npc;
86 86 }
87 87
88 88 class StubGenerator: public StubCodeGenerator {
89 89 private:
90 90
91 91 #ifdef PRODUCT
92 92 #define inc_counter_np(a,b,c) (0)
93 93 #else
94 94 #define inc_counter_np(counter, t1, t2) \
95 95 BLOCK_COMMENT("inc_counter " #counter); \
96 96 __ inc_counter(&counter, t1, t2);
97 97 #endif
98 98
99 99 //----------------------------------------------------------------------------------------------------
100 100 // Call stubs are used to call Java from C
101 101
102 102 address generate_call_stub(address& return_pc) {
103 103 StubCodeMark mark(this, "StubRoutines", "call_stub");
104 104 address start = __ pc();
105 105
106 106 // Incoming arguments:
107 107 //
108 108 // o0 : call wrapper address
109 109 // o1 : result (address)
110 110 // o2 : result type
111 111 // o3 : method
112 112 // o4 : (interpreter) entry point
113 113 // o5 : parameters (address)
114 114 // [sp + 0x5c]: parameter size (in words)
115 115 // [sp + 0x60]: thread
116 116 //
117 117 // +---------------+ <--- sp + 0
118 118 // | |
119 119 // . reg save area .
120 120 // | |
121 121 // +---------------+ <--- sp + 0x40
122 122 // | |
123 123 // . extra 7 slots .
124 124 // | |
125 125 // +---------------+ <--- sp + 0x5c
126 126 // | param. size |
127 127 // +---------------+ <--- sp + 0x60
128 128 // | thread |
129 129 // +---------------+
130 130 // | |
131 131
132 132 // note: if the link argument position changes, adjust
133 133 // the code in frame::entry_frame_call_wrapper()
134 134
135 135 const Argument link = Argument(0, false); // used only for GC
136 136 const Argument result = Argument(1, false);
137 137 const Argument result_type = Argument(2, false);
138 138 const Argument method = Argument(3, false);
139 139 const Argument entry_point = Argument(4, false);
140 140 const Argument parameters = Argument(5, false);
141 141 const Argument parameter_size = Argument(6, false);
142 142 const Argument thread = Argument(7, false);
143 143
144 144 // setup thread register
145 145 __ ld_ptr(thread.as_address(), G2_thread);
146 146 __ reinit_heapbase();
147 147
148 148 #ifdef ASSERT
149 149 // make sure we have no pending exceptions
150 150 { const Register t = G3_scratch;
151 151 Label L;
152 152 __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
153 153 __ br_null(t, false, Assembler::pt, L);
154 154 __ delayed()->nop();
155 155 __ stop("StubRoutines::call_stub: entered with pending exception");
156 156 __ bind(L);
157 157 }
158 158 #endif
159 159
160 160 // create activation frame & allocate space for parameters
161 161 { const Register t = G3_scratch;
162 162 __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words)
163 163 __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
164 164 __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words)
165 165 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
166 166 __ neg(t); // negate so it can be used with save
167 167 __ save(SP, t, SP); // setup new frame
168 168 }
169 169
170 170 // +---------------+ <--- sp + 0
171 171 // | |
172 172 // . reg save area .
173 173 // | |
174 174 // +---------------+ <--- sp + 0x40
175 175 // | |
176 176 // . extra 7 slots .
177 177 // | |
178 178 // +---------------+ <--- sp + 0x5c
179 179 // | empty slot | (only if parameter size is even)
180 180 // +---------------+
181 181 // | |
182 182 // . parameters .
183 183 // | |
184 184 // +---------------+ <--- fp + 0
185 185 // | |
186 186 // . reg save area .
187 187 // | |
188 188 // +---------------+ <--- fp + 0x40
189 189 // | |
190 190 // . extra 7 slots .
191 191 // | |
192 192 // +---------------+ <--- fp + 0x5c
193 193 // | param. size |
194 194 // +---------------+ <--- fp + 0x60
195 195 // | thread |
196 196 // +---------------+
197 197 // | |
198 198
199 199 // pass parameters if any
200 200 BLOCK_COMMENT("pass parameters if any");
201 201 { const Register src = parameters.as_in().as_register();
202 202 const Register dst = Lentry_args;
203 203 const Register tmp = G3_scratch;
204 204 const Register cnt = G4_scratch;
205 205
206 206 // test if any parameters & setup of Lentry_args
207 207 Label exit;
208 208 __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
209 209 __ add( FP, STACK_BIAS, dst );
210 210 __ tst(cnt);
211 211 __ br(Assembler::zero, false, Assembler::pn, exit);
212 212 __ delayed()->sub(dst, BytesPerWord, dst); // setup Lentry_args
213 213
214 214 // copy parameters if any
215 215 Label loop;
216 216 __ BIND(loop);
217 217 // Store parameter value
218 218 __ ld_ptr(src, 0, tmp);
219 219 __ add(src, BytesPerWord, src);
220 220 __ st_ptr(tmp, dst, 0);
221 221 __ deccc(cnt);
222 222 __ br(Assembler::greater, false, Assembler::pt, loop);
223 223 __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
224 224
225 225 // done
226 226 __ BIND(exit);
227 227 }
228 228
229 229 // setup parameters, method & call Java function
230 230 #ifdef ASSERT
231 231     // layout_activation_impl checks its notion of saved SP against
232 232     // this register, so if this changes update it as well.
233 233 const Register saved_SP = Lscratch;
234 234 __ mov(SP, saved_SP); // keep track of SP before call
235 235 #endif
236 236
237 237 // setup parameters
238 238 const Register t = G3_scratch;
239 239 __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
240 240 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
241 241 __ sub(FP, t, Gargs); // setup parameter pointer
242 242 #ifdef _LP64
243 243 __ add( Gargs, STACK_BIAS, Gargs ); // Account for LP64 stack bias
244 244 #endif
245 245 __ mov(SP, O5_savedSP);
246 246
247 247
248 248 // do the call
249 249 //
250 250     // the following registers must be set up:
251 251 //
252 252 // G2_thread
253 253 // G5_method
254 254 // Gargs
255 255 BLOCK_COMMENT("call Java function");
256 256 __ jmpl(entry_point.as_in().as_register(), G0, O7);
257 257 __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method
258 258
259 259 BLOCK_COMMENT("call_stub_return_address:");
260 260 return_pc = __ pc();
261 261
262 262     // The callee, if it wasn't interpreted, can return with SP changed so
263 263     // we can no longer assert on the change of SP.
264 264
265 265 // store result depending on type
266 266 // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
267 267 // is treated as T_INT)
268 268 { const Register addr = result .as_in().as_register();
269 269 const Register type = result_type.as_in().as_register();
270 270 Label is_long, is_float, is_double, is_object, exit;
271 271 __ cmp(type, T_OBJECT); __ br(Assembler::equal, false, Assembler::pn, is_object);
272 272 __ delayed()->cmp(type, T_FLOAT); __ br(Assembler::equal, false, Assembler::pn, is_float);
273 273 __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double);
274 274 __ delayed()->cmp(type, T_LONG); __ br(Assembler::equal, false, Assembler::pn, is_long);
275 275 __ delayed()->nop();
276 276
277 277 // store int result
278 278 __ st(O0, addr, G0);
279 279
280 280 __ BIND(exit);
281 281 __ ret();
282 282 __ delayed()->restore();
283 283
284 284 __ BIND(is_object);
285 285 __ ba(false, exit);
286 286 __ delayed()->st_ptr(O0, addr, G0);
287 287
288 288 __ BIND(is_float);
289 289 __ ba(false, exit);
290 290 __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
291 291
292 292 __ BIND(is_double);
293 293 __ ba(false, exit);
294 294 __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
295 295
296 296 __ BIND(is_long);
297 297 #ifdef _LP64
298 298 __ ba(false, exit);
299 299 __ delayed()->st_long(O0, addr, G0); // store entire long
300 300 #else
301 301 #if defined(COMPILER2)
302 302 // All return values are where we want them, except for Longs. C2 returns
303 303 // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
304 304     // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
305 305     // build, we simply always use G1.
306 306     // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
307 307     // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
308 308     // first which would move g1 -> O0/O1 and destroy the exception we were throwing.
309 309
310 310 __ ba(false, exit);
311 311 __ delayed()->stx(G1, addr, G0); // store entire long
312 312 #else
313 313 __ st(O1, addr, BytesPerInt);
314 314 __ ba(false, exit);
315 315 __ delayed()->st(O0, addr, G0);
316 316 #endif /* COMPILER2 */
317 317 #endif /* _LP64 */
318 318 }
319 319 return start;
320 320 }
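
For orientation, here is a minimal C++ sketch of how the generated call stub looks from the C side. The actual typedef in stubRoutines.hpp uses HotSpot types (BasicType, methodOopDesc*, TRAPS); this sketch substitutes simple stand-ins. The six register arguments land in o0..o5 and the last two in the stack slots at sp + 0x5c and sp + 0x60, as laid out above.

    #include <stdint.h>
    typedef unsigned char* address;   // simplified stand-in for HotSpot's address type

    // Hedged sketch of the C-side view of the generated call stub.
    typedef void (*CallStub)(address   link,            // o0: call wrapper (GC only)
                             intptr_t* result,          // o1: result slot
                             int       result_type,     // o2: BasicType tag
                             void*     method,          // o3: methodOop
                             address   entry_point,     // o4: interpreter entry point
                             intptr_t* parameters,      // o5: argument array
                             int       parameter_size,  // [sp + 0x5c], in words
                             void*     thread);         // [sp + 0x60]: JavaThread*
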
321 321
322 322
323 323 //----------------------------------------------------------------------------------------------------
324 324 // Return point for a Java call if there's an exception thrown in Java code.
325 325 // The exception is caught and transformed into a pending exception stored in
326 326 // JavaThread that can be tested from within the VM.
327 327 //
328 328 // Oexception: exception oop
329 329
330 330 address generate_catch_exception() {
331 331 StubCodeMark mark(this, "StubRoutines", "catch_exception");
332 332
333 333 address start = __ pc();
334 334 // verify that thread corresponds
335 335 __ verify_thread();
336 336
337 337 const Register& temp_reg = Gtemp;
338 338 Address pending_exception_addr (G2_thread, Thread::pending_exception_offset());
339 339 Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ());
340 340 Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ());
341 341
342 342 // set pending exception
343 343 __ verify_oop(Oexception);
344 344 __ st_ptr(Oexception, pending_exception_addr);
345 345 __ set((intptr_t)__FILE__, temp_reg);
346 346 __ st_ptr(temp_reg, exception_file_offset_addr);
347 347 __ set((intptr_t)__LINE__, temp_reg);
348 348 __ st(temp_reg, exception_line_offset_addr);
349 349
350 350 // complete return to VM
351 351 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
352 352
353 353 AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
354 354 __ jump_to(stub_ret, temp_reg);
355 355 __ delayed()->nop();
356 356
357 357 return start;
358 358 }
359 359
360 360
361 361 //----------------------------------------------------------------------------------------------------
362 362 // Continuation point for runtime calls returning with a pending exception
363 363 // The pending exception check happened in the runtime or native call stub
364 364 // The pending exception in Thread is converted into a Java-level exception
365 365 //
366 366 // Contract with Java-level exception handler: O0 = exception
367 367 // O1 = throwing pc
368 368
369 369 address generate_forward_exception() {
370 370 StubCodeMark mark(this, "StubRoutines", "forward_exception");
371 371 address start = __ pc();
372 372
373 373 // Upon entry, O7 has the return address returning into Java
374 374 // (interpreted or compiled) code; i.e. the return address
375 375 // becomes the throwing pc.
376 376
377 377 const Register& handler_reg = Gtemp;
378 378
379 379 Address exception_addr(G2_thread, Thread::pending_exception_offset());
380 380
381 381 #ifdef ASSERT
382 382 // make sure that this code is only executed if there is a pending exception
383 383 { Label L;
384 384 __ ld_ptr(exception_addr, Gtemp);
385 385 __ br_notnull(Gtemp, false, Assembler::pt, L);
386 386 __ delayed()->nop();
387 387 __ stop("StubRoutines::forward exception: no pending exception (1)");
388 388 __ bind(L);
389 389 }
390 390 #endif
391 391
392 392 // compute exception handler into handler_reg
393 393 __ get_thread();
394 394 __ ld_ptr(exception_addr, Oexception);
395 395 __ verify_oop(Oexception);
396 396 __ save_frame(0); // compensates for compiler weakness
397 397 __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
398 398 BLOCK_COMMENT("call exception_handler_for_return_address");
399 399 __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
400 400 __ mov(O0, handler_reg);
401 401 __ restore(); // compensates for compiler weakness
402 402
403 403 __ ld_ptr(exception_addr, Oexception);
404 404 __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
405 405
406 406 #ifdef ASSERT
407 407 // make sure exception is set
408 408 { Label L;
409 409 __ br_notnull(Oexception, false, Assembler::pt, L);
410 410 __ delayed()->nop();
411 411 __ stop("StubRoutines::forward exception: no pending exception (2)");
412 412 __ bind(L);
413 413 }
414 414 #endif
415 415 // jump to exception handler
416 416 __ jmp(handler_reg, 0);
417 417 // clear pending exception
418 418 __ delayed()->st_ptr(G0, exception_addr);
419 419
420 420 return start;
421 421 }
422 422
423 423
424 424 //------------------------------------------------------------------------------------------------------------------------
425 425 // Continuation point for throwing of implicit exceptions that are not handled in
426 426 // the current activation. Fabricates an exception oop and initiates normal
427 427 // exception dispatching in this frame. Only callee-saved registers are preserved
428 428 // (through the normal register window / RegisterMap handling).
429 429 // If the compiler needs all registers to be preserved between the fault
430 430 // point and the exception handler then it must assume responsibility for that in
431 431 // AbstractCompiler::continuation_for_implicit_null_exception or
432 432 // continuation_for_implicit_division_by_zero_exception. All other implicit
433 433 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
434 434 // either at call sites or otherwise assume that stack unwinding will be initiated,
435 435 // so caller saved registers were assumed volatile in the compiler.
436 436
437 437 // Note that we generate only this stub into a RuntimeStub, because it needs to be
438 438 // properly traversed and ignored during GC, so we change the meaning of the "__"
439 439 // macro within this method.
440 440 #undef __
441 441 #define __ masm->
442 442
443 443 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) {
444 444 #ifdef ASSERT
445 445 int insts_size = VerifyThread ? 1 * K : 600;
446 446 #else
447 447 int insts_size = VerifyThread ? 1 * K : 256;
448 448 #endif /* ASSERT */
449 449 int locs_size = 32;
450 450
451 451 CodeBuffer code(name, insts_size, locs_size);
452 452 MacroAssembler* masm = new MacroAssembler(&code);
453 453
454 454 __ verify_thread();
455 455
456 456 // This is an inlined and slightly modified version of call_VM
457 457 // which has the ability to fetch the return PC out of thread-local storage
458 458 __ assert_not_delayed();
459 459
460 460 // Note that we always push a frame because on the SPARC
461 461 // architecture, for all of our implicit exception kinds at call
462 462 // sites, the implicit exception is taken before the callee frame
463 463 // is pushed.
464 464 __ save_frame(0);
465 465
466 466 int frame_complete = __ offset();
467 467
468 468 if (restore_saved_exception_pc) {
469 469 __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
470 470 __ sub(I7, frame::pc_return_offset, I7);
471 471 }
472 472
473 473 // Note that we always have a runtime stub frame on the top of stack by this point
474 474 Register last_java_sp = SP;
475 475 // 64-bit last_java_sp is biased!
476 476 __ set_last_Java_frame(last_java_sp, G0);
477 477 if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
478 478 __ save_thread(noreg);
479 479 // do the call
480 480 BLOCK_COMMENT("call runtime_entry");
481 481 __ call(runtime_entry, relocInfo::runtime_call_type);
482 482 if (!VerifyThread)
483 483 __ delayed()->mov(G2_thread, O0); // pass thread as first argument
484 484 else
485 485 __ delayed()->nop(); // (thread already passed)
486 486 __ restore_thread(noreg);
487 487 __ reset_last_Java_frame();
488 488
489 489 // check for pending exceptions. use Gtemp as scratch register.
490 490 #ifdef ASSERT
491 491 Label L;
492 492
493 493 Address exception_addr(G2_thread, Thread::pending_exception_offset());
494 494 Register scratch_reg = Gtemp;
495 495 __ ld_ptr(exception_addr, scratch_reg);
496 496 __ br_notnull(scratch_reg, false, Assembler::pt, L);
497 497 __ delayed()->nop();
498 498 __ should_not_reach_here();
499 499 __ bind(L);
500 500 #endif // ASSERT
501 501 BLOCK_COMMENT("call forward_exception_entry");
502 502 __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
503 503 // we use O7 linkage so that forward_exception_entry has the issuing PC
504 504 __ delayed()->restore();
505 505
506 506 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
507 507 return stub->entry_point();
508 508 }
509 509
510 510 #undef __
511 511 #define __ _masm->
512 512
513 513
514 514 // Generate a routine that sets all the registers so we
515 515 // can tell if the stop routine prints them correctly.
516 516 address generate_test_stop() {
517 517 StubCodeMark mark(this, "StubRoutines", "test_stop");
518 518 address start = __ pc();
519 519
520 520 int i;
521 521
522 522 __ save_frame(0);
523 523
524 524 static jfloat zero = 0.0, one = 1.0;
525 525
526 526 // put addr in L0, then load through L0 to F0
527 527 __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0);
528 528 __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
529 529
530 530 // use add to put 2..18 in F2..F18
531 531 for ( i = 2; i <= 18; ++i ) {
532 532 __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
533 533 }
534 534
535 535 // Now put double 2 in F16, double 18 in F18
536 536 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
537 537 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
538 538
539 539     // use add to put 20..30 in F20..F30
540 540 for (i = 20; i < 32; i += 2) {
541 541 __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
542 542 }
543 543
544 544 // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
545 545 for ( i = 0; i < 8; ++i ) {
546 546 if (i < 6) {
547 547 __ set( i, as_iRegister(i));
548 548 __ set(16 + i, as_oRegister(i));
549 549 __ set(24 + i, as_gRegister(i));
550 550 }
551 551 __ set( 8 + i, as_lRegister(i));
552 552 }
553 553
554 554 __ stop("testing stop");
555 555
556 556
557 557 __ ret();
558 558 __ delayed()->restore();
559 559
560 560 return start;
561 561 }
562 562
563 563
564 564 address generate_stop_subroutine() {
565 565 StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
566 566 address start = __ pc();
567 567
568 568 __ stop_subroutine();
569 569
570 570 return start;
571 571 }
572 572
573 573 address generate_flush_callers_register_windows() {
574 574 StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
575 575 address start = __ pc();
576 576
577 577 __ flush_windows();
578 578 __ retl(false);
579 579 __ delayed()->add( FP, STACK_BIAS, O0 );
580 580 // The returned value must be a stack pointer whose register save area
581 581 // is flushed, and will stay flushed while the caller executes.
582 582
583 583 return start;
584 584 }
585 585
586 586 // Helper functions for v8 atomic operations.
587 587 //
588 588 void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
589 589 if (mark_oop_reg == noreg) {
590 590 address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
591 591 __ set((intptr_t)lock_ptr, lock_ptr_reg);
592 592 } else {
593 593 assert(scratch_reg != noreg, "just checking");
594 594 address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
595 595 __ set((intptr_t)lock_ptr, lock_ptr_reg);
596 596 __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
597 597 __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
598 598 }
599 599 }
600 600
601 601 void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
602 602
603 603 get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
604 604 __ set(StubRoutines::Sparc::locked, lock_reg);
605 605 // Initialize yield counter
606 606 __ mov(G0,yield_reg);
607 607
608 608 __ BIND(retry);
609 609 __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
610 610 __ br(Assembler::less, false, Assembler::pt, dontyield);
611 611 __ delayed()->nop();
612 612
613 613     // This code can only be called from inside the VM; this
614 614     // stub is only invoked from Atomic::add().  We do not
615 615     // want to use call_VM, because _last_java_sp and such
616 616     // must already be set.
617 617 //
618 618 // Save the regs and make space for a C call
619 619 __ save(SP, -96, SP);
620 620 __ save_all_globals_into_locals();
621 621 BLOCK_COMMENT("call os::naked_sleep");
622 622 __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
623 623 __ delayed()->nop();
624 624 __ restore_globals_from_locals();
625 625 __ restore();
626 626 // reset the counter
627 627 __ mov(G0,yield_reg);
628 628
629 629 __ BIND(dontyield);
630 630
631 631 // try to get lock
632 632 __ swap(lock_ptr_reg, 0, lock_reg);
633 633
634 634 // did we get the lock?
635 635 __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
636 636 __ br(Assembler::notEqual, true, Assembler::pn, retry);
637 637 __ delayed()->add(yield_reg,1,yield_reg);
638 638
639 639 // yes, got lock. do the operation here.
640 640 }
641 641
642 642 void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
643 643 __ st(lock_reg, lock_ptr_reg, 0); // unlock
644 644 }
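
A sketch of the same lock protocol in C++ (illustrative only; the stub uses the SPARC swap instruction and os::naked_sleep()). The GCC builtin __sync_lock_test_and_set stands in for swap, and 'spin_count' plays the role of V8AtomicOperationUnderLockSpinCount.

    #include <stdint.h>

    enum { v8_unlocked = 0, v8_locked = 1 };

    void v8_lock(volatile int32_t* lock_ptr, int spin_count,
                 void (*naked_sleep)()) {
      int yield_counter = 0;
      for (;;) {
        if (yield_counter >= spin_count) {   // spun too long: sleep briefly
          naked_sleep();
          yield_counter = 0;                 // reset the counter
        }
        // try to get the lock (atomic exchange, like the swap above)
        if (__sync_lock_test_and_set(lock_ptr, (int32_t)v8_locked) == v8_unlocked)
          return;                            // got the lock
        ++yield_counter;                     // didn't get it: count and retry
      }
    }

    void v8_unlock(volatile int32_t* lock_ptr) {
      *lock_ptr = v8_unlocked;               // plain store, as in the epilogue
    }
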
645 645
646 646 // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
647 647 //
648 648 // Arguments :
649 649 //
650 650 // exchange_value: O0
651 651 // dest: O1
652 652 //
653 653 // Results:
654 654 //
655 655 // O0: the value previously stored in dest
656 656 //
657 657 address generate_atomic_xchg() {
658 658 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
659 659 address start = __ pc();
660 660
661 661 if (UseCASForSwap) {
662 662 // Use CAS instead of swap, just in case the MP hardware
663 663 // prefers to work with just one kind of synch. instruction.
664 664 Label retry;
665 665 __ BIND(retry);
666 666 __ mov(O0, O3); // scratch copy of exchange value
667 667 __ ld(O1, 0, O2); // observe the previous value
668 668 // try to replace O2 with O3
669 669 __ cas_under_lock(O1, O2, O3,
670 670 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
671 671 __ cmp(O2, O3);
672 672 __ br(Assembler::notEqual, false, Assembler::pn, retry);
673 673 __ delayed()->nop();
674 674
675 675 __ retl(false);
676 676 __ delayed()->mov(O2, O0); // report previous value to caller
677 677
678 678 } else {
679 679 if (VM_Version::v9_instructions_work()) {
680 680 __ retl(false);
681 681 __ delayed()->swap(O1, 0, O0);
682 682 } else {
683 683 const Register& lock_reg = O2;
684 684 const Register& lock_ptr_reg = O3;
685 685 const Register& yield_reg = O4;
686 686
687 687 Label retry;
688 688 Label dontyield;
689 689
690 690 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
691 691 // got the lock, do the swap
692 692 __ swap(O1, 0, O0);
693 693
694 694 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
695 695 __ retl(false);
696 696 __ delayed()->nop();
697 697 }
698 698 }
699 699
700 700 return start;
701 701 }
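
A C++ sketch of the UseCASForSwap path above: emulate the swap with a compare-and-swap retry loop (illustrative; the stub emits cas via cas_under_lock). The GCC builtin __sync_val_compare_and_swap stands in for cas.

    #include <stdint.h>

    int32_t atomic_xchg_via_cas(int32_t exchange_value, volatile int32_t* dest) {
      for (;;) {
        int32_t observed = *dest;                     // observe the previous value
        // try to replace 'observed' with 'exchange_value'
        if (__sync_val_compare_and_swap(dest, observed, exchange_value) == observed)
          return observed;                            // report previous value to caller
        // cas lost a race: another thread changed *dest, retry
      }
    }
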
702 702
703 703
704 704 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
705 705 //
706 706 // Arguments :
707 707 //
708 708 // exchange_value: O0
709 709 // dest: O1
710 710 // compare_value: O2
711 711 //
712 712 // Results:
713 713 //
714 714 // O0: the value previously stored in dest
715 715 //
716 716 // Overwrites (v8): O3,O4,O5
717 717 //
718 718 address generate_atomic_cmpxchg() {
719 719 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
720 720 address start = __ pc();
721 721
722 722 // cmpxchg(dest, compare_value, exchange_value)
723 723 __ cas_under_lock(O1, O2, O0,
724 724 (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
725 725 __ retl(false);
726 726 __ delayed()->nop();
727 727
728 728 return start;
729 729 }
730 730
731 731 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
732 732 //
733 733 // Arguments :
734 734 //
735 735 // exchange_value: O1:O0
736 736 // dest: O2
737 737 // compare_value: O4:O3
738 738 //
739 739 // Results:
740 740 //
741 741 // O1:O0: the value previously stored in dest
742 742 //
743 743   // This only works on V9; on V8 we don't generate any
744 744   // code and just return NULL.
745 745 //
746 746 // Overwrites: G1,G2,G3
747 747 //
748 748 address generate_atomic_cmpxchg_long() {
749 749 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
750 750 address start = __ pc();
751 751
752 752 if (!VM_Version::supports_cx8())
753 753 return NULL;;
754 754 __ sllx(O0, 32, O0);
755 755 __ srl(O1, 0, O1);
756 756     __ or3(O0,O1,O0);   // O0 holds 64-bit value from exchange_value
757 757 __ sllx(O3, 32, O3);
758 758 __ srl(O4, 0, O4);
759 759     __ or3(O3,O4,O3);   // O3 holds 64-bit value from compare_value
760 760 __ casx(O2, O3, O0);
761 761 __ srl(O0, 0, O1); // unpacked return value in O1:O0
762 762 __ retl(false);
763 763 __ delayed()->srlx(O0, 32, O0);
764 764
765 765 return start;
766 766 }
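
A sketch of the 32-bit argument marshalling above: each jlong arrives split across two registers (hi in the lower-numbered register), and the sllx/srl/or3 sequences rebuild the 64-bit values before the single casx. Names here are illustrative.

    #include <stdint.h>

    int64_t pack_hi_lo(uint32_t hi, uint32_t lo) {
      return ((int64_t)hi << 32) | lo;       // sllx hi, 32; srl lo, 0; or3
    }

    void unpack_hi_lo(int64_t v, uint32_t* hi, uint32_t* lo) {
      *hi = (uint32_t)((uint64_t)v >> 32);   // delayed()->srlx(O0, 32, O0)
      *lo = (uint32_t)v;                     // srl(O0, 0, O1)
    }
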
767 767
768 768
769 769 // Support for jint Atomic::add(jint add_value, volatile jint* dest).
770 770 //
771 771 // Arguments :
772 772 //
773 773 // add_value: O0 (e.g., +1 or -1)
774 774 // dest: O1
775 775 //
776 776 // Results:
777 777 //
778 778 // O0: the new value stored in dest
779 779 //
780 780 // Overwrites (v9): O3
781 781 // Overwrites (v8): O3,O4,O5
782 782 //
783 783 address generate_atomic_add() {
784 784 StubCodeMark mark(this, "StubRoutines", "atomic_add");
785 785 address start = __ pc();
786 786 __ BIND(_atomic_add_stub);
787 787
788 788 if (VM_Version::v9_instructions_work()) {
789 789       Label retry;
790 790 __ BIND(retry);
791 791
792 792 __ lduw(O1, 0, O2);
793 793 __ add(O0, O2, O3);
794 794 __ cas(O1, O2, O3);
795 795 __ cmp( O2, O3);
796 796 __ br(Assembler::notEqual, false, Assembler::pn, retry);
797 797 __ delayed()->nop();
798 798 __ retl(false);
799 799 __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
800 800 } else {
801 801 const Register& lock_reg = O2;
802 802 const Register& lock_ptr_reg = O3;
803 803 const Register& value_reg = O4;
804 804 const Register& yield_reg = O5;
805 805
806 806       Label retry;
807 807       Label dontyield;
808 808
809 809 generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
810 810 // got lock, do the increment
811 811 __ ld(O1, 0, value_reg);
812 812 __ add(O0, value_reg, value_reg);
813 813 __ st(value_reg, O1, 0);
814 814
815 815 // %%% only for RMO and PSO
816 816 __ membar(Assembler::StoreStore);
817 817
818 818 generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
819 819
820 820 __ retl(false);
821 821 __ delayed()->mov(value_reg, O0);
822 822 }
823 823
824 824 return start;
825 825 }
826 826 Label _atomic_add_stub; // called from other stubs
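
A C++ sketch of the v9 path above: load, add, cas, and retry until the cas observes the value we loaded (illustrative; the stub emits cas and returns the *new* value in O0). __sync_val_compare_and_swap again stands in for cas.

    #include <stdint.h>

    int32_t atomic_add_via_cas(int32_t add_value, volatile int32_t* dest) {
      for (;;) {
        int32_t old_value = *dest;                    // lduw(O1, 0, O2)
        int32_t new_value = old_value + add_value;    // add(O0, O2, O3)
        if (__sync_val_compare_and_swap(dest, old_value, new_value) == old_value)
          return new_value;                           // note that cas made O2 == O3
      }
    }
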
827 827
828 828
829 829 //------------------------------------------------------------------------------------------------------------------------
830 830 // The following routine generates a subroutine to throw an asynchronous
831 831 // UnknownError when an unsafe access gets a fault that could not be
832 832 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
833 833 //
834 834 // Arguments :
835 835 //
836 836 // trapping PC: O7
837 837 //
838 838 // Results:
839 839 // posts an asynchronous exception, skips the trapping instruction
840 840 //
841 841
842 842 address generate_handler_for_unsafe_access() {
843 843 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
844 844 address start = __ pc();
845 845
846 846 const int preserve_register_words = (64 * 2);
847 847 Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
848 848
849 849 Register Lthread = L7_thread_cache;
850 850 int i;
851 851
852 852 __ save_frame(0);
853 853 __ mov(G1, L1);
854 854 __ mov(G2, L2);
855 855 __ mov(G3, L3);
856 856 __ mov(G4, L4);
857 857 __ mov(G5, L5);
858 858 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
859 859 __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
860 860 }
861 861
862 862 address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
863 863 BLOCK_COMMENT("call handle_unsafe_access");
864 864 __ call(entry_point, relocInfo::runtime_call_type);
865 865 __ delayed()->nop();
866 866
867 867 __ mov(L1, G1);
868 868 __ mov(L2, G2);
869 869 __ mov(L3, G3);
870 870 __ mov(L4, G4);
871 871 __ mov(L5, G5);
872 872 for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
873 873 __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
874 874 }
875 875
876 876 __ verify_thread();
877 877
878 878 __ jmp(O0, 0);
879 879 __ delayed()->restore();
880 880
881 881 return start;
882 882 }
883 883
884 884
885 885   // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
886 886 // Arguments :
887 887 //
888 888 // ret : O0, returned
889 889 // icc/xcc: set as O0 (depending on wordSize)
890 890 // sub : O1, argument, not changed
891 891 // super: O2, argument, not changed
892 892 // raddr: O7, blown by call
893 893 address generate_partial_subtype_check() {
894 894 __ align(CodeEntryAlignment);
895 895 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
896 896 address start = __ pc();
897 897 Label miss;
898 898
899 899 #if defined(COMPILER2) && !defined(_LP64)
900 900 // Do not use a 'save' because it blows the 64-bit O registers.
901 901 __ add(SP,-4*wordSize,SP); // Make space for 4 temps (stack must be 2 words aligned)
902 902 __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
903 903 __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
904 904 __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
905 905 __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
906 906 Register Rret = O0;
907 907 Register Rsub = O1;
908 908 Register Rsuper = O2;
909 909 #else
910 910 __ save_frame(0);
911 911 Register Rret = I0;
912 912 Register Rsub = I1;
913 913 Register Rsuper = I2;
914 914 #endif
915 915
916 916 Register L0_ary_len = L0;
917 917 Register L1_ary_ptr = L1;
918 918 Register L2_super = L2;
919 919 Register L3_index = L3;
920 920
921 921 __ check_klass_subtype_slow_path(Rsub, Rsuper,
922 922 L0, L1, L2, L3,
923 923 NULL, &miss);
924 924
925 925 // Match falls through here.
926 926 __ addcc(G0,0,Rret); // set Z flags, Z result
927 927
928 928 #if defined(COMPILER2) && !defined(_LP64)
929 929 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
930 930 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
931 931 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
932 932 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
933 933 __ retl(); // Result in Rret is zero; flags set to Z
934 934 __ delayed()->add(SP,4*wordSize,SP);
935 935 #else
936 936 __ ret(); // Result in Rret is zero; flags set to Z
937 937 __ delayed()->restore();
938 938 #endif
939 939
940 940 __ BIND(miss);
941 941 __ addcc(G0,1,Rret); // set NZ flags, NZ result
942 942
943 943 #if defined(COMPILER2) && !defined(_LP64)
944 944 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
945 945 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
946 946 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
947 947 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
948 948 __ retl(); // Result in Rret is != 0; flags set to NZ
949 949 __ delayed()->add(SP,4*wordSize,SP);
950 950 #else
951 951 __ ret(); // Result in Rret is != 0; flags set to NZ
952 952 __ delayed()->restore();
953 953 #endif
954 954
955 955 return start;
956 956 }
957 957
958 958
959 959 // Called from MacroAssembler::verify_oop
960 960 //
961 961 address generate_verify_oop_subroutine() {
962 962 StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
963 963
964 964 address start = __ pc();
965 965
966 966 __ verify_oop_subroutine();
967 967
968 968 return start;
969 969 }
970 970
971 - static address disjoint_byte_copy_entry;
972 - static address disjoint_short_copy_entry;
973 - static address disjoint_int_copy_entry;
974 - static address disjoint_long_copy_entry;
975 - static address disjoint_oop_copy_entry;
976 -
977 - static address byte_copy_entry;
978 - static address short_copy_entry;
979 - static address int_copy_entry;
980 - static address long_copy_entry;
981 - static address oop_copy_entry;
982 -
983 - static address checkcast_copy_entry;
984 971
985 972 //
986 973 // Verify that a register contains clean 32-bits positive value
987 974 // (high 32-bits are 0) so it could be used in 64-bits shifts (sllx, srax).
988 975 //
989 976 // Input:
990 977 // Rint - 32-bits value
991 978 // Rtmp - scratch
992 979 //
993 980 void assert_clean_int(Register Rint, Register Rtmp) {
994 981 #if defined(ASSERT) && defined(_LP64)
995 982 __ signx(Rint, Rtmp);
996 983 __ cmp(Rint, Rtmp);
997 984 __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
998 985 #endif
999 986 }
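
The signx/cmp pair above in C++ terms (a sketch): a register image is "clean" when it equals the sign extension of its own low 32 bits, which for the positive values used here means the high 32 bits are 0.

    #include <assert.h>
    #include <stdint.h>

    void assert_clean_int_c(int64_t reg_image) {
      // signx computes the sign extension; cmp/breakpoint_trap fire on mismatch
      assert((int64_t)(int32_t)reg_image == reg_image);
    }
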
1000 987
1001 988 //
1002 989 // Generate overlap test for array copy stubs
1003 990 //
1004 991 // Input:
1005 992 // O0 - array1
1006 993 // O1 - array2
1007 994 // O2 - element count
1008 995 //
1009 996 // Kills temps: O3, O4
1010 997 //
1011 998 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1012 999 assert(no_overlap_target != NULL, "must be generated");
1013 1000 array_overlap_test(no_overlap_target, NULL, log2_elem_size);
1014 1001 }
1015 1002 void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
1016 1003 array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
1017 1004 }
1018 1005 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
1019 1006 const Register from = O0;
1020 1007 const Register to = O1;
1021 1008 const Register count = O2;
1022 1009 const Register to_from = O3; // to - from
1023 1010 const Register byte_count = O4; // count << log2_elem_size
1024 1011
1025 1012 __ subcc(to, from, to_from);
1026 1013 __ sll_ptr(count, log2_elem_size, byte_count);
1027 1014 if (NOLp == NULL)
1028 1015 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
1029 1016 else
1030 1017 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
1031 1018 __ delayed()->cmp(to_from, byte_count);
1032 1019 if (NOLp == NULL)
1033 1020 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
1034 1021 else
1035 1022 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
1036 1023 __ delayed()->nop();
1037 1024 }
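
The overlap test above in C++ terms (a sketch): a forward, disjoint-style copy is safe unless the destination starts strictly inside the source range [from, from + byte_count). Both conditions reduce to unsigned comparisons on 'to - from', exactly as the two brx branches do.

    #include <stddef.h>
    #include <stdint.h>

    bool no_overlap(uintptr_t from, uintptr_t to, size_t count, int log2_elem_size) {
      uintptr_t to_from    = to - from;                        // subcc(to, from, to_from)
      uintptr_t byte_count = (uintptr_t)count << log2_elem_size; // sll_ptr
      return to <= from                 // lessEqualUnsigned branch
          || to_from >= byte_count;     // greaterEqualUnsigned branch
    }
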
1038 1025
1039 1026 //
1040 1027 // Generate pre-write barrier for array.
1041 1028 //
1042 1029 // Input:
1043 1030 // addr - register containing starting address
1044 1031 // count - register containing element count
1045 1032 // tmp - scratch register
1046 1033 //
1047 1034 // The input registers are overwritten.
1048 1035 //
1049 1036 void gen_write_ref_array_pre_barrier(Register addr, Register count) {
1050 1037 BarrierSet* bs = Universe::heap()->barrier_set();
1051 1038 if (bs->has_write_ref_pre_barrier()) {
1052 1039 assert(bs->has_write_ref_array_pre_opt(),
1053 1040 "Else unsupported barrier set.");
1054 1041
1055 1042 __ save_frame(0);
1056 1043 // Save the necessary global regs... will be used after.
1057 1044 if (addr->is_global()) {
1058 1045 __ mov(addr, L0);
1059 1046 }
1060 1047 if (count->is_global()) {
1061 1048 __ mov(count, L1);
1062 1049 }
1063 1050 __ mov(addr->after_save(), O0);
1064 1051 // Get the count into O1
1065 1052 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
1066 1053 __ delayed()->mov(count->after_save(), O1);
1067 1054 if (addr->is_global()) {
1068 1055 __ mov(L0, addr);
1069 1056 }
1070 1057 if (count->is_global()) {
1071 1058 __ mov(L1, count);
1072 1059 }
1073 1060 __ restore();
1074 1061 }
1075 1062 }
1076 1063 //
1077 1064 // Generate post-write barrier for array.
1078 1065 //
1079 1066 // Input:
1080 1067 // addr - register containing starting address
1081 1068 // count - register containing element count
1082 1069 // tmp - scratch register
1083 1070 //
1084 1071 // The input registers are overwritten.
1085 1072 //
1086 1073 void gen_write_ref_array_post_barrier(Register addr, Register count,
1087 1074 Register tmp) {
1088 1075 BarrierSet* bs = Universe::heap()->barrier_set();
1089 1076
1090 1077 switch (bs->kind()) {
1091 1078 case BarrierSet::G1SATBCT:
1092 1079 case BarrierSet::G1SATBCTLogging:
1093 1080 {
1094 1081 // Get some new fresh output registers.
1095 1082 __ save_frame(0);
1096 1083 __ mov(addr->after_save(), O0);
1097 1084 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
1098 1085 __ delayed()->mov(count->after_save(), O1);
1099 1086 __ restore();
1100 1087 }
1101 1088 break;
1102 1089 case BarrierSet::CardTableModRef:
1103 1090 case BarrierSet::CardTableExtension:
1104 1091 {
1105 1092 CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1106 1093 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1107 1094 assert_different_registers(addr, count, tmp);
1108 1095
1109 1096 Label L_loop;
1110 1097
1111 1098 __ sll_ptr(count, LogBytesPerHeapOop, count);
1112 1099 __ sub(count, BytesPerHeapOop, count);
1113 1100 __ add(count, addr, count);
1114 1101 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1115 1102 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1116 1103 __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1117 1104 __ sub(count, addr, count);
1118 1105 AddressLiteral rs(ct->byte_map_base);
1119 1106 __ set(rs, tmp);
1120 1107 __ BIND(L_loop);
1121 1108 __ stb(G0, tmp, addr);
1122 1109 __ subcc(count, 1, count);
1123 1110 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1124 1111 __ delayed()->add(addr, 1, addr);
1125 1112 }
1126 1113 break;
1127 1114 case BarrierSet::ModRef:
1128 1115 break;
1129 1116 default:
1130 1117 ShouldNotReachHere();
1131 1118 }
1132 1119 }
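
A sketch of the card-table branch above: dirty every card spanned by the stored oops in [addr, addr + count * oop_size). 'byte_map_base' and 'card_shift' mirror the CardTableModRefBS fields, and 0 marks a dirty card (the stb(G0, tmp, addr) in the loop).

    #include <stddef.h>
    #include <stdint.h>

    void dirty_cards(int8_t* byte_map_base, int card_shift,
                     uintptr_t addr, size_t count, size_t oop_size) {
      uintptr_t last       = addr + count * oop_size - oop_size; // address of last element
      uintptr_t first_card = addr >> card_shift;
      uintptr_t last_card  = last >> card_shift;
      for (uintptr_t card = first_card; card <= last_card; card++)
        byte_map_base[card] = 0;        // dirty the card
    }
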
1133 1120
1134 1121
1135 1122 // Copy big chunks forward with shift
1136 1123 //
1137 1124 // Inputs:
1138 1125 // from - source arrays
1139 1126 // to - destination array aligned to 8-bytes
1140 1127 // count - elements count to copy >= the count equivalent to 16 bytes
1141 1128 // count_dec - elements count's decrement equivalent to 16 bytes
1142 1129 // L_copy_bytes - copy exit label
1143 1130 //
1144 1131 void copy_16_bytes_forward_with_shift(Register from, Register to,
1145 1132 Register count, int count_dec, Label& L_copy_bytes) {
1146 1133 Label L_loop, L_aligned_copy, L_copy_last_bytes;
1147 1134
1148 1135 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1149 1136 __ andcc(from, 7, G1); // misaligned bytes
1150 1137 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1151 1138 __ delayed()->nop();
1152 1139
1153 1140 const Register left_shift = G1; // left shift bit counter
1154 1141 const Register right_shift = G5; // right shift bit counter
1155 1142
1156 1143 __ sll(G1, LogBitsPerByte, left_shift);
1157 1144 __ mov(64, right_shift);
1158 1145 __ sub(right_shift, left_shift, right_shift);
1159 1146
1160 1147 //
1161 1148 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1162 1149 // to form 2 aligned 8-bytes chunks to store.
1163 1150 //
1164 1151 __ deccc(count, count_dec); // Pre-decrement 'count'
1165 1152 __ andn(from, 7, from); // Align address
1166 1153 __ ldx(from, 0, O3);
1167 1154 __ inc(from, 8);
1168 1155 __ align(OptoLoopAlignment);
1169 1156 __ BIND(L_loop);
1170 1157 __ ldx(from, 0, O4);
1171 1158 __ deccc(count, count_dec); // Can we do next iteration after this one?
1172 1159 __ ldx(from, 8, G4);
1173 1160 __ inc(to, 16);
1174 1161 __ inc(from, 16);
1175 1162 __ sllx(O3, left_shift, O3);
1176 1163 __ srlx(O4, right_shift, G3);
1177 1164 __ bset(G3, O3);
1178 1165 __ stx(O3, to, -16);
1179 1166 __ sllx(O4, left_shift, O4);
1180 1167 __ srlx(G4, right_shift, G3);
1181 1168 __ bset(G3, O4);
1182 1169 __ stx(O4, to, -8);
1183 1170 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1184 1171 __ delayed()->mov(G4, O3);
1185 1172
1186 1173 __ inccc(count, count_dec>>1 ); // + 8 bytes
1187 1174 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1188 1175 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1189 1176
1190 1177 // copy 8 bytes, part of them already loaded in O3
1191 1178 __ ldx(from, 0, O4);
1192 1179 __ inc(to, 8);
1193 1180 __ inc(from, 8);
1194 1181 __ sllx(O3, left_shift, O3);
1195 1182 __ srlx(O4, right_shift, G3);
1196 1183 __ bset(O3, G3);
1197 1184 __ stx(G3, to, -8);
1198 1185
1199 1186 __ BIND(L_copy_last_bytes);
1200 1187 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1201 1188 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1202 1189 __ delayed()->sub(from, right_shift, from); // restore address
1203 1190
1204 1191 __ BIND(L_aligned_copy);
1205 1192 }
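
A sketch of the shifted-copy idea above: when 'from' and 'to' disagree mod 8, read only aligned 64-bit words and stitch each output word from two neighbouring input words. Shifts are big-endian as on SPARC; the stub never enters this path with zero misalignment, so both shift amounts stay in 1..63. Note the loop reads words + 1 input words for words output words.

    #include <stddef.h>
    #include <stdint.h>

    void copy_words_with_shift(const uint64_t* from_aligned, uint64_t* to,
                               size_t words, int misaligned_bytes /* 1..7 */) {
      int left_shift  = misaligned_bytes * 8;      // sll(G1, LogBitsPerByte, left_shift)
      int right_shift = 64 - left_shift;           // mov(64); sub(...)
      uint64_t prev = from_aligned[0];             // ldx(from, 0, O3)
      for (size_t i = 0; i < words; i++) {
        uint64_t next = from_aligned[i + 1];       // ldx of the next aligned chunk
        to[i] = (prev << left_shift) | (next >> right_shift); // sllx/srlx/bset
        prev = next;                               // delayed()->mov(G4, O3)
      }
    }
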
1206 1193
1207 1194 // Copy big chunks backward with shift
1208 1195 //
1209 1196 // Inputs:
1210 1197 // end_from - source arrays end address
1211 1198 // end_to - destination array end address aligned to 8-bytes
1212 1199 // count - elements count to copy >= the count equivalent to 16 bytes
1213 1200 // count_dec - elements count's decrement equivalent to 16 bytes
1214 1201 // L_aligned_copy - aligned copy exit label
1215 1202 // L_copy_bytes - copy exit label
1216 1203 //
1217 1204 void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1218 1205 Register count, int count_dec,
1219 1206 Label& L_aligned_copy, Label& L_copy_bytes) {
1220 1207 Label L_loop, L_copy_last_bytes;
1221 1208
1222 1209 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1223 1210 __ andcc(end_from, 7, G1); // misaligned bytes
1224 1211 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1225 1212 __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1226 1213
1227 1214 const Register left_shift = G1; // left shift bit counter
1228 1215 const Register right_shift = G5; // right shift bit counter
1229 1216
1230 1217 __ sll(G1, LogBitsPerByte, left_shift);
1231 1218 __ mov(64, right_shift);
1232 1219 __ sub(right_shift, left_shift, right_shift);
1233 1220
1234 1221 //
1235 1222 // Load 2 aligned 8-bytes chunks and use one from previous iteration
1236 1223 // to form 2 aligned 8-bytes chunks to store.
1237 1224 //
1238 1225 __ andn(end_from, 7, end_from); // Align address
1239 1226 __ ldx(end_from, 0, O3);
1240 1227 __ align(OptoLoopAlignment);
1241 1228 __ BIND(L_loop);
1242 1229 __ ldx(end_from, -8, O4);
1243 1230 __ deccc(count, count_dec); // Can we do next iteration after this one?
1244 1231 __ ldx(end_from, -16, G4);
1245 1232 __ dec(end_to, 16);
1246 1233 __ dec(end_from, 16);
1247 1234 __ srlx(O3, right_shift, O3);
1248 1235 __ sllx(O4, left_shift, G3);
1249 1236 __ bset(G3, O3);
1250 1237 __ stx(O3, end_to, 8);
1251 1238 __ srlx(O4, right_shift, O4);
1252 1239 __ sllx(G4, left_shift, G3);
1253 1240 __ bset(G3, O4);
1254 1241 __ stx(O4, end_to, 0);
1255 1242 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1256 1243 __ delayed()->mov(G4, O3);
1257 1244
1258 1245 __ inccc(count, count_dec>>1 ); // + 8 bytes
1259 1246 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1260 1247 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1261 1248
1262 1249 // copy 8 bytes, part of them already loaded in O3
1263 1250 __ ldx(end_from, -8, O4);
1264 1251 __ dec(end_to, 8);
1265 1252 __ dec(end_from, 8);
1266 1253 __ srlx(O3, right_shift, O3);
1267 1254 __ sllx(O4, left_shift, G3);
1268 1255 __ bset(O3, G3);
1269 1256 __ stx(G3, end_to, 0);
1270 1257
1271 1258 __ BIND(L_copy_last_bytes);
1272 1259 __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
1273 1260 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1274 1261 __ delayed()->add(end_from, left_shift, end_from); // restore address
1275 1262 }
1276 1263
1277 1264 //
1278 1265 // Generate stub for disjoint byte copy. If "aligned" is true, the
1279 1266 // "from" and "to" addresses are assumed to be heapword aligned.
1280 1267 //
1281 1268 // Arguments for generated stub:
1282 1269 // from: O0
1283 1270 // to: O1
1284 1271 // count: O2 treated as signed
1285 1272 //
1286 - address generate_disjoint_byte_copy(bool aligned, const char * name) {
1273 + address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1287 1274 __ align(CodeEntryAlignment);
1288 1275 StubCodeMark mark(this, "StubRoutines", name);
1289 1276 address start = __ pc();
1290 1277
1291 1278 Label L_skip_alignment, L_align;
1292 1279 Label L_copy_byte, L_copy_byte_loop, L_exit;
1293 1280
1294 1281 const Register from = O0; // source array address
1295 1282 const Register to = O1; // destination array address
1296 1283 const Register count = O2; // elements count
1297 1284 const Register offset = O5; // offset from start of arrays
1298 1285 // O3, O4, G3, G4 are used as temp registers
1299 1286
1300 1287 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1301 1288
1302 - if (!aligned) disjoint_byte_copy_entry = __ pc();
1303 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1304 - if (!aligned) BLOCK_COMMENT("Entry:");
1289 + if (entry != NULL) {
1290 + *entry = __ pc();
1291 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1292 + BLOCK_COMMENT("Entry:");
1293 + }
1305 1294
1306 1295 // for short arrays, just do single element copy
1307 1296 __ cmp(count, 23); // 16 + 7
1308 1297 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1309 1298 __ delayed()->mov(G0, offset);
1310 1299
1311 1300 if (aligned) {
1312 1301 // 'aligned' == true when it is known statically during compilation
1313 1302 // of this arraycopy call site that both 'from' and 'to' addresses
1314 1303 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1315 1304 //
1316 1305       //   Aligned arrays have 4-byte alignment in the 32-bit VM
1317 1306       //   and 8-byte alignment in the 64-bit VM, so we do it only for the 32-bit VM
1318 1307 //
1319 1308 #ifndef _LP64
1320 1309 // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1321 1310 __ andcc(to, 7, G0);
1322 1311 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1323 1312 __ delayed()->ld(from, 0, O3);
1324 1313 __ inc(from, 4);
1325 1314 __ inc(to, 4);
1326 1315 __ dec(count, 4);
1327 1316 __ st(O3, to, -4);
1328 1317 __ BIND(L_skip_alignment);
1329 1318 #endif
1330 1319 } else {
1331 1320 // copy bytes to align 'to' on 8 byte boundary
1332 1321 __ andcc(to, 7, G1); // misaligned bytes
1333 1322 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1334 1323 __ delayed()->neg(G1);
1335 1324 __ inc(G1, 8); // bytes need to copy to next 8-bytes alignment
1336 1325 __ sub(count, G1, count);
1337 1326 __ BIND(L_align);
1338 1327 __ ldub(from, 0, O3);
1339 1328 __ deccc(G1);
1340 1329 __ inc(from);
1341 1330 __ stb(O3, to, 0);
1342 1331 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1343 1332 __ delayed()->inc(to);
1344 1333 __ BIND(L_skip_alignment);
1345 1334 }
1346 1335 #ifdef _LP64
1347 1336 if (!aligned)
1348 1337 #endif
1349 1338 {
1350 1339 // Copy with shift 16 bytes per iteration if arrays do not have
1351 1340 // the same alignment mod 8, otherwise fall through to the next
1352 1341 // code for aligned copy.
1353 1342       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1354 1343 // Also jump over aligned copy after the copy with shift completed.
1355 1344
1356 1345 copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1357 1346 }
1358 1347
1359 1348 // Both array are 8 bytes aligned, copy 16 bytes at a time
1360 1349 __ and3(count, 7, G4); // Save count
1361 1350 __ srl(count, 3, count);
1362 1351 generate_disjoint_long_copy_core(aligned);
1363 1352 __ mov(G4, count); // Restore count
1364 1353
1365 1354     // copy trailing bytes
1366 1355 __ BIND(L_copy_byte);
1367 1356 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1368 1357 __ delayed()->nop();
1369 1358 __ align(OptoLoopAlignment);
1370 1359 __ BIND(L_copy_byte_loop);
1371 1360 __ ldub(from, offset, O3);
1372 1361 __ deccc(count);
1373 1362 __ stb(O3, to, offset);
1374 1363 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1375 1364 __ delayed()->inc(offset);
1376 1365
1377 1366 __ BIND(L_exit);
1378 1367 // O3, O4 are used as temp registers
1379 1368 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1380 1369 __ retl();
1381 1370 __ delayed()->mov(G0, O0); // return 0
1382 1371 return start;
1383 1372 }
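
The new 'entry' out-parameter replaces the static disjoint_byte_copy_entry deleted above. A hypothetical registration sequence, as a sketch (this fragment assumes the enclosing StubGenerator context; the actual wiring lives in the stub-generation code elsewhere in this change):

    // Hypothetical caller-side wiring (names follow the surrounding HotSpot code).
    address entry = NULL;
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
    // 'entry' is the no-overlap entry point, which the conjoint stub takes
    // as its 'nooverlap_target':
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_byte_copy(false, entry, NULL, "jbyte_arraycopy");
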
1384 1373
1385 1374 //
1386 1375 // Generate stub for conjoint byte copy. If "aligned" is true, the
1387 1376 // "from" and "to" addresses are assumed to be heapword aligned.
1388 1377 //
1389 1378 // Arguments for generated stub:
1390 1379 // from: O0
1391 1380 // to: O1
1392 1381 // count: O2 treated as signed
1393 1382 //
1394 - address generate_conjoint_byte_copy(bool aligned, const char * name) {
1383 + address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1384 + address *entry, const char *name) {
1395 1385 // Do reverse copy.
1396 1386
1397 1387 __ align(CodeEntryAlignment);
1398 1388 StubCodeMark mark(this, "StubRoutines", name);
1399 1389 address start = __ pc();
1400 - address nooverlap_target = aligned ?
1401 - StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1402 - disjoint_byte_copy_entry;
1403 1390
1404 1391 Label L_skip_alignment, L_align, L_aligned_copy;
1405 1392 Label L_copy_byte, L_copy_byte_loop, L_exit;
1406 1393
1407 1394 const Register from = O0; // source array address
1408 1395 const Register to = O1; // destination array address
1409 1396 const Register count = O2; // elements count
1410 1397 const Register end_from = from; // source array end address
1411 1398 const Register end_to = to; // destination array end address
1412 1399
1413 1400 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1414 1401
1415 - if (!aligned) byte_copy_entry = __ pc();
1416 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1417 - if (!aligned) BLOCK_COMMENT("Entry:");
1402 + if (entry != NULL) {
1403 + *entry = __ pc();
1404 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1405 + BLOCK_COMMENT("Entry:");
1406 + }
1418 1407
1419 1408 array_overlap_test(nooverlap_target, 0);
1420 1409
1421 1410 __ add(to, count, end_to); // offset after last copied element
1422 1411
1423 1412 // for short arrays, just do single element copy
1424 1413 __ cmp(count, 23); // 16 + 7
1425 1414 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1426 1415 __ delayed()->add(from, count, end_from);
1427 1416
1428 1417 {
1429 1418       // Align the ends of the arrays since they may not be aligned even
1430 1419       // when the arrays themselves are aligned.
1431 1420
1432 1421 // copy bytes to align 'end_to' on 8 byte boundary
1433 1422 __ andcc(end_to, 7, G1); // misaligned bytes
1434 1423 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1435 1424 __ delayed()->nop();
1436 1425 __ sub(count, G1, count);
1437 1426 __ BIND(L_align);
1438 1427 __ dec(end_from);
1439 1428 __ dec(end_to);
1440 1429 __ ldub(end_from, 0, O3);
1441 1430 __ deccc(G1);
1442 1431 __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1443 1432 __ delayed()->stb(O3, end_to, 0);
1444 1433 __ BIND(L_skip_alignment);
1445 1434 }
1446 1435 #ifdef _LP64
1447 1436 if (aligned) {
1448 1437 // Both arrays are aligned to 8-bytes in 64-bits VM.
1449 1438 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1450 1439 // in unaligned case.
1451 1440 __ dec(count, 16);
1452 1441 } else
1453 1442 #endif
1454 1443 {
1455 1444 // Copy with shift 16 bytes per iteration if arrays do not have
1456 1445 // the same alignment mod 8, otherwise jump to the next
1457 1446       // code for aligned copy (and subtracting 16 from 'count' before jump).
1458 1447       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1459 1448 // Also jump over aligned copy after the copy with shift completed.
1460 1449
1461 1450 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1462 1451 L_aligned_copy, L_copy_byte);
1463 1452 }
1464 1453 // copy 4 elements (16 bytes) at a time
1465 1454 __ align(OptoLoopAlignment);
1466 1455 __ BIND(L_aligned_copy);
1467 1456 __ dec(end_from, 16);
1468 1457 __ ldx(end_from, 8, O3);
1469 1458 __ ldx(end_from, 0, O4);
1470 1459 __ dec(end_to, 16);
1471 1460 __ deccc(count, 16);
1472 1461 __ stx(O3, end_to, 8);
1473 1462 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1474 1463 __ delayed()->stx(O4, end_to, 0);
1475 1464 __ inc(count, 16);
1476 1465
1477 1466 // copy 1 element (1 byte) at a time
1478 1467 __ BIND(L_copy_byte);
1479 1468 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1480 1469 __ delayed()->nop();
1481 1470 __ align(OptoLoopAlignment);
1482 1471 __ BIND(L_copy_byte_loop);
1483 1472 __ dec(end_from);
1484 1473 __ dec(end_to);
1485 1474 __ ldub(end_from, 0, O4);
1486 1475 __ deccc(count);
1487 1476 __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1488 1477 __ delayed()->stb(O4, end_to, 0);
1489 1478
1490 1479 __ BIND(L_exit);
1491 1480 // O3, O4 are used as temp registers
1492 1481 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1493 1482 __ retl();
1494 1483 __ delayed()->mov(G0, O0); // return 0
1495 1484 return start;
1496 1485 }
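Note on the refactoring visible above: the old file-scope entry variables (byte_copy_entry, disjoint_byte_copy_entry, and so on) are replaced by explicit arguments. A minimal sketch, assuming the caller shape used by generate_arraycopy_stubs() in an elided part of this file, of how the disjoint and conjoint stubs are now wired together:

    address entry;                  // filled in by the disjoint stub
    address entry_jbyte_arraycopy;  // filled in by the conjoint stub
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                    "jbyte_arraycopy");

The conjoint stub receives the disjoint stub's entry as its nooverlap_target, and passing NULL for 'entry' simply suppresses publication of the no-overlap-check entry point.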
... 69 lines elided ...
1497 1486
1498 1487 //
1499 1488 // Generate stub for disjoint short copy. If "aligned" is true, the
1500 1489 // "from" and "to" addresses are assumed to be heapword aligned.
1501 1490 //
1502 1491 // Arguments for generated stub:
1503 1492 // from: O0
1504 1493 // to: O1
1505 1494 // count: O2 treated as signed
1506 1495 //
1507 - address generate_disjoint_short_copy(bool aligned, const char * name) {
1496 + address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1508 1497 __ align(CodeEntryAlignment);
1509 1498 StubCodeMark mark(this, "StubRoutines", name);
1510 1499 address start = __ pc();
1511 1500
1512 1501 Label L_skip_alignment, L_skip_alignment2;
1513 1502 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1514 1503
1515 1504 const Register from = O0; // source array address
1516 1505 const Register to = O1; // destination array address
1517 1506 const Register count = O2; // elements count
1518 1507 const Register offset = O5; // offset from start of arrays
1519 1508 // O3, O4, G3, G4 are used as temp registers
1520 1509
1521 1510 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1522 1511
1523 - if (!aligned) disjoint_short_copy_entry = __ pc();
1524 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1525 - if (!aligned) BLOCK_COMMENT("Entry:");
1512 + if (entry != NULL) {
1513 + *entry = __ pc();
1514 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1515 + BLOCK_COMMENT("Entry:");
1516 + }
1526 1517
1527 1518 // for short arrays, just do single element copy
1528 1519 __ cmp(count, 11); // 8 + 3 (22 bytes)
1529 1520 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1530 1521 __ delayed()->mov(G0, offset);
1531 1522
1532 1523 if (aligned) {
1533 1524 // 'aligned' == true when it is known statically during compilation
1534 1525 // of this arraycopy call site that both 'from' and 'to' addresses
1535 1526 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1536 1527 //
1537 1528 // Aligned arrays have 4-byte alignment in the 32-bit VM
1538 1529 // and 8-byte alignment in the 64-bit VM.
1539 1530 //
1540 1531 #ifndef _LP64
1541 1532 // copy a 2-elements word if necessary to align 'to' to 8 bytes
1542 1533 __ andcc(to, 7, G0);
1543 1534 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1544 1535 __ delayed()->ld(from, 0, O3);
1545 1536 __ inc(from, 4);
1546 1537 __ inc(to, 4);
1547 1538 __ dec(count, 2);
1548 1539 __ st(O3, to, -4);
1549 1540 __ BIND(L_skip_alignment);
1550 1541 #endif
1551 1542 } else {
1552 1543 // copy 1 element if necessary to align 'to' on a 4-byte boundary
1553 1544 __ andcc(to, 3, G0);
1554 1545 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1555 1546 __ delayed()->lduh(from, 0, O3);
1556 1547 __ inc(from, 2);
1557 1548 __ inc(to, 2);
1558 1549 __ dec(count);
1559 1550 __ sth(O3, to, -2);
1560 1551 __ BIND(L_skip_alignment);
1561 1552
1562 1553 // copy 2 elements to align 'to' on an 8 byte boundary
1563 1554 __ andcc(to, 7, G0);
1564 1555 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1565 1556 __ delayed()->lduh(from, 0, O3);
1566 1557 __ dec(count, 2);
1567 1558 __ lduh(from, 2, O4);
1568 1559 __ inc(from, 4);
1569 1560 __ inc(to, 4);
1570 1561 __ sth(O3, to, -4);
1571 1562 __ sth(O4, to, -2);
1572 1563 __ BIND(L_skip_alignment2);
1573 1564 }
1574 1565 #ifdef _LP64
1575 1566 if (!aligned)
1576 1567 #endif
1577 1568 {
1578 1569 // Copy with shift 16 bytes per iteration if arrays do not have
1579 1570 // the same alignment mod 8, otherwise fall through to the next
1580 1571 // code for aligned copy.
1581 1572 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1582 1573 // Also jump over aligned copy after the copy with shift completed.
1583 1574
1584 1575 copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1585 1576 }
1586 1577
1587 1578 // Both arrays are 8-byte aligned, copy 16 bytes at a time
1588 1579 __ and3(count, 3, G4); // Save
1589 1580 __ srl(count, 2, count);
1590 1581 generate_disjoint_long_copy_core(aligned);
1591 1582 __ mov(G4, count); // restore
1592 1583
1593 1584 // copy 1 element at a time
1594 1585 __ BIND(L_copy_2_bytes);
1595 1586 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1596 1587 __ delayed()->nop();
1597 1588 __ align(OptoLoopAlignment);
1598 1589 __ BIND(L_copy_2_bytes_loop);
1599 1590 __ lduh(from, offset, O3);
1600 1591 __ deccc(count);
1601 1592 __ sth(O3, to, offset);
1602 1593 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1603 1594 __ delayed()->inc(offset, 2);
1604 1595
1605 1596 __ BIND(L_exit);
1606 1597 // O3, O4 are used as temp registers
1607 1598 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1608 1599 __ retl();
1609 1600 __ delayed()->mov(G0, O0); // return 0
1610 1601 return start;
1611 1602 }
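The and3/srl/mov sequence above hands the bulk of the work to the 8-byte copy core; as a C-style sketch of the emitted logic (names illustrative):

    int tail = count & 3;          // shorts left over after the 8-byte chunks
    count >>= 2;                   // 4 shorts per 8-byte long
    copy_longs(from, to, count);   // long-copy core; also leaves 'offset' (O5) set
    count = tail;                  // fall back into the 1-element loop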
1612 1603
1613 1604 //
1614 1605 // Generate stub for array fill (byte, short, or int). If "aligned" is true, the
1615 1606 // "to" address is assumed to be heapword aligned.
1616 1607 //
1617 1608 // Arguments for generated stub:
1618 1609 // to: O0
1619 1610 // value: O1
1620 1611 // count: O2 treated as signed
1621 1612 //
1622 1613 address generate_fill(BasicType t, bool aligned, const char* name) {
1623 1614 __ align(CodeEntryAlignment);
1624 1615 StubCodeMark mark(this, "StubRoutines", name);
1625 1616 address start = __ pc();
1626 1617
1627 1618 const Register to = O0; // destination array address
1628 1619 const Register value = O1; // fill value
1629 1620 const Register count = O2; // elements count
1630 1621 // O3 is used as a temp register
1631 1622
1632 1623 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1633 1624
1634 1625 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1635 1626 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1636 1627
1637 1628 int shift = -1;
1638 1629 switch (t) {
1639 1630 case T_BYTE:
1640 1631 shift = 2;
1641 1632 break;
1642 1633 case T_SHORT:
1643 1634 shift = 1;
1644 1635 break;
1645 1636 case T_INT:
1646 1637 shift = 0;
1647 1638 break;
1648 1639 default: ShouldNotReachHere();
1649 1640 }
1650 1641
1651 1642 BLOCK_COMMENT("Entry:");
1652 1643
1653 1644 if (t == T_BYTE) {
1654 1645 // Zero extend value
1655 1646 __ and3(value, 0xff, value);
1656 1647 __ sllx(value, 8, O3);
1657 1648 __ or3(value, O3, value);
1658 1649 }
1659 1650 if (t == T_SHORT) {
1660 1651 // Zero extend value
1661 1652 __ sllx(value, 48, value);
1662 1653 __ srlx(value, 48, value);
1663 1654 }
1664 1655 if (t == T_BYTE || t == T_SHORT) {
1665 1656 __ sllx(value, 16, O3);
1666 1657 __ or3(value, O3, value);
1667 1658 }
1668 1659
1669 1660 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1670 1661 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1671 1662 __ delayed()->andcc(count, 1, G0);
1672 1663
1673 1664 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1674 1665 // align destination address on a 4-byte boundary
1675 1666 if (t == T_BYTE) {
1676 1667 // One byte misalignment happens only for byte arrays
1677 1668 __ andcc(to, 1, G0);
1678 1669 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1679 1670 __ delayed()->nop();
1680 1671 __ stb(value, to, 0);
1681 1672 __ inc(to, 1);
1682 1673 __ dec(count, 1);
1683 1674 __ BIND(L_skip_align1);
1684 1675 }
1685 1676 // Two bytes misalignment happens only for byte and short (char) arrays
1686 1677 __ andcc(to, 2, G0);
1687 1678 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1688 1679 __ delayed()->nop();
1689 1680 __ sth(value, to, 0);
1690 1681 __ inc(to, 2);
1691 1682 __ dec(count, 1 << (shift - 1));
1692 1683 __ BIND(L_skip_align2);
1693 1684 }
1694 1685 #ifdef _LP64
1695 1686 if (!aligned) {
1696 1687 #endif
1697 1688 // align to 8 bytes, we know we are 4 byte aligned to start
1698 1689 __ andcc(to, 7, G0);
1699 1690 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1700 1691 __ delayed()->nop();
1701 1692 __ stw(value, to, 0);
1702 1693 __ inc(to, 4);
1703 1694 __ dec(count, 1 << shift);
1704 1695 __ BIND(L_fill_32_bytes);
1705 1696 #ifdef _LP64
1706 1697 }
1707 1698 #endif
1708 1699
1709 1700 if (t == T_INT) {
1710 1701 // Zero extend value
1711 1702 __ srl(value, 0, value);
1712 1703 }
1713 1704 if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1714 1705 __ sllx(value, 32, O3);
1715 1706 __ or3(value, O3, value);
1716 1707 }
1717 1708
1718 1709 Label L_check_fill_8_bytes;
1719 1710 // Fill 32-byte chunks
1720 1711 __ subcc(count, 8 << shift, count);
1721 1712 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1722 1713 __ delayed()->nop();
1723 1714
1724 1715 Label L_fill_32_bytes_loop, L_fill_4_bytes;
1725 1716 __ align(16);
1726 1717 __ BIND(L_fill_32_bytes_loop);
1727 1718
1728 1719 __ stx(value, to, 0);
1729 1720 __ stx(value, to, 8);
1730 1721 __ stx(value, to, 16);
1731 1722 __ stx(value, to, 24);
1732 1723
1733 1724 __ subcc(count, 8 << shift, count);
1734 1725 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1735 1726 __ delayed()->add(to, 32, to);
1736 1727
1737 1728 __ BIND(L_check_fill_8_bytes);
1738 1729 __ addcc(count, 8 << shift, count);
1739 1730 __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1740 1731 __ delayed()->subcc(count, 1 << (shift + 1), count);
1741 1732 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1742 1733 __ delayed()->andcc(count, 1<<shift, G0);
1743 1734
1744 1735 //
1745 1736 // length is too short, just fill 8 bytes at a time
1746 1737 //
1747 1738 Label L_fill_8_bytes_loop;
1748 1739 __ BIND(L_fill_8_bytes_loop);
1749 1740 __ stx(value, to, 0);
1750 1741 __ subcc(count, 1 << (shift + 1), count);
1751 1742 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1752 1743 __ delayed()->add(to, 8, to);
1753 1744
1754 1745 // fill trailing 4 bytes
1755 1746 __ andcc(count, 1<<shift, G0); // in delay slot of branches
1756 1747 if (t == T_INT) {
1757 1748 __ BIND(L_fill_elements);
1758 1749 }
1759 1750 __ BIND(L_fill_4_bytes);
1760 1751 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1761 1752 if (t == T_BYTE || t == T_SHORT) {
1762 1753 __ delayed()->andcc(count, 1<<(shift-1), G0);
1763 1754 } else {
1764 1755 __ delayed()->nop();
1765 1756 }
1766 1757 __ stw(value, to, 0);
1767 1758 if (t == T_BYTE || t == T_SHORT) {
1768 1759 __ inc(to, 4);
1769 1760 // fill trailing 2 bytes
1770 1761 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1771 1762 __ BIND(L_fill_2_bytes);
1772 1763 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1773 1764 __ delayed()->andcc(count, 1, count);
1774 1765 __ sth(value, to, 0);
1775 1766 if (t == T_BYTE) {
1776 1767 __ inc(to, 2);
1777 1768 // fill trailing byte
1778 1769 __ andcc(count, 1, count); // in delay slot of branches
1779 1770 __ BIND(L_fill_byte);
1780 1771 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1781 1772 __ delayed()->nop();
1782 1773 __ stb(value, to, 0);
1783 1774 } else {
1784 1775 __ BIND(L_fill_byte);
1785 1776 }
1786 1777 } else {
1787 1778 __ BIND(L_fill_2_bytes);
1788 1779 }
1789 1780 __ BIND(L_exit);
1790 1781 __ retl();
1791 1782 __ delayed()->nop();
1792 1783
1793 1784 // Handle fills of less than 8 bytes. Int is handled elsewhere.
1794 1785 if (t == T_BYTE) {
1795 1786 __ BIND(L_fill_elements);
1796 1787 Label L_fill_2, L_fill_4;
1797 1788 // in delay slot __ andcc(count, 1, G0);
1798 1789 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1799 1790 __ delayed()->andcc(count, 2, G0);
1800 1791 __ stb(value, to, 0);
1801 1792 __ inc(to, 1);
1802 1793 __ BIND(L_fill_2);
1803 1794 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1804 1795 __ delayed()->andcc(count, 4, G0);
1805 1796 __ stb(value, to, 0);
1806 1797 __ stb(value, to, 1);
1807 1798 __ inc(to, 2);
1808 1799 __ BIND(L_fill_4);
1809 1800 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1810 1801 __ delayed()->nop();
1811 1802 __ stb(value, to, 0);
1812 1803 __ stb(value, to, 1);
1813 1804 __ stb(value, to, 2);
1814 1805 __ retl();
1815 1806 __ delayed()->stb(value, to, 3);
1816 1807 }
1817 1808
1818 1809 if (t == T_SHORT) {
1819 1810 Label L_fill_2;
1820 1811 __ BIND(L_fill_elements);
1821 1812 // in delay slot __ andcc(count, 1, G0);
1822 1813 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1823 1814 __ delayed()->andcc(count, 2, G0);
1824 1815 __ sth(value, to, 0);
1825 1816 __ inc(to, 2);
1826 1817 __ BIND(L_fill_2);
1827 1818 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1828 1819 __ delayed()->nop();
1829 1820 __ sth(value, to, 0);
1830 1821 __ retl();
1831 1822 __ delayed()->sth(value, to, 2);
1832 1823 }
1833 1824 return start;
1834 1825 }
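The zero-extend/replicate sequence at the top of generate_fill() widens the fill value into a 64-bit store pattern, so each stx fills eight bytes regardless of element type. The same computation as a self-contained C++ sketch (t_log2 is an illustrative stand-in for the type encoding):

    #include <stdint.h>
    // t_log2: 0 = byte, 1 = short, 2 = int
    uint64_t fill_pattern(uint64_t v, int t_log2) {
      if (t_log2 == 0) { v &= 0xff;   v |= v << 8;  }   //  8 -> 16 bits
      if (t_log2 <= 1) { v &= 0xffff; v |= v << 16; }   // 16 -> 32 bits
      v &= 0xffffffff;
      v |= v << 32;                                     // 32 -> 64 bits
      return v;
    }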
... 299 lines elided ...
1835 1826
1836 1827 //
1837 1828 // Generate stub for conjoint short copy. If "aligned" is true, the
1838 1829 // "from" and "to" addresses are assumed to be heapword aligned.
1839 1830 //
1840 1831 // Arguments for generated stub:
1841 1832 // from: O0
1842 1833 // to: O1
1843 1834 // count: O2 treated as signed
1844 1835 //
1845 - address generate_conjoint_short_copy(bool aligned, const char * name) {
1836 + address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1837 + address *entry, const char *name) {
1846 1838 // Do reverse copy.
1847 1839
1848 1840 __ align(CodeEntryAlignment);
1849 1841 StubCodeMark mark(this, "StubRoutines", name);
1850 1842 address start = __ pc();
1851 - address nooverlap_target = aligned ?
1852 - StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1853 - disjoint_short_copy_entry;
1854 1843
1855 1844 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1856 1845 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1857 1846
1858 1847 const Register from = O0; // source array address
1859 1848 const Register to = O1; // destination array address
1860 1849 const Register count = O2; // elements count
1861 1850 const Register end_from = from; // source array end address
1862 1851 const Register end_to = to; // destination array end address
1863 1852
1864 1853 const Register byte_count = O3; // bytes count to copy
1865 1854
1866 1855 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1867 1856
1868 - if (!aligned) short_copy_entry = __ pc();
1869 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1870 - if (!aligned) BLOCK_COMMENT("Entry:");
1857 + if (entry != NULL) {
1858 + *entry = __ pc();
1859 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1860 + BLOCK_COMMENT("Entry:");
1861 + }
1871 1862
1872 1863 array_overlap_test(nooverlap_target, 1);
1873 1864
1874 1865 __ sllx(count, LogBytesPerShort, byte_count);
1875 1866 __ add(to, byte_count, end_to); // offset after last copied element
1876 1867
1877 1868 // for short arrays, just do single element copy
1878 1869 __ cmp(count, 11); // 8 + 3 (22 bytes)
1879 1870 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1880 1871 __ delayed()->add(from, byte_count, end_from);
1881 1872
1882 1873 {
1883 1874 // Align the ends of the arrays since they could be unaligned even
1884 1875 // when the arrays themselves are aligned.
1885 1876
1886 1877 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1887 1878 __ andcc(end_to, 3, G0);
1888 1879 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1889 1880 __ delayed()->lduh(end_from, -2, O3);
1890 1881 __ dec(end_from, 2);
1891 1882 __ dec(end_to, 2);
1892 1883 __ dec(count);
1893 1884 __ sth(O3, end_to, 0);
1894 1885 __ BIND(L_skip_alignment);
1895 1886
1896 1887 // copy 2 elements to align 'end_to' on an 8 byte boundary
1897 1888 __ andcc(end_to, 7, G0);
1898 1889 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1899 1890 __ delayed()->lduh(end_from, -2, O3);
1900 1891 __ dec(count, 2);
1901 1892 __ lduh(end_from, -4, O4);
1902 1893 __ dec(end_from, 4);
1903 1894 __ dec(end_to, 4);
1904 1895 __ sth(O3, end_to, 2);
1905 1896 __ sth(O4, end_to, 0);
1906 1897 __ BIND(L_skip_alignment2);
1907 1898 }
1908 1899 #ifdef _LP64
1909 1900 if (aligned) {
1910 1901 // Both arrays are aligned to 8 bytes in the 64-bit VM.
1911 1902 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1912 1903 // in the unaligned case.
1913 1904 __ dec(count, 8);
1914 1905 } else
1915 1906 #endif
1916 1907 {
1917 1908 // Copy with shift 16 bytes per iteration if arrays do not have
1918 1909 // the same alignment mod 8, otherwise jump to the next
1919 1910 // code for aligned copy (and subtracting 8 from 'count' before the jump).
1920 1911 // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1921 1912 // Also jump over aligned copy after the copy with shift completed.
1922 1913
1923 1914 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1924 1915 L_aligned_copy, L_copy_2_bytes);
1925 1916 }
1926 1917 // copy 4 elements (16 bytes) at a time
1927 1918 __ align(OptoLoopAlignment);
1928 1919 __ BIND(L_aligned_copy);
1929 1920 __ dec(end_from, 16);
1930 1921 __ ldx(end_from, 8, O3);
1931 1922 __ ldx(end_from, 0, O4);
1932 1923 __ dec(end_to, 16);
1933 1924 __ deccc(count, 8);
1934 1925 __ stx(O3, end_to, 8);
1935 1926 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1936 1927 __ delayed()->stx(O4, end_to, 0);
1937 1928 __ inc(count, 8);
1938 1929
1939 1930 // copy 1 element (2 bytes) at a time
1940 1931 __ BIND(L_copy_2_bytes);
1941 1932 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
1942 1933 __ delayed()->nop();
1943 1934 __ BIND(L_copy_2_bytes_loop);
1944 1935 __ dec(end_from, 2);
1945 1936 __ dec(end_to, 2);
1946 1937 __ lduh(end_from, 0, O4);
1947 1938 __ deccc(count);
1948 1939 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1949 1940 __ delayed()->sth(O4, end_to, 0);
1950 1941
1951 1942 __ BIND(L_exit);
1952 1943 // O3, O4 are used as temp registers
1953 1944 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1954 1945 __ retl();
1955 1946 __ delayed()->mov(G0, O0); // return 0
1956 1947 return start;
1957 1948 }
1958 1949
1959 1950 //
1960 1951 // Generate core code for disjoint int copy (and oop copy on 32-bit).
1961 1952 // If "aligned" is true, the "from" and "to" addresses are assumed
1962 1953 // to be heapword aligned.
1963 1954 //
1964 1955 // Arguments:
1965 1956 // from: O0
1966 1957 // to: O1
1967 1958 // count: O2 treated as signed
1968 1959 //
1969 1960 void generate_disjoint_int_copy_core(bool aligned) {
1970 1961
1971 1962 Label L_skip_alignment, L_aligned_copy;
1972 1963 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1973 1964
1974 1965 const Register from = O0; // source array address
1975 1966 const Register to = O1; // destination array address
1976 1967 const Register count = O2; // elements count
1977 1968 const Register offset = O5; // offset from start of arrays
1978 1969 // O3, O4, G3, G4 are used as temp registers
1979 1970
1980 1971 // 'aligned' == true when it is known statically during compilation
1981 1972 // of this arraycopy call site that both 'from' and 'to' addresses
1982 1973 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1983 1974 //
1984 1975 // Aligned arrays have 4-byte alignment in the 32-bit VM
1985 1976 // and 8-byte alignment in the 64-bit VM.
1986 1977 //
1987 1978 #ifdef _LP64
1988 1979 if (!aligned)
1989 1980 #endif
1990 1981 {
1991 1982 // The next check could be put under 'ifndef' since the code in
1992 1983 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1993 1984
1994 1985 // for short arrays, just do single element copy
1995 1986 __ cmp(count, 5); // 4 + 1 (20 bytes)
1996 1987 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1997 1988 __ delayed()->mov(G0, offset);
1998 1989
1999 1990 // copy 1 element to align 'to' on an 8 byte boundary
2000 1991 __ andcc(to, 7, G0);
2001 1992 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2002 1993 __ delayed()->ld(from, 0, O3);
2003 1994 __ inc(from, 4);
2004 1995 __ inc(to, 4);
2005 1996 __ dec(count);
2006 1997 __ st(O3, to, -4);
2007 1998 __ BIND(L_skip_alignment);
2008 1999
2009 2000 // if arrays have same alignment mod 8, do 4 elements copy
2010 2001 __ andcc(from, 7, G0);
2011 2002 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2012 2003 __ delayed()->ld(from, 0, O3);
2013 2004
2014 2005 //
2015 2006 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2016 2007 // to form 2 aligned 8-bytes chunks to store.
2017 2008 //
2018 2009 // copy_16_bytes_forward_with_shift() is not used here since this
2019 2010 // code is faster.
2020 2011
2021 2012 // copy with shift 4 elements (16 bytes) at a time
2022 2013 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
2023 2014
2024 2015 __ align(OptoLoopAlignment);
2025 2016 __ BIND(L_copy_16_bytes);
2026 2017 __ ldx(from, 4, O4);
2027 2018 __ deccc(count, 4); // Can we do next iteration after this one?
2028 2019 __ ldx(from, 12, G4);
2029 2020 __ inc(to, 16);
2030 2021 __ inc(from, 16);
2031 2022 __ sllx(O3, 32, O3);
2032 2023 __ srlx(O4, 32, G3);
2033 2024 __ bset(G3, O3);
2034 2025 __ stx(O3, to, -16);
2035 2026 __ sllx(O4, 32, O4);
2036 2027 __ srlx(G4, 32, G3);
2037 2028 __ bset(G3, O4);
2038 2029 __ stx(O4, to, -8);
2039 2030 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2040 2031 __ delayed()->mov(G4, O3);
2041 2032
2042 2033 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2043 2034 __ delayed()->inc(count, 4); // restore 'count'
2044 2035
2045 2036 __ BIND(L_aligned_copy);
2046 2037 }
2047 2038 // copy 4 elements (16 bytes) at a time
2048 2039 __ and3(count, 1, G4); // Save
2049 2040 __ srl(count, 1, count);
2050 2041 generate_disjoint_long_copy_core(aligned);
2051 2042 __ mov(G4, count); // Restore
2052 2043
2053 2044 // copy 1 element at a time
2054 2045 __ BIND(L_copy_4_bytes);
2055 2046 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
2056 2047 __ delayed()->nop();
2057 2048 __ BIND(L_copy_4_bytes_loop);
2058 2049 __ ld(from, offset, O3);
2059 2050 __ deccc(count);
2060 2051 __ st(O3, to, offset);
2061 2052 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2062 2053 __ delayed()->inc(offset, 4);
2063 2054 __ BIND(L_exit);
2064 2055 }
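The sllx/srlx/bset cluster above merges two aligned 8-byte loads into each aligned 8-byte store when 'from' is only 4-byte aligned. A hedged C++ sketch of the emitted loop, written for a big-endian machine like SPARC (pointer casts and names illustrative):

    #include <stdint.h>
    #include <stddef.h>
    // Assumes: to % 8 == 0, from % 8 == 4, count >= 4 ints, big-endian host.
    static void copy_ints_shifted(const char* from, char* to, ptrdiff_t count) {
      uint32_t head = *(const uint32_t*)from;       // O3, preloaded in a delay slot
      count -= 4;
      do {
        uint64_t a = *(const uint64_t*)(from + 4);  // O4, aligned load
        uint64_t b = *(const uint64_t*)(from + 12); // G4, aligned load
        *(uint64_t*)(to + 0) = ((uint64_t)head << 32) | (a >> 32);
        *(uint64_t*)(to + 8) = (a << 32) | (b >> 32);
        head = (uint32_t)b;                         // carried into the next pass
        from += 16; to += 16; count -= 4;
      } while (count >= 0);
      // count + 4 elements remain for the 1-element tail loop
    }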
... 184 lines elided ...
2065 2056
2066 2057 //
2067 2058 // Generate stub for disjoint int copy. If "aligned" is true, the
2068 2059 // "from" and "to" addresses are assumed to be heapword aligned.
2069 2060 //
2070 2061 // Arguments for generated stub:
2071 2062 // from: O0
2072 2063 // to: O1
2073 2064 // count: O2 treated as signed
2074 2065 //
2075 - address generate_disjoint_int_copy(bool aligned, const char * name) {
2066 + address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2076 2067 __ align(CodeEntryAlignment);
2077 2068 StubCodeMark mark(this, "StubRoutines", name);
2078 2069 address start = __ pc();
2079 2070
2080 2071 const Register count = O2;
2081 2072 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2082 2073
2083 - if (!aligned) disjoint_int_copy_entry = __ pc();
2084 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2085 - if (!aligned) BLOCK_COMMENT("Entry:");
2074 + if (entry != NULL) {
2075 + *entry = __ pc();
2076 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2077 + BLOCK_COMMENT("Entry:");
2078 + }
2086 2079
2087 2080 generate_disjoint_int_copy_core(aligned);
2088 2081
2089 2082 // O3, O4 are used as temp registers
2090 2083 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2091 2084 __ retl();
2092 2085 __ delayed()->mov(G0, O0); // return 0
2093 2086 return start;
2094 2087 }
2095 2088
2096 2089 //
2097 2090 // Generate core code for conjoint int copy (and oop copy on 32-bit).
2098 2091 // If "aligned" is true, the "from" and "to" addresses are assumed
2099 2092 // to be heapword aligned.
2100 2093 //
2101 2094 // Arguments:
2102 2095 // from: O0
2103 2096 // to: O1
2104 2097 // count: O2 treated as signed
2105 2098 //
2106 2099 void generate_conjoint_int_copy_core(bool aligned) {
2107 2100 // Do reverse copy.
2108 2101
2109 2102 Label L_skip_alignment, L_aligned_copy;
2110 2103 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2111 2104
2112 2105 const Register from = O0; // source array address
2113 2106 const Register to = O1; // destination array address
2114 2107 const Register count = O2; // elements count
2115 2108 const Register end_from = from; // source array end address
2116 2109 const Register end_to = to; // destination array end address
2117 2110 // O3, O4, O5, G3 are used as temp registers
2118 2111
2119 2112 const Register byte_count = O3; // bytes count to copy
2120 2113
2121 2114 __ sllx(count, LogBytesPerInt, byte_count);
2122 2115 __ add(to, byte_count, end_to); // offset after last copied element
2123 2116
2124 2117 __ cmp(count, 5); // for short arrays, just do single element copy
2125 2118 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2126 2119 __ delayed()->add(from, byte_count, end_from);
2127 2120
2128 2121 // copy 1 element to align 'to' on an 8 byte boundary
2129 2122 __ andcc(end_to, 7, G0);
2130 2123 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2131 2124 __ delayed()->nop();
2132 2125 __ dec(count);
2133 2126 __ dec(end_from, 4);
2134 2127 __ dec(end_to, 4);
2135 2128 __ ld(end_from, 0, O4);
2136 2129 __ st(O4, end_to, 0);
2137 2130 __ BIND(L_skip_alignment);
2138 2131
2139 2132 // Check if 'end_from' and 'end_to' have the same alignment.
2140 2133 __ andcc(end_from, 7, G0);
2141 2134 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2142 2135 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2143 2136
2144 2137 // copy with shift 4 elements (16 bytes) at a time
2145 2138 //
2146 2139 // Load 2 aligned 8-bytes chunks and use one from previous iteration
2147 2140 // to form 2 aligned 8-bytes chunks to store.
2148 2141 //
2149 2142 __ ldx(end_from, -4, O3);
2150 2143 __ align(OptoLoopAlignment);
2151 2144 __ BIND(L_copy_16_bytes);
2152 2145 __ ldx(end_from, -12, O4);
2153 2146 __ deccc(count, 4);
2154 2147 __ ldx(end_from, -20, O5);
2155 2148 __ dec(end_to, 16);
2156 2149 __ dec(end_from, 16);
2157 2150 __ srlx(O3, 32, O3);
2158 2151 __ sllx(O4, 32, G3);
2159 2152 __ bset(G3, O3);
2160 2153 __ stx(O3, end_to, 8);
2161 2154 __ srlx(O4, 32, O4);
2162 2155 __ sllx(O5, 32, G3);
2163 2156 __ bset(O4, G3);
2164 2157 __ stx(G3, end_to, 0);
2165 2158 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2166 2159 __ delayed()->mov(O5, O3);
2167 2160
2168 2161 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2169 2162 __ delayed()->inc(count, 4);
2170 2163
2171 2164 // copy 4 elements (16 bytes) at a time
2172 2165 __ align(OptoLoopAlignment);
2173 2166 __ BIND(L_aligned_copy);
2174 2167 __ dec(end_from, 16);
2175 2168 __ ldx(end_from, 8, O3);
2176 2169 __ ldx(end_from, 0, O4);
2177 2170 __ dec(end_to, 16);
2178 2171 __ deccc(count, 4);
2179 2172 __ stx(O3, end_to, 8);
2180 2173 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2181 2174 __ delayed()->stx(O4, end_to, 0);
2182 2175 __ inc(count, 4);
2183 2176
2184 2177 // copy 1 element (4 bytes) at a time
2185 2178 __ BIND(L_copy_4_bytes);
2186 2179 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
2187 2180 __ delayed()->nop();
2188 2181 __ BIND(L_copy_4_bytes_loop);
2189 2182 __ dec(end_from, 4);
2190 2183 __ dec(end_to, 4);
2191 2184 __ ld(end_from, 0, O4);
2192 2185 __ deccc(count);
2193 2186 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2194 2187 __ delayed()->st(O4, end_to, 0);
2195 2188 __ BIND(L_exit);
2196 2189 }
... 101 lines elided ...
2197 2190
2198 2191 //
2199 2192 // Generate stub for conjoint int copy. If "aligned" is true, the
2200 2193 // "from" and "to" addresses are assumed to be heapword aligned.
2201 2194 //
2202 2195 // Arguments for generated stub:
2203 2196 // from: O0
2204 2197 // to: O1
2205 2198 // count: O2 treated as signed
2206 2199 //
2207 - address generate_conjoint_int_copy(bool aligned, const char * name) {
2200 + address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2201 + address *entry, const char *name) {
2208 2202 __ align(CodeEntryAlignment);
2209 2203 StubCodeMark mark(this, "StubRoutines", name);
2210 2204 address start = __ pc();
2211 2205
2212 - address nooverlap_target = aligned ?
2213 - StubRoutines::arrayof_jint_disjoint_arraycopy() :
2214 - disjoint_int_copy_entry;
2215 -
2216 2206 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2217 2207
2218 - if (!aligned) int_copy_entry = __ pc();
2219 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2220 - if (!aligned) BLOCK_COMMENT("Entry:");
2208 + if (entry != NULL) {
2209 + *entry = __ pc();
2210 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2211 + BLOCK_COMMENT("Entry:");
2212 + }
2221 2213
2222 2214 array_overlap_test(nooverlap_target, 2);
2223 2215
2224 2216 generate_conjoint_int_copy_core(aligned);
2225 2217
2226 2218 // O3, O4 are used as temp registers
2227 2219 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2228 2220 __ retl();
2229 2221 __ delayed()->mov(G0, O0); // return 0
2230 2222 return start;
2231 2223 }
2232 2224
2233 2225 //
2234 2226 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2235 2227 // "aligned" is ignored, because we must make the stronger
2236 2228 // assumption that both addresses are always 64-bit aligned.
2237 2229 //
2238 2230 // Arguments:
2239 2231 // from: O0
2240 2232 // to: O1
2241 2233 // count: O2 treated as signed
2242 2234 //
2243 2235 // count -= 2;
2244 2236 // if ( count >= 0 ) { // >= 2 elements
2245 2237 // if ( count > 6) { // >= 8 elements
2246 2238 // count -= 6; // original count - 8
2247 2239 // do {
2248 2240 // copy_8_elements;
2249 2241 // count -= 8;
2250 2242 // } while ( count >= 0 );
2251 2243 // count += 6;
2252 2244 // }
2253 2245 // if ( count >= 0 ) { // >= 2 elements
2254 2246 // do {
2255 2247 // copy_2_elements;
2256 2248 // } while ( (count=count-2) >= 0 );
2257 2249 // }
2258 2250 // }
2259 2251 // count += 2;
2260 2252 // if ( count != 0 ) { // 1 element left
2261 2253 // copy_1_element;
2262 2254 // }
2263 2255 //
2264 2256 void generate_disjoint_long_copy_core(bool aligned) {
2265 2257 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2266 2258 const Register from = O0; // source array address
2267 2259 const Register to = O1; // destination array address
2268 2260 const Register count = O2; // elements count
2269 2261 const Register offset0 = O4; // element offset
2270 2262 const Register offset8 = O5; // next element offset
2271 2263
2272 2264 __ deccc(count, 2);
2273 2265 __ mov(G0, offset0); // offset from start of arrays (0)
2274 2266 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2275 2267 __ delayed()->add(offset0, 8, offset8);
2276 2268
2277 2269 // Copy by 64 bytes chunks
2278 2270 Label L_copy_64_bytes;
2279 2271 const Register from64 = O3; // source address
2280 2272 const Register to64 = G3; // destination address
2281 2273 __ subcc(count, 6, O3);
2282 2274 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2283 2275 __ delayed()->mov(to, to64);
2284 2276 // Now we can use O4(offset0), O5(offset8) as temps
2285 2277 __ mov(O3, count);
2286 2278 __ mov(from, from64);
2287 2279
2288 2280 __ align(OptoLoopAlignment);
2289 2281 __ BIND(L_copy_64_bytes);
2290 2282 for( int off = 0; off < 64; off += 16 ) {
2291 2283 __ ldx(from64, off+0, O4);
2292 2284 __ ldx(from64, off+8, O5);
2293 2285 __ stx(O4, to64, off+0);
2294 2286 __ stx(O5, to64, off+8);
2295 2287 }
2296 2288 __ deccc(count, 8);
2297 2289 __ inc(from64, 64);
2298 2290 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
2299 2291 __ delayed()->inc(to64, 64);
2300 2292
2301 2293 // Restore O4(offset0), O5(offset8)
2302 2294 __ sub(from64, from, offset0);
2303 2295 __ inccc(count, 6);
2304 2296 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2305 2297 __ delayed()->add(offset0, 8, offset8);
2306 2298
2307 2299 // Copy by 16 bytes chunks
2308 2300 __ align(OptoLoopAlignment);
2309 2301 __ BIND(L_copy_16_bytes);
2310 2302 __ ldx(from, offset0, O3);
2311 2303 __ ldx(from, offset8, G3);
2312 2304 __ deccc(count, 2);
2313 2305 __ stx(O3, to, offset0);
2314 2306 __ inc(offset0, 16);
2315 2307 __ stx(G3, to, offset8);
2316 2308 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2317 2309 __ delayed()->inc(offset8, 16);
2318 2310
2319 2311 // Copy last 8 bytes
2320 2312 __ BIND(L_copy_8_bytes);
2321 2313 __ inccc(count, 2);
2322 2314 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2323 2315 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2324 2316 __ ldx(from, offset0, O3);
2325 2317 __ stx(O3, to, offset0);
2326 2318 __ BIND(L_exit);
2327 2319 }
2328 2320
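Note that the C++ for-loop over 'off' above executes when the stub is generated, not when it runs: it emits a fully unrolled body of eight ldx/stx pairs. The emitted main loop behaves like this C sketch (64 bytes per pass; count in 8-byte elements):

    do {
      for (int off = 0; off < 64; off += 8)          // fully unrolled in the stub
        *(uint64_t*)(to64 + off) = *(const uint64_t*)(from64 + off);
      from64 += 64; to64 += 64; count -= 8;
    } while (count >= 0);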
... 98 lines elided ...
2329 2321 //
2330 2322 // Generate stub for disjoint long copy.
2331 2323 // "aligned" is ignored, because we must make the stronger
2332 2324 // assumption that both addresses are always 64-bit aligned.
2333 2325 //
2334 2326 // Arguments for generated stub:
2335 2327 // from: O0
2336 2328 // to: O1
2337 2329 // count: O2 treated as signed
2338 2330 //
2339 - address generate_disjoint_long_copy(bool aligned, const char * name) {
2331 + address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2340 2332 __ align(CodeEntryAlignment);
2341 2333 StubCodeMark mark(this, "StubRoutines", name);
2342 2334 address start = __ pc();
2343 2335
2344 2336 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2345 2337
2346 - if (!aligned) disjoint_long_copy_entry = __ pc();
2347 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2348 - if (!aligned) BLOCK_COMMENT("Entry:");
2338 + if (entry != NULL) {
2339 + *entry = __ pc();
2340 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2341 + BLOCK_COMMENT("Entry:");
2342 + }
2349 2343
2350 2344 generate_disjoint_long_copy_core(aligned);
2351 2345
2352 2346 // O3, O4 are used as temp registers
2353 2347 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2354 2348 __ retl();
2355 2349 __ delayed()->mov(G0, O0); // return 0
2356 2350 return start;
2357 2351 }
2358 2352
2359 2353 //
2360 2354 // Generate core code for conjoint long copy (and oop copy on 64-bit).
2361 2355 // "aligned" is ignored, because we must make the stronger
2362 2356 // assumption that both addresses are always 64-bit aligned.
2363 2357 //
2364 2358 // Arguments:
2365 2359 // from: O0
2366 2360 // to: O1
2367 2361 // count: O2 treated as signed
2368 2362 //
2369 2363 void generate_conjoint_long_copy_core(bool aligned) {
2370 2364 // Do reverse copy.
2371 2365 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2372 2366 const Register from = O0; // source array address
2373 2367 const Register to = O1; // destination array address
2374 2368 const Register count = O2; // elements count
2375 2369 const Register offset8 = O4; // element offset
2376 2370 const Register offset0 = O5; // previous element offset
2377 2371
2378 2372 __ subcc(count, 1, count);
2379 2373 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2380 2374 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2381 2375 __ sub(offset8, 8, offset0);
2382 2376 __ align(OptoLoopAlignment);
2383 2377 __ BIND(L_copy_16_bytes);
2384 2378 __ ldx(from, offset8, O2);
2385 2379 __ ldx(from, offset0, O3);
2386 2380 __ stx(O2, to, offset8);
2387 2381 __ deccc(offset8, 16); // use offset8 as counter
2388 2382 __ stx(O3, to, offset0);
2389 2383 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2390 2384 __ delayed()->dec(offset0, 16);
2391 2385
2392 2386 __ BIND(L_copy_8_bytes);
2393 2387 __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2394 2388 __ delayed()->nop();
2395 2389 __ ldx(from, 0, O3);
2396 2390 __ stx(O3, to, 0);
2397 2391 __ BIND(L_exit);
2398 2392 }
... 40 lines elided ...
2399 2393
2400 2394 // Generate stub for conjoint long copy.
2401 2395 // "aligned" is ignored, because we must make the stronger
2402 2396 // assumption that both addresses are always 64-bit aligned.
2403 2397 //
2404 2398 // Arguments for generated stub:
2405 2399 // from: O0
2406 2400 // to: O1
2407 2401 // count: O2 treated as signed
2408 2402 //
2409 - address generate_conjoint_long_copy(bool aligned, const char * name) {
2403 + address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2404 + address *entry, const char *name) {
2410 2405 __ align(CodeEntryAlignment);
2411 2406 StubCodeMark mark(this, "StubRoutines", name);
2412 2407 address start = __ pc();
2413 2408
2414 2409 assert(!aligned, "usage");
2415 - address nooverlap_target = disjoint_long_copy_entry;
2416 2410
2417 2411 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2418 2412
2419 - if (!aligned) long_copy_entry = __ pc();
2420 - // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2421 - if (!aligned) BLOCK_COMMENT("Entry:");
2413 + if (entry != NULL) {
2414 + *entry = __ pc();
2415 + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2416 + BLOCK_COMMENT("Entry:");
2417 + }
2422 2418
2423 2419 array_overlap_test(nooverlap_target, 3);
2424 2420
2425 2421 generate_conjoint_long_copy_core(aligned);
2426 2422
2427 2423 // O3, O4 are used as temp registers
2428 2424 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2429 2425 __ retl();
2430 2426 __ delayed()->mov(G0, O0); // return 0
2431 2427 return start;
2432 2428 }
2433 2429
2434 2430 // Generate stub for disjoint oop copy. If "aligned" is true, the
2435 2431 // "from" and "to" addresses are assumed to be heapword aligned.
2436 2432 //
2437 2433 // Arguments for generated stub:
2438 2434 // from: O0
2439 2435 // to: O1
2440 2436 // count: O2 treated as signed
2441 2437 //
2442 - address generate_disjoint_oop_copy(bool aligned, const char * name) {
2438 + address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name) {
2443 2439
2444 2440 const Register from = O0; // source array address
2445 2441 const Register to = O1; // destination array address
2446 2442 const Register count = O2; // elements count
2447 2443
2448 2444 __ align(CodeEntryAlignment);
2449 2445 StubCodeMark mark(this, "StubRoutines", name);
2450 2446 address start = __ pc();
2451 2447
2452 2448 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2453 2449
2454 - if (!aligned) disjoint_oop_copy_entry = __ pc();
2455 - // caller can pass a 64-bit byte count here
2456 - if (!aligned) BLOCK_COMMENT("Entry:");
2450 + if (entry != NULL) {
2451 + *entry = __ pc();
2452 + // caller can pass a 64-bit byte count here
2453 + BLOCK_COMMENT("Entry:");
2454 + }
2457 2455
2458 2456 // save arguments for barrier generation
2459 2457 __ mov(to, G1);
2460 2458 __ mov(count, G5);
2461 2459 gen_write_ref_array_pre_barrier(G1, G5);
2462 2460 #ifdef _LP64
2463 2461 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2464 2462 if (UseCompressedOops) {
2465 2463 generate_disjoint_int_copy_core(aligned);
2466 2464 } else {
2467 2465 generate_disjoint_long_copy_core(aligned);
2468 2466 }
2469 2467 #else
2470 2468 generate_disjoint_int_copy_core(aligned);
2471 2469 #endif
2472 2470 // O0 is used as temp register
2473 2471 gen_write_ref_array_post_barrier(G1, G5, O0);
2474 2472
2475 2473 // O3, O4 are used as temp registers
2476 2474 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2477 2475 __ retl();
2478 2476 __ delayed()->mov(G0, O0); // return 0
2479 2477 return start;
... 13 lines elided ...
2480 2478 }
2481 2479
2482 2480 // Generate stub for conjoint oop copy. If "aligned" is true, the
2483 2481 // "from" and "to" addresses are assumed to be heapword aligned.
2484 2482 //
2485 2483 // Arguments for generated stub:
2486 2484 // from: O0
2487 2485 // to: O1
2488 2486 // count: O2 treated as signed
2489 2487 //
2490 - address generate_conjoint_oop_copy(bool aligned, const char * name) {
2488 + address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2489 + address *entry, const char *name) {
2491 2490
2492 2491 const Register from = O0; // source array address
2493 2492 const Register to = O1; // destination array address
2494 2493 const Register count = O2; // elements count
2495 2494
2496 2495 __ align(CodeEntryAlignment);
2497 2496 StubCodeMark mark(this, "StubRoutines", name);
2498 2497 address start = __ pc();
2499 2498
2500 2499 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2501 2500
2502 - if (!aligned) oop_copy_entry = __ pc();
2503 - // caller can pass a 64-bit byte count here
2504 - if (!aligned) BLOCK_COMMENT("Entry:");
2501 + if (entry != NULL) {
2502 + *entry = __ pc();
2503 + // caller can pass a 64-bit byte count here
2504 + BLOCK_COMMENT("Entry:");
2505 + }
2506 +
2507 + array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2505 2508
2506 2509 // save arguments for barrier generation
2507 2510 __ mov(to, G1);
2508 2511 __ mov(count, G5);
2509 -
2510 2512 gen_write_ref_array_pre_barrier(G1, G5);
2511 2513
2512 - address nooverlap_target = aligned ?
2513 - StubRoutines::arrayof_oop_disjoint_arraycopy() :
2514 - disjoint_oop_copy_entry;
2515 -
2516 - array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2517 -
2518 2514 #ifdef _LP64
2519 2515 if (UseCompressedOops) {
2520 2516 generate_conjoint_int_copy_core(aligned);
2521 2517 } else {
2522 2518 generate_conjoint_long_copy_core(aligned);
2523 2519 }
2524 2520 #else
2525 2521 generate_conjoint_int_copy_core(aligned);
2526 2522 #endif
2527 2523
2528 2524 // O0 is used as temp register
2529 2525 gen_write_ref_array_post_barrier(G1, G5, O0);
2530 2526
2531 2527 // O3, O4 are used as temp registers
2532 2528 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2533 2529 __ retl();
2534 2530 __ delayed()->mov(G0, O0); // return 0
2535 2531 return start;
2536 2532 }
2537 2533
2538 2534
2539 2535 // Helper for generating a dynamic type check.
2540 2536 // Smashes only the given temp registers.
2541 2537 void generate_type_check(Register sub_klass,
2542 2538 Register super_check_offset,
2543 2539 Register super_klass,
2544 2540 Register temp,
2545 2541 Label& L_success) {
2546 2542 assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2547 2543
2548 2544 BLOCK_COMMENT("type_check:");
2549 2545
2550 2546 Label L_miss, L_pop_to_miss;
2551 2547
2552 2548 assert_clean_int(super_check_offset, temp);
2553 2549
2554 2550 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2555 2551 &L_success, &L_miss, NULL,
2556 2552 super_check_offset);
2557 2553
2558 2554 BLOCK_COMMENT("type_check_slow_path:");
2559 2555 __ save_frame(0);
2560 2556 __ check_klass_subtype_slow_path(sub_klass->after_save(),
2561 2557 super_klass->after_save(),
2562 2558 L0, L1, L2, L4,
2563 2559 NULL, &L_pop_to_miss);
2564 2560 __ ba(false, L_success);
2565 2561 __ delayed()->restore();
2566 2562
2567 2563 __ bind(L_pop_to_miss);
2568 2564 __ restore();
2569 2565
2570 2566 // Fall through on failure!
2571 2567 __ BIND(L_miss);
2572 2568 }
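For orientation, the split between check_klass_subtype_fast_path() and the frame-saving slow path corresponds roughly to this hedged C++ sketch (field access and helper names illustrative; the real fast path also special-cases the secondary-supers cache):

    bool is_subtype(Klass* sub, Klass* super, int super_check_offset) {
      if (sub == super) return true;                        // trivial hit
      Klass* probe = *(Klass**)((char*)sub + super_check_offset);
      if (probe == super) return true;                      // primary-supers or cache hit
      return scan_secondary_supers(sub, super);             // slow path (saves a frame here)
    }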
2573 2569
2574 2570
... 47 lines elided ...
2575 2571 // Generate stub for checked oop copy.
2576 2572 //
2577 2573 // Arguments for generated stub:
2578 2574 // from: O0
2579 2575 // to: O1
2580 2576 // count: O2 treated as signed
2581 2577 // ckoff: O3 (super_check_offset)
2582 2578 // ckval: O4 (super_klass)
2583 2579 // ret: O0 zero for success; (-1^K) where K is partial transfer count
2584 2580 //
2585 - address generate_checkcast_copy(const char* name) {
2581 + address generate_checkcast_copy(const char *name, address *entry) {
2586 2582
2587 2583 const Register O0_from = O0; // source array address
2588 2584 const Register O1_to = O1; // destination array address
2589 2585 const Register O2_count = O2; // elements count
2590 2586 const Register O3_ckoff = O3; // super_check_offset
2591 2587 const Register O4_ckval = O4; // super_klass
2592 2588
2593 2589 const Register O5_offset = O5; // loop var, with stride wordSize
2594 2590 const Register G1_remain = G1; // loop var, with stride -1
2595 2591 const Register G3_oop = G3; // actual oop copied
2596 2592 const Register G4_klass = G4; // oop._klass
2597 2593 const Register G5_super = G5; // oop._klass._primary_supers[ckval]
2598 2594
2599 2595 __ align(CodeEntryAlignment);
2600 2596 StubCodeMark mark(this, "StubRoutines", name);
2601 2597 address start = __ pc();
2602 2598
2603 - gen_write_ref_array_pre_barrier(O1, O2);
2604 -
2605 2599 #ifdef ASSERT
2606 2600 // We sometimes save a frame (see generate_type_check below).
2607 2601 // If this will cause trouble, let's fail now instead of later.
2608 2602 __ save_frame(0);
2609 2603 __ restore();
2610 2604 #endif
2611 2605
2612 2606 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int.
2613 2607
2614 2608 #ifdef ASSERT
2615 2609 // caller guarantees that the arrays really are different
2616 2610 // otherwise, we would have to make conjoint checks
2617 2611 { Label L;
... 3 lines elided ...
2618 2612 __ mov(O3, G1); // spill: overlap test smashes O3
2619 2613 __ mov(O4, G4); // spill: overlap test smashes O4
2620 2614 array_overlap_test(L, LogBytesPerHeapOop);
2621 2615 __ stop("checkcast_copy within a single array");
2622 2616 __ bind(L);
2623 2617 __ mov(G1, O3);
2624 2618 __ mov(G4, O4);
2625 2619 }
2626 2620 #endif //ASSERT
2627 2621
2628 - checkcast_copy_entry = __ pc();
2629 - // caller can pass a 64-bit byte count here (from generic stub)
2630 - BLOCK_COMMENT("Entry:");
2622 + if (entry != NULL) {
2623 + *entry = __ pc();
2624 + // caller can pass a 64-bit byte count here (from generic stub)
2625 + BLOCK_COMMENT("Entry:");
2626 + }
2627 +
2628 + gen_write_ref_array_pre_barrier(O1_to, O2_count);
2631 2629
2632 2630 Label load_element, store_element, do_card_marks, fail, done;
2633 2631 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it
2634 2632 __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2635 2633 __ delayed()->mov(G0, O5_offset); // offset from start of arrays
2636 2634
2637 2635 // Empty array: Nothing to do.
2638 2636 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2639 2637 __ retl();
2640 2638 __ delayed()->set(0, O0); // return 0 on (trivial) success
2641 2639
2642 2640 // ======== begin loop ========
2643 2641 // (Loop is rotated; its entry is load_element.)
2644 2642 // Loop variables:
2645 2643 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2646 2644 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2647 2645 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2648 2646 __ align(OptoLoopAlignment);
2649 2647
2650 2648 __ BIND(store_element);
2651 2649 __ deccc(G1_remain); // decrement the count
2652 2650 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2653 2651 __ inc(O5_offset, heapOopSize); // step to next offset
2654 2652 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2655 2653 __ delayed()->set(0, O0); // return 0 on success
2656 2654
2657 2655 // ======== loop entry is here ========
2658 2656 __ BIND(load_element);
2659 2657 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop
2660 2658 __ br_null(G3_oop, true, Assembler::pt, store_element);
2661 2659 __ delayed()->nop();
2662 2660
2663 2661 __ load_klass(G3_oop, G4_klass); // query the object klass
2664 2662
2665 2663 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2666 2664 // branch to this on success:
2667 2665 store_element);
2668 2666 // ======== end loop ========
2669 2667
2670 2668 // It was a real error; we must depend on the caller to finish the job.
2671 2669 // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2672 2670 // Emit GC store barriers for the oops we have copied (O2 minus G1),
2673 2671 // and report their number to the caller.
2674 2672 __ BIND(fail);
2675 2673 __ subcc(O2_count, G1_remain, O2_count);
2676 2674 __ brx(Assembler::zero, false, Assembler::pt, done);
2677 2675 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller
2678 2676
2679 2677 __ BIND(do_card_marks);
2680 2678 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2]
2681 2679
2682 2680 __ BIND(done);
2683 2681 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2684 2682 __ retl();
2685 2683 __ delayed()->nop(); // return value in O0
2686 2684
2687 2685 return start;
2688 2686 }
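Element by element, the rotated loop above implements what this Java-like sketch describes (names illustrative; 'copied' corresponds to count minus G1_remain):

    int copied = 0;
    for (; copied < count; copied++) {
      oop el = from[copied];
      if (el != null && !is_subtype(el.klass(), ckval)) break;  // stop at first bad element
      to[copied] = el;                                          // nulls always pass
    }
    post_barrier(to, copied);                 // card marks only for what was stored
    return (copied == count) ? 0 : ~copied;   // (-1 ^ K), K = elements copied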
2689 2687
2690 2688
2691 2689 // Generate 'unsafe' array copy stub
2692 2690 // Though just as safe as the other stubs, it takes an unscaled
... 52 lines elided ...
2693 2691 // size_t argument instead of an element count.
2694 2692 //
2695 2693 // Arguments for generated stub:
2696 2694 // from: O0
2697 2695 // to: O1
2698 2696 // count: O2 byte count, treated as ssize_t, can be zero
2699 2697 //
2700 2698 // Examines the alignment of the operands and dispatches
2701 2699 // to a long, int, short, or byte copy loop.
2702 2700 //
2703 - address generate_unsafe_copy(const char* name) {
2701 + address generate_unsafe_copy(const char* name,
2702 + address byte_copy_entry,
2703 + address short_copy_entry,
2704 + address int_copy_entry,
2705 + address long_copy_entry) {
2704 2706
2705 2707 const Register O0_from = O0; // source array address
2706 2708 const Register O1_to = O1; // destination array address
2707 2709 const Register O2_count = O2; // elements count
2708 2710
2709 2711 const Register G1_bits = G1; // test copy of low bits
2710 2712
2711 2713 __ align(CodeEntryAlignment);
2712 2714 StubCodeMark mark(this, "StubRoutines", name);
2713 2715 address start = __ pc();
2714 2716
2715 2717 // bump this on entry, not on exit:
2716 2718 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2717 2719
2718 2720 __ or3(O0_from, O1_to, G1_bits);
2719 2721 __ or3(O2_count, G1_bits, G1_bits);
2720 2722
2721 2723 __ btst(BytesPerLong-1, G1_bits);
2722 2724 __ br(Assembler::zero, true, Assembler::pt,
2723 2725 long_copy_entry, relocInfo::runtime_call_type);
2724 2726 // scale the count on the way out:
2725 2727 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2726 2728
2727 2729 __ btst(BytesPerInt-1, G1_bits);
2728 2730 __ br(Assembler::zero, true, Assembler::pt,
2729 2731 int_copy_entry, relocInfo::runtime_call_type);
2730 2732 // scale the count on the way out:
2731 2733 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2732 2734
2733 2735 __ btst(BytesPerShort-1, G1_bits);
2734 2736 __ br(Assembler::zero, true, Assembler::pt,
2735 2737 short_copy_entry, relocInfo::runtime_call_type);
2736 2738 // scale the count on the way out:
2737 2739 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2738 2740
2739 2741 __ br(Assembler::always, false, Assembler::pt,
2740 2742 byte_copy_entry, relocInfo::runtime_call_type);
2741 2743 __ delayed()->nop();
2742 2744
2743 2745 return start;
2744 2746 }
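The or3/btst chain above picks the widest element width that divides the source address, the destination address, and the byte count, scaling the count on the way into the chosen stub. In C terms (the real stub tail-branches instead of calling, and the srax preserves the sign of a negative count):

    uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
    if      ((bits & 7) == 0) return long_copy (from, to, count >> 3);
    else if ((bits & 3) == 0) return int_copy  (from, to, count >> 2);
    else if ((bits & 1) == 0) return short_copy(from, to, count >> 1);
    else                      return byte_copy (from, to, count);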
2745 2747
2746 2748
2747 2749 // Perform range checks on the proposed arraycopy.
2748 2750 // Kills the two temps, but nothing else.
2749 2751 // Also, clean the sign bits of src_pos and dst_pos.
2750 2752 void arraycopy_range_checks(Register src, // source array oop (O0)
2751 2753 Register src_pos, // source position (O1)
2752 2754 Register dst, // destination array oop (O2)
2753 2755 Register dst_pos, // destination position (O3)
2754 2756 Register length, // length of copy (O4)
2755 2757 Register temp1, Register temp2,
2756 2758 Label& L_failed) {
2757 2759 BLOCK_COMMENT("arraycopy_range_checks:");
2758 2760
2759 2761 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2760 2762
2761 2763 const Register array_length = temp1; // scratch
2762 2764 const Register end_pos = temp2; // scratch
2763 2765
2764 2766 // Note: This next instruction may be in the delay slot of a branch:
2765 2767 __ add(length, src_pos, end_pos); // src_pos + length
2766 2768 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2767 2769 __ cmp(end_pos, array_length);
2768 2770 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2769 2771
2770 2772 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2771 2773 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2772 2774 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2773 2775 __ cmp(end_pos, array_length);
2774 2776 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2775 2777
2776 2778 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2777 2779 // Move with sign extension can be used since they are positive.
2778 2780 __ delayed()->signx(src_pos, src_pos);
2779 2781 __ signx(dst_pos, dst_pos);
2780 2782
2781 2783 BLOCK_COMMENT("arraycopy_range_checks done");
2782 2784 }
2783 2785
2784 2786
2785 2787 //
2786 2788 // Generate generic array copy stubs
2787 2789 //
2788 2790 // Input:
... 75 lines elided ...
2789 2791 // O0 - src oop
2790 2792 // O1 - src_pos
2791 2793 // O2 - dst oop
2792 2794 // O3 - dst_pos
2793 2795 // O4 - element count
2794 2796 //
2795 2797 // Output:
2796 2798 // O0 == 0 - success
2797 2799 // O0 == -1 - need to call System.arraycopy
2798 2800 //
2799 - address generate_generic_copy(const char *name) {
2800 -
2801 + address generate_generic_copy(const char *name,
2802 + address entry_jbyte_arraycopy,
2803 + address entry_jshort_arraycopy,
2804 + address entry_jint_arraycopy,
2805 + address entry_oop_arraycopy,
2806 + address entry_jlong_arraycopy,
2807 + address entry_checkcast_arraycopy) {
2801 2808 Label L_failed, L_objArray;
2802 2809
2803 2810 // Input registers
2804 2811 const Register src = O0; // source array oop
2805 2812 const Register src_pos = O1; // source position
2806 2813 const Register dst = O2; // destination array oop
2807 2814 const Register dst_pos = O3; // destination position
2808 2815 const Register length = O4; // elements count
2809 2816
2810 2817 // registers used as temp
2811 2818 const Register G3_src_klass = G3; // source array klass
2812 2819 const Register G4_dst_klass = G4; // destination array klass
2813 2820 const Register G5_lh = G5; // layout handler
2814 2821 const Register O5_temp = O5;
2815 2822
2816 2823 __ align(CodeEntryAlignment);
2817 2824 StubCodeMark mark(this, "StubRoutines", name);
2818 2825 address start = __ pc();
2819 2826
2820 2827 // bump this on entry, not on exit:
2821 2828 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2822 2829
2823 2830 // In principle, the int arguments could be dirty.
2824 2831 //assert_clean_int(src_pos, G1);
2825 2832 //assert_clean_int(dst_pos, G1);
2826 2833 //assert_clean_int(length, G1);
2827 2834
2828 2835 //-----------------------------------------------------------------------
2829 2836 // Assembler stubs will be used for this call to arraycopy
2830 2837 // if the following conditions are met:
2831 2838 //
2832 2839 // (1) src and dst must not be null.
2833 2840 // (2) src_pos must not be negative.
2834 2841 // (3) dst_pos must not be negative.
2835 2842 // (4) length must not be negative.
2836 2843 // (5) src klass and dst klass should be the same and not NULL.
2837 2844 // (6) src and dst should be arrays.
2838 2845 // (7) src_pos + length must not exceed length of src.
2839 2846 // (8) dst_pos + length must not exceed length of dst.
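
Conditions (1)-(4) are handled first by the br_null/tst sequences below; as a plain C++ sketch (hypothetical helper, ignoring registers and delay slots):

  static int early_argument_checks(void* src, int src_pos,
                                   void* dst, int dst_pos, int length) {
    if (src == NULL || dst == NULL)               return -1;  // (1)
    if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;  // (2)-(4)
    return 0;  // (5)-(8) are checked later against the klasses and lengths
  }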
2840 2847 BLOCK_COMMENT("arraycopy initial argument checks");
2841 2848
2842 2849 // if (src == NULL) return -1;
2843 2850 __ br_null(src, false, Assembler::pn, L_failed);
2844 2851
2845 2852 // if (src_pos < 0) return -1;
2846 2853 __ delayed()->tst(src_pos);
2847 2854 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2848 2855 __ delayed()->nop();
2849 2856
2850 2857 // if (dst == NULL) return -1;
2851 2858 __ br_null(dst, false, Assembler::pn, L_failed);
2852 2859
2853 2860 // if (dst_pos < 0) return -1;
2854 2861 __ delayed()->tst(dst_pos);
2855 2862 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2856 2863
2857 2864 // if (length < 0) return -1;
2858 2865 __ delayed()->tst(length);
2859 2866 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2860 2867
2861 2868 BLOCK_COMMENT("arraycopy argument klass checks");
2862 2869 // get src->klass()
2863 2870 if (UseCompressedOops) {
2864 2871 __ delayed()->nop(); // ??? not good: load_klass expands to several instructions, so the delay slot is wasted
2865 2872 __ load_klass(src, G3_src_klass);
2866 2873 } else {
2867 2874 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2868 2875 }
2869 2876
2870 2877 #ifdef ASSERT
2871 2878 // assert(src->klass() != NULL);
2872 2879 BLOCK_COMMENT("assert klasses not null");
2873 2880 { Label L_a, L_b;
2874 2881 __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
2875 2882 __ delayed()->nop();
2876 2883 __ bind(L_a);
2877 2884 __ stop("broken null klass");
2878 2885 __ bind(L_b);
2879 2886 __ load_klass(dst, G4_dst_klass);
2880 2887 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2881 2888 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp
2882 2889 BLOCK_COMMENT("assert done");
2883 2890 }
2884 2891 #endif
2885 2892
2886 2893 // Load layout helper
2887 2894 //
2888 2895 //   |array_tag|     | header_size | element_type |     |log2_element_size|
2889 2896 //    32        30    24            16              8     2                0
2890 2897 //
2891 2898 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2892 2899 //
2893 2900
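
Decoding that word in plain C++ (a sketch mirroring the Klass::layout_helper_*() accessors and the srl/and3 sequence used further down):

  static inline void decode_layout_helper(jint lh, int& tag, int& header_size, int& log2_elsize) {
    tag         = (int)(((juint)lh) >> Klass::_lh_array_tag_shift);           // 0x3, 0x2, or 0x0
    header_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    log2_elsize =  lh & Klass::_lh_log2_element_size_mask;                    // 0..3
  }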
2894 2901 int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2895 2902 Klass::layout_helper_offset_in_bytes();
2896 2903
2897 2904 // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2898 2905 __ lduw(G3_src_klass, lh_offset, G5_lh);
2899 2906
2900 2907 if (UseCompressedOops) {
2901 2908 __ load_klass(dst, G4_dst_klass);
2902 2909 }
2903 2910 // Handle objArrays completely differently...
2904 2911 juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2905 2912 __ set(objArray_lh, O5_temp);
2906 2913 __ cmp(G5_lh, O5_temp);
2907 2914 __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2908 2915 if (UseCompressedOops) {
2909 2916 __ delayed()->nop();
2910 2917 } else {
2911 2918 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2912 2919 }
2913 2920
2914 2921 // if (src->klass() != dst->klass()) return -1;
2915 2922 __ cmp(G3_src_klass, G4_dst_klass);
2916 2923 __ brx(Assembler::notEqual, false, Assembler::pn, L_failed);
2917 2924 __ delayed()->nop();
2918 2925
2919 2926 // if (!src->is_Array()) return -1;
2920 2927 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2921 2928 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2922 2929
2923 2930 // At this point, it is known to be a typeArray (array_tag 0x3).
2924 2931 #ifdef ASSERT
2925 2932 __ delayed()->nop();
2926 2933 { Label L;
2927 2934 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2928 2935 __ set(lh_prim_tag_in_place, O5_temp);
2929 2936 __ cmp(G5_lh, O5_temp);
2930 2937 __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2931 2938 __ delayed()->nop();
2932 2939 __ stop("must be a primitive array");
2933 2940 __ bind(L);
2934 2941 }
2935 2942 #else
2936 2943 __ delayed(); // match next insn to prev branch
2937 2944 #endif
2938 2945
2939 2946 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2940 2947 O5_temp, G4_dst_klass, L_failed);
2941 2948
2942 2949 // typeArrayKlass
2943 2950 //
2944 2951 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2945 2952 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2946 2953 //
2947 2954
2948 2955 const Register G4_offset = G4_dst_klass; // array offset
2949 2956 const Register G3_elsize = G3_src_klass; // log2 element size
2950 2957
2951 2958 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2952 2959 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2953 2960 __ add(src, G4_offset, src); // src array offset
2954 2961 __ add(dst, G4_offset, dst); // dst array offset
2955 2962 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2956 2963
2957 2964 // The next registers must be set before the jump to the corresponding stub:
2958 2965 const Register from = O0; // source array address
2959 2966 const Register to = O1; // destination array address
2960 2967 const Register count = O2; // elements count
2961 2968
2962 2969 // The 'from', 'to', 'count' registers must be set in this order,
2963 2970 // since they alias 'src', 'src_pos', 'dst'.
2964 2971
2965 2972 BLOCK_COMMENT("scale indexes to element size");
2966 2973 __ sll_ptr(src_pos, G3_elsize, src_pos);
2967 2974 __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2968 2975 __ add(src, src_pos, from); // src_addr
2969 2976 __ add(dst, dst_pos, to); // dst_addr
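
In C terms this computes (a sketch; 'array_offset' is the header size extracted from the layout helper, already folded into 'src'/'dst' above):

  char* from = (char*)src + array_offset + ((size_t)src_pos << log2_elsize);  // src_addr
  char* to   = (char*)dst + array_offset + ((size_t)dst_pos << log2_elsize);  // dst_addr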
2970 2977
2971 2978 BLOCK_COMMENT("choose copy loop based on element size");
2972 2979 __ cmp(G3_elsize, 0);
2973 - __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jbyte_arraycopy);
2980 + __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2974 2981 __ delayed()->signx(length, count); // length
2975 2982
2976 2983 __ cmp(G3_elsize, LogBytesPerShort);
2977 - __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jshort_arraycopy);
2984 + __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2978 2985 __ delayed()->signx(length, count); // length
2979 2986
2980 2987 __ cmp(G3_elsize, LogBytesPerInt);
2981 - __ br(Assembler::equal,true,Assembler::pt,StubRoutines::_jint_arraycopy);
2988 + __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2982 2989 __ delayed()->signx(length, count); // length
2983 2990 #ifdef ASSERT
2984 2991 { Label L;
2985 2992 __ cmp(G3_elsize, LogBytesPerLong);
2986 2993 __ br(Assembler::equal, false, Assembler::pt, L);
2987 2994 __ delayed()->nop();
2988 2995 __ stop("must be long copy, but elsize is wrong");
2989 2996 __ bind(L);
2990 2997 }
2991 2998 #endif
2992 - __ br(Assembler::always,false,Assembler::pt,StubRoutines::_jlong_arraycopy);
2999 + __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2993 3000 __ delayed()->signx(length, count); // length
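
The four annulled branches above amount to a switch on the log2 element size; as a sketch (jbyte_copy and friends are hypothetical C-callable stand-ins for the stub entries):

  switch (log2_elsize) {
    case 0:                return jbyte_copy (from, to, count);
    case LogBytesPerShort: return jshort_copy(from, to, count);
    case LogBytesPerInt:   return jint_copy  (from, to, count);
    default:               return jlong_copy (from, to, count);  // asserted above
  }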
2994 3001
2995 3002 // objArrayKlass
2996 3003 __ BIND(L_objArray);
2997 3004 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2998 3005
2999 3006 Label L_plain_copy, L_checkcast_copy;
3000 3007 // test array classes for subtyping
3001 3008 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality
3002 3009 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3003 3010 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3004 3011
3005 3012 // Identically typed arrays can be copied without element-wise checks.
3006 3013 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3007 3014 O5_temp, G5_lh, L_failed);
3008 3015
3009 3016 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3010 3017 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3011 3018 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3012 3019 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3013 3020 __ add(src, src_pos, from); // src_addr
3014 3021 __ add(dst, dst_pos, to); // dst_addr
3015 3022 __ BIND(L_plain_copy);
3016 - __ br(Assembler::always, false, Assembler::pt,StubRoutines::_oop_arraycopy);
3023 + __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3017 3024 __ delayed()->signx(length, count); // length
3018 3025
3019 3026 __ BIND(L_checkcast_copy);
3020 3027 // live at this point: G3_src_klass, G4_dst_klass
3021 3028 {
3022 3029 // Before looking at dst.length, make sure dst is also an objArray.
3023 3030 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3024 3031 __ cmp(G5_lh, O5_temp);
3025 3032 __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3026 3033
3027 3034 // It is safe to examine both src.length and dst.length.
3028 3035 __ delayed(); // match next insn to prev branch
3029 3036 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3030 3037 O5_temp, G5_lh, L_failed);
3031 3038
3032 3039 // Marshal the base address arguments now, freeing registers.
3033 3040 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3034 3041 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3035 3042 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3036 3043 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3037 3044 __ add(src, src_pos, from); // src_addr
3038 3045 __ add(dst, dst_pos, to); // dst_addr
3039 3046 __ signx(length, count); // length (reloaded)
3040 3047
3041 3048 Register sco_temp = O3; // this register is free now
3042 3049 assert_different_registers(from, to, count, sco_temp,
3043 3050 G4_dst_klass, G3_src_klass);
3044 3051
3045 3052 // Generate the type check.
3046 3053 int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
3047 3054 Klass::super_check_offset_offset_in_bytes());
3048 3055 __ lduw(G4_dst_klass, sco_offset, sco_temp);
3049 3056 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3050 3057 O5_temp, L_plain_copy);
3051 3058
3052 3059 // Fetch destination element klass from the objArrayKlass header.
3053 3060 int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
3054 3061 objArrayKlass::element_klass_offset_in_bytes());
3055 3062
3056 3063 // the checkcast_copy loop needs two extra arguments:
3057 3064 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
3058 3065 // lduw(O4, sco_offset, O3); // sco of elem klass
3059 3066
3060 - __ br(Assembler::always, false, Assembler::pt, checkcast_copy_entry);
3067 + __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3061 3068 __ delayed()->lduw(O4, sco_offset, O3);
3062 3069 }
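
At the jump above, the checkcast stub's assumed C-level contract looks like this (a sketch, not the actual declaration; arguments are passed in O0-O4 as set up by the code in this block):

  // int checkcast_copy(oop* from /*O0*/, oop* to /*O1*/, size_t count /*O2*/,
  //                    int super_check_offset /*O3*/, Klass* dest_elem_klass /*O4*/);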
3063 3070
3064 3071 __ BIND(L_failed);
3065 3072 __ retl();
3066 3073 __ delayed()->sub(G0, 1, O0); // return -1
3067 3074 return start;
3068 3075 }
3069 3076
3070 3077 void generate_arraycopy_stubs() {
3078 + address entry;
3079 + address entry_jbyte_arraycopy;
3080 + address entry_jshort_arraycopy;
3081 + address entry_jint_arraycopy;
3082 + address entry_oop_arraycopy;
3083 + address entry_jlong_arraycopy;
3084 + address entry_checkcast_arraycopy;
3085 +
3086 + StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
3087 + "jbyte_disjoint_arraycopy");
3088 + StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3089 + "jbyte_arraycopy");
3090 + StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3091 + "jshort_disjoint_arraycopy");
3092 + StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3093 + "jshort_arraycopy");
3094 + StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
3095 + "jint_disjoint_arraycopy");
3096 + StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, &entry_jint_arraycopy,
3097 + "jint_arraycopy");
3098 + StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, &entry,
3099 + "jlong_disjoint_arraycopy");
3100 + StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, entry, &entry_jlong_arraycopy,
3101 + "jlong_arraycopy");
3102 + StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry,
3103 + "oop_disjoint_arraycopy");
3104 + StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3105 + "oop_arraycopy");
3106 +
3107 +
3108 + StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3109 + "arrayof_jbyte_disjoint_arraycopy");
3110 + StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
3111 + "arrayof_jbyte_arraycopy");
3112 +
3113 + StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3114 + "arrayof_jshort_disjoint_arraycopy");
3115 + StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
3116 + "arrayof_jshort_arraycopy");
3071 3117
3072 - // Note: the disjoint stubs must be generated first, some of
3073 - // the conjoint stubs use them.
3074 - StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
3075 - StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
3076 - StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
3077 - StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
3078 - StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy");
3079 - StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
3080 - StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
3081 - StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
3082 - StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
3083 - StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy");
3084 -
3085 - StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
3086 - StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
3087 - StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, "jint_arraycopy");
3088 - StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(false, "jlong_arraycopy");
3089 - StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, "oop_arraycopy");
3090 - StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
3091 - StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
3118 + StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3119 + "arrayof_jint_disjoint_arraycopy");
3092 3120 #ifdef _LP64
3093 3121 // since sizeof(jint) < sizeof(HeapWord), there's a different flavor:
3094 - StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
3122 + StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, NULL, "arrayof_jint_arraycopy");
3095 3123 #else
3096 3124 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
3097 3125 #endif
3126 +
3127 + StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, NULL,
3128 + "arrayof_jlong_disjoint_arraycopy");
3129 + StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, NULL,
3130 + "arrayof_oop_disjoint_arraycopy");
3131 +
3098 3132 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
3099 3133 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
3100 3134
3101 - StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
3102 - StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy");
3103 - StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy");
3135 + StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3136 + StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3137 + entry_jbyte_arraycopy,
3138 + entry_jshort_arraycopy,
3139 + entry_jint_arraycopy,
3140 + entry_jlong_arraycopy);
3141 + StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3142 + entry_jbyte_arraycopy,
3143 + entry_jshort_arraycopy,
3144 + entry_jint_arraycopy,
3145 + entry_oop_arraycopy,
3146 + entry_jlong_arraycopy,
3147 + entry_checkcast_arraycopy);
3104 3148
3105 3149 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3106 3150 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3107 3151 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3108 3152 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3109 3153 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3110 3154 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3111 3155 }
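
The interleaving above preserves the invariant stated in the deleted comment: each disjoint stub is generated before its conjoint counterpart, because the conjoint generator receives the disjoint stub's no-overlap entry through the 'entry' out-parameter. Conceptually, the generated conjoint stub does (a sketch, assuming the nooverlap_target convention used by these generators):

  // if the ranges cannot overlap backwards, defer to the faster disjoint copy:
  //   if (to <= from || to >= from + (count << log2_elsize))
  //     goto nooverlap_target;     // the matching disjoint stub's entry
  //   ... otherwise copy from high addresses down ...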
3112 3156
3113 3157 void generate_initial() {
3114 3158 // Generates all stubs and initializes the entry points
3115 3159
3116 3160 //------------------------------------------------------------------------------------------------------------------------
3117 3161 // entry points that exist in all platforms
3118 3162 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3119 3163 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3120 3164 StubRoutines::_forward_exception_entry = generate_forward_exception();
3121 3165
3122 3166 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
3123 3167 StubRoutines::_catch_exception_entry = generate_catch_exception();
3124 3168
3125 3169 //------------------------------------------------------------------------------------------------------------------------
3126 3170 // entry points that are platform specific
3127 3171 StubRoutines::Sparc::_test_stop_entry = generate_test_stop();
3128 3172
3129 3173 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
3130 3174 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3131 3175
3132 3176 #if !defined(COMPILER2) && !defined(_LP64)
3133 3177 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
3134 3178 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
3135 3179 StubRoutines::_atomic_add_entry = generate_atomic_add();
3136 3180 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry;
3137 3181 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry;
3138 3182 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3139 3183 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry;
3140 3184 #endif // !COMPILER2 && !_LP64
3141 3185 }
3142 3186
3143 3187
3144 3188 void generate_all() {
3145 3189 // Generates all stubs and initializes the entry points
3146 3190
3147 3191 // Generate partial_subtype_check first here since its code depends on
3148 3192 // UseZeroBaseCompressedOops which is defined after heap initialization.
3149 3193 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
3150 3194 // These entry points require SharedInfo::stack0 to be set up in non-core builds
3151 3195 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
3152 3196 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
3153 3197 StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true);
3154 3198 StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
3155 3199 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3156 3200 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3157 3201
3158 3202 StubRoutines::_handler_for_unsafe_access_entry =
3159 3203 generate_handler_for_unsafe_access();
3160 3204
3161 3205 // support for verify_oop (must happen after universe_init)
3162 3206 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
3163 3207
3164 3208 // arraycopy stubs used by compilers
3165 3209 generate_arraycopy_stubs();
3166 3210
3167 3211 // Don't initialize the platform math functions since sparc
3168 3212 // doesn't have intrinsics for these operations.
3169 3213 }
3170 3214
3171 3215
3172 3216 public:
3173 3217 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3174 3218 // replace the standard masm with a special one:
3175 3219 _masm = new MacroAssembler(code);
3176 3220
3177 3221 _stub_count = !all ? 0x100 : 0x200;
3178 3222 if (all) {
3179 3223 generate_all();
3180 3224 } else {
3181 3225 generate_initial();
3182 3226 }
3183 3227
3184 3228 // make sure this stub is available for all local calls
3185 3229 if (_atomic_add_stub.is_unbound()) {
3186 3230 // generate a second time, if necessary
3187 3231 (void) generate_atomic_add();
3188 3232 }
3189 3233 }
3190 3234
3191 3235
3192 3236 private:
3193 3237 int _stub_count;
3194 3238 void stub_prolog(StubCodeDesc* cdesc) {
3195 3239 # ifdef ASSERT
3196 3240 // put extra information in the stub code, to make it more readable
3197 3241 #ifdef _LP64
3198 3242 // Write the high part of the address
3199 3243 // [RGV] Check if there is a dependency on the size of this prolog
3200 3244 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
3201 3245 #endif
3202 3246 __ emit_data((intptr_t)cdesc, relocInfo::none);
3203 3247 __ emit_data(++_stub_count, relocInfo::none);
3204 3248 # endif
3205 3249 align(true);
3206 3250 }
3207 3251
3208 3252 void align(bool at_header = false) {
3209 3253 // %%%%% move this constant somewhere else
3210 3254 // UltraSPARC cache line size is 8 instructions:
3211 3255 const unsigned int icache_line_size = 32;
3212 3256 const unsigned int icache_half_line_size = 16;
3213 3257
3214 3258 if (at_header) {
3215 3259 while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3216 3260 __ emit_data(0, relocInfo::none);
3217 3261 }
3218 3262 } else {
3219 3263 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3220 3264 __ nop();
3221 3265 }
3222 3266 }
3223 3267 }
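
For a power-of-two boundary, the padding count can also be computed directly instead of looping (equivalent arithmetic, shown for reference; 'pc_value' is a hypothetical stand-in for __ pc()):

  uintptr_t addr = (uintptr_t)pc_value;                  // current code pointer
  uintptr_t pad  = (0 - addr) & (icache_line_size - 1);  // bytes to the next boundary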
3224 3268
3225 3269 }; // end class declaration
3226 3270
3227 -
3228 -address StubGenerator::disjoint_byte_copy_entry = NULL;
3229 -address StubGenerator::disjoint_short_copy_entry = NULL;
3230 -address StubGenerator::disjoint_int_copy_entry = NULL;
3231 -address StubGenerator::disjoint_long_copy_entry = NULL;
3232 -address StubGenerator::disjoint_oop_copy_entry = NULL;
3233 -
3234 -address StubGenerator::byte_copy_entry = NULL;
3235 -address StubGenerator::short_copy_entry = NULL;
3236 -address StubGenerator::int_copy_entry = NULL;
3237 -address StubGenerator::long_copy_entry = NULL;
3238 -address StubGenerator::oop_copy_entry = NULL;
3239 -
3240 -address StubGenerator::checkcast_copy_entry = NULL;
3241 -
3242 3271 void StubGenerator_generate(CodeBuffer* code, bool all) {
3243 3272 StubGenerator g(code, all);
3244 3273 }