/*
 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_sparc.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/methodOop.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/top.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "thread_linux.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_solaris
# include "thread_solaris.inline.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note: The register L7 is used as L7_thread_cache, and may not be used
//       any other way within this module.


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = thread->saved_exception_npc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine: return garbage from the load

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c) (0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  param. size  |
    // +---------------+ <--- sp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null(t, false, Assembler::pt, L);
      __ delayed()->nop();
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // |  param. size  |
    // +---------------+ <--- fp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ tst(cnt);
      __ br(Assembler::zero, false, Assembler::pn, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);            // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t);  // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);     // compute number of bytes
    __ sub(FP, t, Gargs);                               // setup parameter pointer
#ifdef _LP64
    __ add( Gargs, STACK_BIAS, Gargs );                 // Account for LP64 stack bias
#endif
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert on a change of SP.
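
    // Note on the type dispatch that follows: the compares are chained
    // through branch delay slots. Each conditional branch's delay slot
    // already performs the cmp for the next test, so the four type checks
    // issue back-to-back before falling through to the default (int) case.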

    // store result depending on type
    //  (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //   is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(false, exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(false, exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(false, exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
#ifdef _LP64
      __ ba(false, exit);
      __ delayed()->st_long(O0, addr, G0);      // store entire long
#else
#if defined(COMPILER2)
      // All return values are where we want them, except for Longs.  C2 returns
      // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
      // Since the interpreter will return longs in G1 as well as in O0/O1 in the
      // 32-bit build, we simply always use G1.
      // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
      // do this here. Unfortunately if we did a rethrow we'd see a MachEpilog node
      // first which would move g1 -> O0/O1 and destroy the exception we were throwing.

      __ ba(false, exit);
      __ delayed()->stx(G1, addr, G0);  // store entire long
#else
      __ st(O1, addr, BytesPerInt);
      __ ba(false, exit);
      __ delayed()->st(O0, addr, G0);
#endif /* COMPILER2 */
#endif /* _LP64 */
    }
    return start;
  }
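
  // For orientation only: the C++ side reaches this stub through the
  // CallStub function-pointer type declared in stubRoutines.hpp. A hedged
  // sketch of the caller (argument names are the ones documented above;
  // the exact call site lives in JavaCalls::call_helper()):
  //
  //   StubRoutines::CallStub()(
  //       (address)&link,        // o0: call wrapper address
  //       result_ptr,            // o1: where to store the result
  //       result_type,           // o2: BasicType of the result
  //       method(),              // o3: methodOop to invoke
  //       entry_point,           // o4: (interpreter) entry point
  //       parameters,            // o5: parameter area
  //       size_of_parameters,    // [sp + 0x5c]: parameter size in words
  //       CHECK);                // [sp + 0x60]: current thread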


  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull(Gtemp, false, Assembler::pt, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull(Oexception, false, Assembler::pt, L);
      __ delayed()->nop();
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size  = 32;

    CodeBuffer      code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    if (restore_saved_exception_pc) {
      __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
      __ sub(I7, frame::pc_return_offset, I7);
    }

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();               // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull(scratch_reg, false, Assembler::pt, L);
    __ delayed()->nop();
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }
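
  // The runtime stub produced above is what the "throw" entry points in
  // StubRoutines are built from. As a hedged sketch (the exact wiring is
  // done where the stubs are registered, not in this routine), an entry is
  // typically created along these lines:
  //
  //   StubRoutines::_throw_AbstractMethodError_entry =
  //     generate_throw_exception("AbstractMethodError throw_exception",
  //                              CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),
  //                              false);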

#undef __
#define __ _masm->


  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20;  i < 32;  i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0; i < 8; ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flush_windows();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Helper functions for v8 atomic operations.
  //
  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
    if (mark_oop_reg == noreg) {
      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
    } else {
      assert(scratch_reg != noreg, "just checking");
      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
    }
  }

  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {

    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
    __ set(StubRoutines::Sparc::locked, lock_reg);
    // Initialize yield counter
    __ mov(G0, yield_reg);

    __ BIND(retry);
    __ cmp(yield_reg, V8AtomicOperationUnderLockSpinCount);
    __ br(Assembler::less, false, Assembler::pt, dontyield);
    __ delayed()->nop();

    // This code can only be called from inside the VM; this
    // stub is only invoked from Atomic::add(). We do not
    // want to use call_VM, because _last_java_sp and such
    // must already be set.
    //
    // Save the regs and make space for a C call
    __ save(SP, -96, SP);
    __ save_all_globals_into_locals();
    BLOCK_COMMENT("call os::naked_sleep");
    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
    __ delayed()->nop();
    __ restore_globals_from_locals();
    __ restore();
    // reset the counter
    __ mov(G0, yield_reg);

    __ BIND(dontyield);

    // try to get lock
    __ swap(lock_ptr_reg, 0, lock_reg);

    // did we get the lock?
    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
    __ br(Assembler::notEqual, true, Assembler::pn, retry);
    __ delayed()->add(yield_reg, 1, yield_reg);

    // yes, got lock. do the operation here.
  }

  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
    __ st(lock_reg, lock_ptr_reg, 0); // unlock
  }

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas_under_lock(O1, O2, O3,
          (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
      __ cmp(O2, O3);
      __ br(Assembler::notEqual, false, Assembler::pn, retry);
      __ delayed()->nop();

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller

    } else {
      if (VM_Version::v9_instructions_work()) {
        __ retl(false);
        __ delayed()->swap(O1, 0, O0);
      } else {
        const Register& lock_reg     = O2;
        const Register& lock_ptr_reg = O3;
        const Register& yield_reg    = O4;

        Label retry;
        Label dontyield;

        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        // got the lock, do the swap
        __ swap(O1, 0, O0);

        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        __ retl(false);
        __ delayed()->nop();
      }
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas_under_lock(O1, O2, O0,
        (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }
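
  // As a plain-C sketch of the contracts the two stubs above implement
  // (illustrative only, not the VM's own code):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old = *dest; *dest = exchange_value; return old;   // atomically
  //   }
  //   jint atomic_cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value) {
  //     jint old = *dest;
  //     if (old == compare_value) *dest = exchange_value;       // atomically
  //     return old;
  //   }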

  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // This only works on V9, on V8 we don't generate any
  // code and just return NULL.
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    if (!VM_Version::supports_cx8())
      return NULL;
    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0, O1, O0);   // O0 holds the 64-bit exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);   // O3 holds the 64-bit compare_value
    __ casx(O2, O3, O0);
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }


  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites (v9): O3
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    if (VM_Version::v9_instructions_work()) {
      Label retry;
      __ BIND(retry);

      __ lduw(O1, 0, O2);
      __ add(O0, O2, O3);
      __ cas(O1, O2, O3);
      __ cmp(O2, O3);
      __ br(Assembler::notEqual, false, Assembler::pn, retry);
      __ delayed()->nop();
      __ retl(false);
      __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
    } else {
      const Register& lock_reg     = O2;
      const Register& lock_ptr_reg = O3;
      const Register& value_reg    = O4;
      const Register& yield_reg    = O5;

      Label retry;
      Label dontyield;

      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
      // got lock, do the increment
      __ ld(O1, 0, value_reg);
      __ add(O0, value_reg, value_reg);
      __ st(value_reg, O1, 0);

      // %%% only for RMO and PSO
      __ membar(Assembler::StoreStore);

      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);

      __ retl(false);
      __ delayed()->mov(value_reg, O0);
    }

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs


  //------------------------------------------------------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.)
  //
  // Arguments:
  //
  //      trapping PC: O7
  //
  // Results:
  //     posts an asynchronous exception, skips the trapping instruction
  //

  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    const int preserve_register_words = (64 * 2);
    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);

    Register Lthread = L7_thread_cache;
    int i;

    __ save_frame(0);
    __ mov(G1, L1);
    __ mov(G2, L2);
    __ mov(G3, L3);
    __ mov(G4, L4);
    __ mov(G5, L5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
    }

    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(entry_point, relocInfo::runtime_call_type);
    __ delayed()->nop();

    __ mov(L1, G1);
    __ mov(L2, G2);
    __ mov(L3, G3);
    __ mov(L4, G4);
    __ mov(L5, G5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
    }

    __ verify_thread();

    __ jmp(O0, 0);
    __ delayed()->restore();

    return start;
  }


  // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments:
  //
  //      ret    : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub    : O1, argument, not changed
  //      super  : O2, argument, not changed
  //      raddr  : O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

#if defined(COMPILER2) && !defined(_LP64)
    // Do not use a 'save' because it blows the 64-bit O registers.
    __ add(SP, -4*wordSize, SP);  // Make space for 4 temps (stack must be 2 words aligned)
    __ st_ptr(L0, SP, (frame::register_save_words+0)*wordSize);
    __ st_ptr(L1, SP, (frame::register_save_words+1)*wordSize);
    __ st_ptr(L2, SP, (frame::register_save_words+2)*wordSize);
    __ st_ptr(L3, SP, (frame::register_save_words+3)*wordSize);
    Register Rret   = O0;
    Register Rsub   = O1;
    Register Rsuper = O2;
#else
    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;
#endif

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
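    // On this (hit) path the stub returns 0 in Rret and leaves the condition
    // codes set to Z, so a caller may test either the returned value or the
    // flags; the miss path below mirrors this with a non-zero result and NZ.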
    __ addcc(G0, 0, Rret);      // set Z flags, Z result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP, (frame::register_save_words+0)*wordSize, L0);
    __ ld_ptr(SP, (frame::register_save_words+1)*wordSize, L1);
    __ ld_ptr(SP, (frame::register_save_words+2)*wordSize, L2);
    __ ld_ptr(SP, (frame::register_save_words+3)*wordSize, L3);
    __ retl();                  // Result in Rret is zero; flags set to Z
    __ delayed()->add(SP, 4*wordSize, SP);
#else
    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();
#endif

    __ BIND(miss);
    __ addcc(G0, 1, Rret);      // set NZ flags, NZ result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP, (frame::register_save_words+0)*wordSize, L0);
    __ ld_ptr(SP, (frame::register_save_words+1)*wordSize, L1);
    __ ld_ptr(SP, (frame::register_save_words+2)*wordSize, L2);
    __ ld_ptr(SP, (frame::register_save_words+3)*wordSize, L3);
    __ retl();                  // Result in Rret is != 0; flags set to NZ
    __ delayed()->add(SP, 4*wordSize, SP);
#else
    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();
#endif

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT) && defined(_LP64)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  // Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
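
  // In C terms the overlap test above takes the no-overlap (forward copy)
  // exit when
  //
  //   to <= from  ||  (size_t)(to - from) >= (size_t)(count << log2_elem_size)
  //
  // (both comparisons unsigned); otherwise 'to' lies inside
  // [from, from + byte_count) and control falls through to the
  // backward-copying code.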

  //
  // Generate pre-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  //
  // Generate post-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
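          // The three instructions above leave 'count' holding the address of
          // the last oop in the destination region; the shifts and subtract
          // that follow convert
          //   addr  -> index of the first card covered
          //   count -> (index of the last card) - (index of the first card)
          // so the loop below dirties every card spanned by the store region,
          // counting 'count' down to zero.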
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
        __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }


  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source array address
  //   to        - destination array aligned to 8 bytes
  //   count     - element count to copy, >= the count equivalent to 16 bytes
  //   count_dec - decrement of 'count' equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int count_dec, Label& L_copy_bytes) {
    Label L_loop, L_aligned_copy, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
    //
    __ deccc(count, count_dec); // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(from, 0, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ sllx(O3, left_shift,  O3);
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ stx(O3, to, -16);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    __ stx(O4, to, -8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ sllx(O3, left_shift,  O3);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source array end address
  //   end_to    - destination array end address aligned to 8 bytes
  //   count     - element count to copy, >= the count equivalent to 16 bytes
  //   count_dec - decrement of 'count' equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                     Register count, int count_dec,
                     Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
    //
    __ andn(end_from, 7, end_from); // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }

  //
  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from   = O0;   // source array address
    const Register to     = O1;   // destination array address
    const Register count  = O2;   // elements count
    const Register offset = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);  // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in the 32-bit VM
      // and 8-byte alignment in the 64-bit VM,
      // so we do it only for the 32-bit VM.
      //
#ifndef _LP64
      // copy a 4-byte word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 4);
      __ st(O3, to, -4);
    __ BIND(L_skip_alignment);
#endif
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
      __ sub(count, G1, count);
    __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
    __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
    }

    // Both arrays are 8-byte aligned, copy 16 bytes at a time
      __ and3(count, 7, G4); // Save count
      __ srl(count, 3, count);
     generate_disjoint_long_copy_core(aligned);
      __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
      __ delayed()->nop();
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
      __ ldub(from, offset, O3);
      __ deccc(count);
      __ stb(O3, to, offset);
      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
      __ delayed()->inc(offset);

    __ BIND(L_exit);
      // O3, O4 are used as temp registers
      inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
      __ retl();
      __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from     = O0;   // source array address
    const Register to       = O1;   // destination array address
    const Register count    = O2;   // elements count
    const Register end_from = from; // source array end address
    const Register end_to   = to;   // destination array end address

    assert_clean_int(count, O3);    // Make sure 'count' is clean int.
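
    // As in the disjoint copy above, the recorded '*entry' below lets other
    // stubs in this file branch straight into this copy loop with the
    // arguments already in O0/O1/O2 (for example the Unsafe.copyMemory
    // path noted in the comment). Hedged note: the exact set of callers is
    // established where the arraycopy stubs are wired together, typically
    // in generate_arraycopy_stubs().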

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align the ends of the arrays since they could be not aligned even
      // when the arrays themselves are aligned.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
    __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
    __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (aligned) {
      // Both arrays are aligned to 8 bytes in the 64-bit VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // in the unaligned case.
      __ dec(count, 16);
    } else
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 16 bytes (16 elements) at a time
    __ align(OptoLoopAlignment);
  __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
  __ BIND(L_copy_byte);
    __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
    __ delayed()->nop();
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

  __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for disjoint short copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from   = O0;   // source array address
    const Register to     = O1;   // destination array address
    const Register count  = O2;   // elements count
    const Register offset = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);  // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in the 32-bit VM
      // and 8-byte alignment in the 64-bit VM.
      //
#ifndef _LP64
      // copy a 2-element word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 2);
      __ st(O3, to, -4);
    __ BIND(L_skip_alignment);
#endif
    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
      __ andcc(to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(from, 0, O3);
      __ inc(from, 2);
      __ inc(to, 2);
      __ dec(count);
      __ sth(O3, to, -2);
    __ BIND(L_skip_alignment);

      // copy 2 elements to align 'to' on an 8-byte boundary
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(from, 0, O3);
      __ dec(count, 2);
      __ lduh(from, 2, O4);
      __ inc(from, 4);
      __ inc(to, 4);
      __ sth(O3, to, -4);
      __ sth(O4, to, -2);
    __ BIND(L_skip_alignment2);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
    }

    // Both arrays are 8-byte aligned, copy 16 bytes at a time
      __ and3(count, 3, G4); // Save
      __ srl(count, 2, count);
     generate_disjoint_long_copy_core(aligned);
      __ mov(G4, count); // restore

    // copy 1 element at a time
    __ BIND(L_copy_2_bytes);
      __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
      __ delayed()->nop();
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_2_bytes_loop);
      __ lduh(from, offset, O3);
      __ deccc(count);
      __ sth(O3, to, offset);
      __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
      __ delayed()->inc(offset, 2);

    __ BIND(L_exit);
      // O3, O4 are used as temp registers
      inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
      __ retl();
      __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for array fill.  If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      to:    O0
  //      value: O1
  //      count: O2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char* name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register to    = O0;   // destination array address
    const Register value = O1;   // fill value
    const Register count = O2;   // elements count
    // O3 is used as a temp register

    assert_clean_int(count, O3); // Make sure 'count' is clean int.

    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 2;
        break;
      case T_SHORT:
        shift = 1;
        break;
      case T_INT:
        shift = 0;
        break;
      default: ShouldNotReachHere();
    }

    BLOCK_COMMENT("Entry:");

    if (t == T_BYTE) {
      // Zero extend value
      __ and3(value, 0xff, value);
      __ sllx(value, 8, O3);
      __ or3(value, O3, value);
    }
    if (t == T_SHORT) {
      // Zero extend value
      __ sllx(value, 48, value);
      __ srlx(value, 48, value);
    }
    if (t == T_BYTE || t == T_SHORT) {
      __ sllx(value, 16, O3);
      __ or3(value, O3, value);
    }

    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
    __ delayed()->andcc(count, 1, G0);

    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
      // align destination address at 4-byte boundary
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays
        __ andcc(to, 1, G0);
        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
        __ delayed()->nop();
        __ stb(value, to, 0);
        __ inc(to, 1);
        __ dec(count, 1);
        __ BIND(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays
      __ andcc(to, 2, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
      __ delayed()->nop();
      __ sth(value, to, 0);
      __ inc(to, 2);
      __ dec(count, 1 << (shift - 1));
      __ BIND(L_skip_align2);
    }
#ifdef _LP64
    if (!aligned) {
#endif
    // align to 8 bytes, we know we are 4 byte aligned to start
    __ andcc(to, 7, G0);
br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes); 1708 __ delayed()->nop(); 1709 __ stw(value, to, 0); 1710 __ inc(to, 4); 1711 __ dec(count, 1 << shift); 1712 __ BIND(L_fill_32_bytes); 1713 #ifdef _LP64 1714 } 1715 #endif 1716 1717 if (t == T_INT) { 1718 // Zero extend value 1719 __ srl(value, 0, value); 1720 } 1721 if (t == T_BYTE || t == T_SHORT || t == T_INT) { 1722 __ sllx(value, 32, O3); 1723 __ or3(value, O3, value); 1724 } 1725 1726 Label L_check_fill_8_bytes; 1727 // Fill 32-byte chunks 1728 __ subcc(count, 8 << shift, count); 1729 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); 1730 __ delayed()->nop(); 1731 1732 Label L_fill_32_bytes_loop, L_fill_4_bytes; 1733 __ align(16); 1734 __ BIND(L_fill_32_bytes_loop); 1735 1736 __ stx(value, to, 0); 1737 __ stx(value, to, 8); 1738 __ stx(value, to, 16); 1739 __ stx(value, to, 24); 1740 1741 __ subcc(count, 8 << shift, count); 1742 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); 1743 __ delayed()->add(to, 32, to); 1744 1745 __ BIND(L_check_fill_8_bytes); 1746 __ addcc(count, 8 << shift, count); 1747 __ brx(Assembler::zero, false, Assembler::pn, L_exit); 1748 __ delayed()->subcc(count, 1 << (shift + 1), count); 1749 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); 1750 __ delayed()->andcc(count, 1<<shift, G0); 1751 1752 // 1753 // length is too short, just fill 8 bytes at a time 1754 // 1755 Label L_fill_8_bytes_loop; 1756 __ BIND(L_fill_8_bytes_loop); 1757 __ stx(value, to, 0); 1758 __ subcc(count, 1 << (shift + 1), count); 1759 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop); 1760 __ delayed()->add(to, 8, to); 1761 1762 // fill trailing 4 bytes 1763 __ andcc(count, 1<<shift, G0); // in delay slot of branches 1764 if (t == T_INT) { 1765 __ BIND(L_fill_elements); 1766 } 1767 __ BIND(L_fill_4_bytes); 1768 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes); 1769 if (t == T_BYTE || t == T_SHORT) { 1770 __ delayed()->andcc(count, 1<<(shift-1), G0); 1771 } else { 1772 __ delayed()->nop(); 1773 } 1774 __ stw(value, to, 0); 1775 if (t == T_BYTE || t == T_SHORT) { 1776 __ inc(to, 4); 1777 // fill trailing 2 bytes 1778 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches 1779 __ BIND(L_fill_2_bytes); 1780 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); 1781 __ delayed()->andcc(count, 1, count); 1782 __ sth(value, to, 0); 1783 if (t == T_BYTE) { 1784 __ inc(to, 2); 1785 // fill trailing byte 1786 __ andcc(count, 1, count); // in delay slot of branches 1787 __ BIND(L_fill_byte); 1788 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1789 __ delayed()->nop(); 1790 __ stb(value, to, 0); 1791 } else { 1792 __ BIND(L_fill_byte); 1793 } 1794 } else { 1795 __ BIND(L_fill_2_bytes); 1796 } 1797 __ BIND(L_exit); 1798 __ retl(); 1799 __ delayed()->nop(); 1800 1801 // Handle copies less than 8 bytes. Int is handled elsewhere. 
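// Note (illustrative sketch, not part of the emitted stub): by this point the
// fill value has been replicated into every lane of a 64-bit pattern by the
// shift-and-or sequences above, roughly equivalent to, for T_BYTE:
//
//   uint64_t pattern = (uint8_t) value;
//   pattern |= pattern << 8;    // 2 identical bytes
//   pattern |= pattern << 16;   // 4 identical bytes
//   pattern |= pattern << 32;   // 8 identical bytes
//
// (T_SHORT starts the doubling from 16 bits, T_INT from 32 bits.)  The main
// loops above store this pattern with 64-bit stx, while the element tails
// below only need to store the low 1, 2 or 4 bytes of it.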
1802 if (t == T_BYTE) { 1803 __ BIND(L_fill_elements); 1804 Label L_fill_2, L_fill_4; 1805 // in delay slot __ andcc(count, 1, G0); 1806 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1807 __ delayed()->andcc(count, 2, G0); 1808 __ stb(value, to, 0); 1809 __ inc(to, 1); 1810 __ BIND(L_fill_2); 1811 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4); 1812 __ delayed()->andcc(count, 4, G0); 1813 __ stb(value, to, 0); 1814 __ stb(value, to, 1); 1815 __ inc(to, 2); 1816 __ BIND(L_fill_4); 1817 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1818 __ delayed()->nop(); 1819 __ stb(value, to, 0); 1820 __ stb(value, to, 1); 1821 __ stb(value, to, 2); 1822 __ retl(); 1823 __ delayed()->stb(value, to, 3); 1824 } 1825 1826 if (t == T_SHORT) { 1827 Label L_fill_2; 1828 __ BIND(L_fill_elements); 1829 // in delay slot __ andcc(count, 1, G0); 1830 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1831 __ delayed()->andcc(count, 2, G0); 1832 __ sth(value, to, 0); 1833 __ inc(to, 2); 1834 __ BIND(L_fill_2); 1835 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1836 __ delayed()->nop(); 1837 __ sth(value, to, 0); 1838 __ retl(); 1839 __ delayed()->sth(value, to, 2); 1840 } 1841 return start; 1842 } 1843 1844 // 1845 // Generate stub for conjoint short copy. If "aligned" is true, the 1846 // "from" and "to" addresses are assumed to be heapword aligned. 1847 // 1848 // Arguments for generated stub: 1849 // from: O0 1850 // to: O1 1851 // count: O2 treated as signed 1852 // 1853 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1854 address *entry, const char *name) { 1855 // Do reverse copy. 1856 1857 __ align(CodeEntryAlignment); 1858 StubCodeMark mark(this, "StubRoutines", name); 1859 address start = __ pc(); 1860 1861 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1862 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1863 1864 const Register from = O0; // source array address 1865 const Register to = O1; // destination array address 1866 const Register count = O2; // elements count 1867 const Register end_from = from; // source array end address 1868 const Register end_to = to; // destination array end address 1869 1870 const Register byte_count = O3; // bytes count to copy 1871 1872 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1873 1874 if (entry != NULL) { 1875 *entry = __ pc(); 1876 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1877 BLOCK_COMMENT("Entry:"); 1878 } 1879 1880 array_overlap_test(nooverlap_target, 1); 1881 1882 __ sllx(count, LogBytesPerShort, byte_count); 1883 __ add(to, byte_count, end_to); // offset after last copied element 1884 1885 // for short arrays, just do single element copy 1886 __ cmp(count, 11); // 8 + 3 (22 bytes) 1887 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1888 __ delayed()->add(from, byte_count, end_from); 1889 1890 { 1891 // Align end of arrays since they could be not aligned even 1892 // when arrays itself are aligned. 
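// (Illustrative, not emitted code.)  The copy runs backward because this is
// the conjoint case: when the regions overlap with 'to' above 'from', copying
// from the last element down never reads a slot that has already been
// overwritten, roughly:
//
//   for (ptrdiff_t i = count - 1; i >= 0; i--) {
//     to[i] = from[i];   // jshort elements
//   }
//
// That is why the alignment work here targets 'end_to' rather than 'to': the
// loops below store through decreasing addresses starting at the array ends.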
1893 1894 // copy 1 element if necessary to align 'end_to' on an 4 bytes 1895 __ andcc(end_to, 3, G0); 1896 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1897 __ delayed()->lduh(end_from, -2, O3); 1898 __ dec(end_from, 2); 1899 __ dec(end_to, 2); 1900 __ dec(count); 1901 __ sth(O3, end_to, 0); 1902 __ BIND(L_skip_alignment); 1903 1904 // copy 2 elements to align 'end_to' on an 8 byte boundary 1905 __ andcc(end_to, 7, G0); 1906 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1907 __ delayed()->lduh(end_from, -2, O3); 1908 __ dec(count, 2); 1909 __ lduh(end_from, -4, O4); 1910 __ dec(end_from, 4); 1911 __ dec(end_to, 4); 1912 __ sth(O3, end_to, 2); 1913 __ sth(O4, end_to, 0); 1914 __ BIND(L_skip_alignment2); 1915 } 1916 #ifdef _LP64 1917 if (aligned) { 1918 // Both arrays are aligned to 8-bytes in 64-bits VM. 1919 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 1920 // in unaligned case. 1921 __ dec(count, 8); 1922 } else 1923 #endif 1924 { 1925 // Copy with shift 16 bytes per iteration if arrays do not have 1926 // the same alignment mod 8, otherwise jump to the next 1927 // code for aligned copy (and substracting 8 from 'count' before jump). 1928 // The compare above (count >= 11) guarantes 'count' >= 16 bytes. 1929 // Also jump over aligned copy after the copy with shift completed. 1930 1931 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1932 L_aligned_copy, L_copy_2_bytes); 1933 } 1934 // copy 4 elements (16 bytes) at a time 1935 __ align(OptoLoopAlignment); 1936 __ BIND(L_aligned_copy); 1937 __ dec(end_from, 16); 1938 __ ldx(end_from, 8, O3); 1939 __ ldx(end_from, 0, O4); 1940 __ dec(end_to, 16); 1941 __ deccc(count, 8); 1942 __ stx(O3, end_to, 8); 1943 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1944 __ delayed()->stx(O4, end_to, 0); 1945 __ inc(count, 8); 1946 1947 // copy 1 element (2 bytes) at a time 1948 __ BIND(L_copy_2_bytes); 1949 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 1950 __ delayed()->nop(); 1951 __ BIND(L_copy_2_bytes_loop); 1952 __ dec(end_from, 2); 1953 __ dec(end_to, 2); 1954 __ lduh(end_from, 0, O4); 1955 __ deccc(count); 1956 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 1957 __ delayed()->sth(O4, end_to, 0); 1958 1959 __ BIND(L_exit); 1960 // O3, O4 are used as temp registers 1961 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1962 __ retl(); 1963 __ delayed()->mov(G0, O0); // return 0 1964 return start; 1965 } 1966 1967 // 1968 // Generate core code for disjoint int copy (and oop copy on 32-bit). 1969 // If "aligned" is true, the "from" and "to" addresses are assumed 1970 // to be heapword aligned. 1971 // 1972 // Arguments: 1973 // from: O0 1974 // to: O1 1975 // count: O2 treated as signed 1976 // 1977 void generate_disjoint_int_copy_core(bool aligned) { 1978 1979 Label L_skip_alignment, L_aligned_copy; 1980 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1981 1982 const Register from = O0; // source array address 1983 const Register to = O1; // destination array address 1984 const Register count = O2; // elements count 1985 const Register offset = O5; // offset from start of arrays 1986 // O3, O4, G3, G4 are used as temp registers 1987 1988 // 'aligned' == true when it is known statically during compilation 1989 // of this arraycopy call site that both 'from' and 'to' addresses 1990 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 
1991 // 1992 // Aligned arrays have 4 bytes alignment in 32-bits VM 1993 // and 8 bytes - in 64-bits VM. 1994 // 1995 #ifdef _LP64 1996 if (!aligned) 1997 #endif 1998 { 1999 // The next check could be put under 'ifndef' since the code in 2000 // generate_disjoint_long_copy_core() has own checks and set 'offset'. 2001 2002 // for short arrays, just do single element copy 2003 __ cmp(count, 5); // 4 + 1 (20 bytes) 2004 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2005 __ delayed()->mov(G0, offset); 2006 2007 // copy 1 element to align 'to' on an 8 byte boundary 2008 __ andcc(to, 7, G0); 2009 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2010 __ delayed()->ld(from, 0, O3); 2011 __ inc(from, 4); 2012 __ inc(to, 4); 2013 __ dec(count); 2014 __ st(O3, to, -4); 2015 __ BIND(L_skip_alignment); 2016 2017 // if arrays have same alignment mod 8, do 4 elements copy 2018 __ andcc(from, 7, G0); 2019 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2020 __ delayed()->ld(from, 0, O3); 2021 2022 // 2023 // Load 2 aligned 8-bytes chunks and use one from previous iteration 2024 // to form 2 aligned 8-bytes chunks to store. 2025 // 2026 // copy_16_bytes_forward_with_shift() is not used here since this 2027 // code is more optimal. 2028 2029 // copy with shift 4 elements (16 bytes) at a time 2030 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 2031 2032 __ align(OptoLoopAlignment); 2033 __ BIND(L_copy_16_bytes); 2034 __ ldx(from, 4, O4); 2035 __ deccc(count, 4); // Can we do next iteration after this one? 2036 __ ldx(from, 12, G4); 2037 __ inc(to, 16); 2038 __ inc(from, 16); 2039 __ sllx(O3, 32, O3); 2040 __ srlx(O4, 32, G3); 2041 __ bset(G3, O3); 2042 __ stx(O3, to, -16); 2043 __ sllx(O4, 32, O4); 2044 __ srlx(G4, 32, G3); 2045 __ bset(G3, O4); 2046 __ stx(O4, to, -8); 2047 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2048 __ delayed()->mov(G4, O3); 2049 2050 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2051 __ delayed()->inc(count, 4); // restore 'count' 2052 2053 __ BIND(L_aligned_copy); 2054 } 2055 // copy 4 elements (16 bytes) at a time 2056 __ and3(count, 1, G4); // Save 2057 __ srl(count, 1, count); 2058 generate_disjoint_long_copy_core(aligned); 2059 __ mov(G4, count); // Restore 2060 2061 // copy 1 element at a time 2062 __ BIND(L_copy_4_bytes); 2063 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 2064 __ delayed()->nop(); 2065 __ BIND(L_copy_4_bytes_loop); 2066 __ ld(from, offset, O3); 2067 __ deccc(count); 2068 __ st(O3, to, offset); 2069 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 2070 __ delayed()->inc(offset, 4); 2071 __ BIND(L_exit); 2072 } 2073 2074 // 2075 // Generate stub for disjoint int copy. If "aligned" is true, the 2076 // "from" and "to" addresses are assumed to be heapword aligned. 2077 // 2078 // Arguments for generated stub: 2079 // from: O0 2080 // to: O1 2081 // count: O2 treated as signed 2082 // 2083 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 2084 __ align(CodeEntryAlignment); 2085 StubCodeMark mark(this, "StubRoutines", name); 2086 address start = __ pc(); 2087 2088 const Register count = O2; 2089 assert_clean_int(count, O3); // Make sure 'count' is clean int. 
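// Informal note: the '*entry' address recorded just below is a secondary
// entry point.  The conjoint jint stub uses it as its no-overlap target: its
// array_overlap_test branches here when a plain forward copy is safe,
// roughly (sketch only, not the emitted test):
//
//   if (to <= from || to >= from + count * 4) goto disjoint_entry;
//
// Callers reaching that entry may have derived 'count' from a 64-bit byte
// length (e.g. via Unsafe.copyMemory), which is why the clean-int assertion
// above sits before this point.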
2090 2091 if (entry != NULL) { 2092 *entry = __ pc(); 2093 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2094 BLOCK_COMMENT("Entry:"); 2095 } 2096 2097 generate_disjoint_int_copy_core(aligned); 2098 2099 // O3, O4 are used as temp registers 2100 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2101 __ retl(); 2102 __ delayed()->mov(G0, O0); // return 0 2103 return start; 2104 } 2105 2106 // 2107 // Generate core code for conjoint int copy (and oop copy on 32-bit). 2108 // If "aligned" is true, the "from" and "to" addresses are assumed 2109 // to be heapword aligned. 2110 // 2111 // Arguments: 2112 // from: O0 2113 // to: O1 2114 // count: O2 treated as signed 2115 // 2116 void generate_conjoint_int_copy_core(bool aligned) { 2117 // Do reverse copy. 2118 2119 Label L_skip_alignment, L_aligned_copy; 2120 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2121 2122 const Register from = O0; // source array address 2123 const Register to = O1; // destination array address 2124 const Register count = O2; // elements count 2125 const Register end_from = from; // source array end address 2126 const Register end_to = to; // destination array end address 2127 // O3, O4, O5, G3 are used as temp registers 2128 2129 const Register byte_count = O3; // bytes count to copy 2130 2131 __ sllx(count, LogBytesPerInt, byte_count); 2132 __ add(to, byte_count, end_to); // offset after last copied element 2133 2134 __ cmp(count, 5); // for short arrays, just do single element copy 2135 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2136 __ delayed()->add(from, byte_count, end_from); 2137 2138 // copy 1 element to align 'to' on an 8 byte boundary 2139 __ andcc(end_to, 7, G0); 2140 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2141 __ delayed()->nop(); 2142 __ dec(count); 2143 __ dec(end_from, 4); 2144 __ dec(end_to, 4); 2145 __ ld(end_from, 0, O4); 2146 __ st(O4, end_to, 0); 2147 __ BIND(L_skip_alignment); 2148 2149 // Check if 'end_from' and 'end_to' has the same alignment. 2150 __ andcc(end_from, 7, G0); 2151 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2152 __ delayed()->dec(count, 4); // The cmp at the start guaranty cnt >= 4 2153 2154 // copy with shift 4 elements (16 bytes) at a time 2155 // 2156 // Load 2 aligned 8-bytes chunks and use one from previous iteration 2157 // to form 2 aligned 8-bytes chunks to store. 
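// As an informal C-like sketch (load64/store64 and the loop condition are
// hypothetical placeholders; big-endian as on SPARC), each aligned 8-byte
// destination word is stitched together from the halves of two aligned
// 8-byte source words:
//
//   uint64_t prev = load64(end_from - 4);      // aligned; high half = last 4 bytes
//   while (at_least_two_elements_remain) {
//     uint64_t cur = load64(end_from - 12);    // next lower aligned word
//     store64(end_to - 8, (prev >> 32) | (cur << 32));
//     prev = cur;  end_from -= 8;  end_to -= 8;
//   }
//
// keeping every memory access aligned; the emitted loop below is the same
// idea unrolled to move 16 bytes per iteration.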
2158 // 2159 __ ldx(end_from, -4, O3); 2160 __ align(OptoLoopAlignment); 2161 __ BIND(L_copy_16_bytes); 2162 __ ldx(end_from, -12, O4); 2163 __ deccc(count, 4); 2164 __ ldx(end_from, -20, O5); 2165 __ dec(end_to, 16); 2166 __ dec(end_from, 16); 2167 __ srlx(O3, 32, O3); 2168 __ sllx(O4, 32, G3); 2169 __ bset(G3, O3); 2170 __ stx(O3, end_to, 8); 2171 __ srlx(O4, 32, O4); 2172 __ sllx(O5, 32, G3); 2173 __ bset(O4, G3); 2174 __ stx(G3, end_to, 0); 2175 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2176 __ delayed()->mov(O5, O3); 2177 2178 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2179 __ delayed()->inc(count, 4); 2180 2181 // copy 4 elements (16 bytes) at a time 2182 __ align(OptoLoopAlignment); 2183 __ BIND(L_aligned_copy); 2184 __ dec(end_from, 16); 2185 __ ldx(end_from, 8, O3); 2186 __ ldx(end_from, 0, O4); 2187 __ dec(end_to, 16); 2188 __ deccc(count, 4); 2189 __ stx(O3, end_to, 8); 2190 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2191 __ delayed()->stx(O4, end_to, 0); 2192 __ inc(count, 4); 2193 2194 // copy 1 element (4 bytes) at a time 2195 __ BIND(L_copy_4_bytes); 2196 __ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit); 2197 __ delayed()->nop(); 2198 __ BIND(L_copy_4_bytes_loop); 2199 __ dec(end_from, 4); 2200 __ dec(end_to, 4); 2201 __ ld(end_from, 0, O4); 2202 __ deccc(count); 2203 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 2204 __ delayed()->st(O4, end_to, 0); 2205 __ BIND(L_exit); 2206 } 2207 2208 // 2209 // Generate stub for conjoint int copy. If "aligned" is true, the 2210 // "from" and "to" addresses are assumed to be heapword aligned. 2211 // 2212 // Arguments for generated stub: 2213 // from: O0 2214 // to: O1 2215 // count: O2 treated as signed 2216 // 2217 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 2218 address *entry, const char *name) { 2219 __ align(CodeEntryAlignment); 2220 StubCodeMark mark(this, "StubRoutines", name); 2221 address start = __ pc(); 2222 2223 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2224 2225 if (entry != NULL) { 2226 *entry = __ pc(); 2227 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2228 BLOCK_COMMENT("Entry:"); 2229 } 2230 2231 array_overlap_test(nooverlap_target, 2); 2232 2233 generate_conjoint_int_copy_core(aligned); 2234 2235 // O3, O4 are used as temp registers 2236 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2237 __ retl(); 2238 __ delayed()->mov(G0, O0); // return 0 2239 return start; 2240 } 2241 2242 // 2243 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2244 // "aligned" is ignored, because we must make the stronger 2245 // assumption that both addresses are always 64-bit aligned. 
2246 // 2247 // Arguments: 2248 // from: O0 2249 // to: O1 2250 // count: O2 treated as signed 2251 // 2252 // count -= 2; 2253 // if ( count >= 0 ) { // >= 2 elements 2254 // if ( count > 6) { // >= 8 elements 2255 // count -= 6; // original count - 8 2256 // do { 2257 // copy_8_elements; 2258 // count -= 8; 2259 // } while ( count >= 0 ); 2260 // count += 6; 2261 // } 2262 // if ( count >= 0 ) { // >= 2 elements 2263 // do { 2264 // copy_2_elements; 2265 // } while ( (count=count-2) >= 0 ); 2266 // } 2267 // } 2268 // count += 2; 2269 // if ( count != 0 ) { // 1 element left 2270 // copy_1_element; 2271 // } 2272 // 2273 void generate_disjoint_long_copy_core(bool aligned) { 2274 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2275 const Register from = O0; // source array address 2276 const Register to = O1; // destination array address 2277 const Register count = O2; // elements count 2278 const Register offset0 = O4; // element offset 2279 const Register offset8 = O5; // next element offset 2280 2281 __ deccc(count, 2); 2282 __ mov(G0, offset0); // offset from start of arrays (0) 2283 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2284 __ delayed()->add(offset0, 8, offset8); 2285 2286 // Copy by 64 bytes chunks 2287 Label L_copy_64_bytes; 2288 const Register from64 = O3; // source address 2289 const Register to64 = G3; // destination address 2290 __ subcc(count, 6, O3); 2291 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2292 __ delayed()->mov(to, to64); 2293 // Now we can use O4(offset0), O5(offset8) as temps 2294 __ mov(O3, count); 2295 __ mov(from, from64); 2296 2297 __ align(OptoLoopAlignment); 2298 __ BIND(L_copy_64_bytes); 2299 for( int off = 0; off < 64; off += 16 ) { 2300 __ ldx(from64, off+0, O4); 2301 __ ldx(from64, off+8, O5); 2302 __ stx(O4, to64, off+0); 2303 __ stx(O5, to64, off+8); 2304 } 2305 __ deccc(count, 8); 2306 __ inc(from64, 64); 2307 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes); 2308 __ delayed()->inc(to64, 64); 2309 2310 // Restore O4(offset0), O5(offset8) 2311 __ sub(from64, from, offset0); 2312 __ inccc(count, 6); 2313 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2314 __ delayed()->add(offset0, 8, offset8); 2315 2316 // Copy by 16 bytes chunks 2317 __ align(OptoLoopAlignment); 2318 __ BIND(L_copy_16_bytes); 2319 __ ldx(from, offset0, O3); 2320 __ ldx(from, offset8, G3); 2321 __ deccc(count, 2); 2322 __ stx(O3, to, offset0); 2323 __ inc(offset0, 16); 2324 __ stx(G3, to, offset8); 2325 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2326 __ delayed()->inc(offset8, 16); 2327 2328 // Copy last 8 bytes 2329 __ BIND(L_copy_8_bytes); 2330 __ inccc(count, 2); 2331 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2332 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 2333 __ ldx(from, offset0, O3); 2334 __ stx(O3, to, offset0); 2335 __ BIND(L_exit); 2336 } 2337 2338 // 2339 // Generate stub for disjoint long copy. 2340 // "aligned" is ignored, because we must make the stronger 2341 // assumption that both addresses are always 64-bit aligned. 
2342 // 2343 // Arguments for generated stub: 2344 // from: O0 2345 // to: O1 2346 // count: O2 treated as signed 2347 // 2348 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2349 __ align(CodeEntryAlignment); 2350 StubCodeMark mark(this, "StubRoutines", name); 2351 address start = __ pc(); 2352 2353 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2354 2355 if (entry != NULL) { 2356 *entry = __ pc(); 2357 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2358 BLOCK_COMMENT("Entry:"); 2359 } 2360 2361 generate_disjoint_long_copy_core(aligned); 2362 2363 // O3, O4 are used as temp registers 2364 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2365 __ retl(); 2366 __ delayed()->mov(G0, O0); // return 0 2367 return start; 2368 } 2369 2370 // 2371 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2372 // "aligned" is ignored, because we must make the stronger 2373 // assumption that both addresses are always 64-bit aligned. 2374 // 2375 // Arguments: 2376 // from: O0 2377 // to: O1 2378 // count: O2 treated as signed 2379 // 2380 void generate_conjoint_long_copy_core(bool aligned) { 2381 // Do reverse copy. 2382 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2383 const Register from = O0; // source array address 2384 const Register to = O1; // destination array address 2385 const Register count = O2; // elements count 2386 const Register offset8 = O4; // element offset 2387 const Register offset0 = O5; // previous element offset 2388 2389 __ subcc(count, 1, count); 2390 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2391 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2392 __ sub(offset8, 8, offset0); 2393 __ align(OptoLoopAlignment); 2394 __ BIND(L_copy_16_bytes); 2395 __ ldx(from, offset8, O2); 2396 __ ldx(from, offset0, O3); 2397 __ stx(O2, to, offset8); 2398 __ deccc(offset8, 16); // use offset8 as counter 2399 __ stx(O3, to, offset0); 2400 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2401 __ delayed()->dec(offset0, 16); 2402 2403 __ BIND(L_copy_8_bytes); 2404 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2405 __ delayed()->nop(); 2406 __ ldx(from, 0, O3); 2407 __ stx(O3, to, 0); 2408 __ BIND(L_exit); 2409 } 2410 2411 // Generate stub for conjoint long copy. 2412 // "aligned" is ignored, because we must make the stronger 2413 // assumption that both addresses are always 64-bit aligned. 2414 // 2415 // Arguments for generated stub: 2416 // from: O0 2417 // to: O1 2418 // count: O2 treated as signed 2419 // 2420 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2421 address *entry, const char *name) { 2422 __ align(CodeEntryAlignment); 2423 StubCodeMark mark(this, "StubRoutines", name); 2424 address start = __ pc(); 2425 2426 assert(aligned, "Should always be aligned"); 2427 2428 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2429 2430 if (entry != NULL) { 2431 *entry = __ pc(); 2432 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2433 BLOCK_COMMENT("Entry:"); 2434 } 2435 2436 array_overlap_test(nooverlap_target, 3); 2437 2438 generate_conjoint_long_copy_core(aligned); 2439 2440 // O3, O4 are used as temp registers 2441 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2442 __ retl(); 2443 __ delayed()->mov(G0, O0); // return 0 2444 return start; 2445 } 2446 2447 // Generate stub for disjoint oop copy. 
If "aligned" is true, the 2448 // "from" and "to" addresses are assumed to be heapword aligned. 2449 // 2450 // Arguments for generated stub: 2451 // from: O0 2452 // to: O1 2453 // count: O2 treated as signed 2454 // 2455 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2456 bool dest_uninitialized = false) { 2457 2458 const Register from = O0; // source array address 2459 const Register to = O1; // destination array address 2460 const Register count = O2; // elements count 2461 2462 __ align(CodeEntryAlignment); 2463 StubCodeMark mark(this, "StubRoutines", name); 2464 address start = __ pc(); 2465 2466 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2467 2468 if (entry != NULL) { 2469 *entry = __ pc(); 2470 // caller can pass a 64-bit byte count here 2471 BLOCK_COMMENT("Entry:"); 2472 } 2473 2474 // save arguments for barrier generation 2475 __ mov(to, G1); 2476 __ mov(count, G5); 2477 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2478 #ifdef _LP64 2479 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2480 if (UseCompressedOops) { 2481 generate_disjoint_int_copy_core(aligned); 2482 } else { 2483 generate_disjoint_long_copy_core(aligned); 2484 } 2485 #else 2486 generate_disjoint_int_copy_core(aligned); 2487 #endif 2488 // O0 is used as temp register 2489 gen_write_ref_array_post_barrier(G1, G5, O0); 2490 2491 // O3, O4 are used as temp registers 2492 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2493 __ retl(); 2494 __ delayed()->mov(G0, O0); // return 0 2495 return start; 2496 } 2497 2498 // Generate stub for conjoint oop copy. If "aligned" is true, the 2499 // "from" and "to" addresses are assumed to be heapword aligned. 2500 // 2501 // Arguments for generated stub: 2502 // from: O0 2503 // to: O1 2504 // count: O2 treated as signed 2505 // 2506 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2507 address *entry, const char *name, 2508 bool dest_uninitialized = false) { 2509 2510 const Register from = O0; // source array address 2511 const Register to = O1; // destination array address 2512 const Register count = O2; // elements count 2513 2514 __ align(CodeEntryAlignment); 2515 StubCodeMark mark(this, "StubRoutines", name); 2516 address start = __ pc(); 2517 2518 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2519 2520 if (entry != NULL) { 2521 *entry = __ pc(); 2522 // caller can pass a 64-bit byte count here 2523 BLOCK_COMMENT("Entry:"); 2524 } 2525 2526 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2527 2528 // save arguments for barrier generation 2529 __ mov(to, G1); 2530 __ mov(count, G5); 2531 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2532 2533 #ifdef _LP64 2534 if (UseCompressedOops) { 2535 generate_conjoint_int_copy_core(aligned); 2536 } else { 2537 generate_conjoint_long_copy_core(aligned); 2538 } 2539 #else 2540 generate_conjoint_int_copy_core(aligned); 2541 #endif 2542 2543 // O0 is used as temp register 2544 gen_write_ref_array_post_barrier(G1, G5, O0); 2545 2546 // O3, O4 are used as temp registers 2547 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2548 __ retl(); 2549 __ delayed()->mov(G0, O0); // return 0 2550 return start; 2551 } 2552 2553 2554 // Helper for generating a dynamic type check. 2555 // Smashes only the given temp registers. 
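// As an informal outline (the actual code is emitted by the MacroAssembler
// check_klass_subtype_fast_path / _slow_path helpers used below; the names
// secondary_super_cache_offset and secondary_supers_contains are descriptive
// only):
//
//   // fast path: probe the slot named by super_check_offset
//   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
//     goto L_success;
//   // a miss is only provisional when the offset designates the
//   // secondary-supers cache; otherwise it is final
//   if (super_check_offset != secondary_super_cache_offset)
//     goto L_miss;
//   // slow path: linear scan of sub_klass's secondary supers array
//   if (secondary_supers_contains(sub_klass, super_klass))
//     goto L_success;
//   goto L_miss;   // fall through on failure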
2556 void generate_type_check(Register sub_klass, 2557 Register super_check_offset, 2558 Register super_klass, 2559 Register temp, 2560 Label& L_success) { 2561 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2562 2563 BLOCK_COMMENT("type_check:"); 2564 2565 Label L_miss, L_pop_to_miss; 2566 2567 assert_clean_int(super_check_offset, temp); 2568 2569 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2570 &L_success, &L_miss, NULL, 2571 super_check_offset); 2572 2573 BLOCK_COMMENT("type_check_slow_path:"); 2574 __ save_frame(0); 2575 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2576 super_klass->after_save(), 2577 L0, L1, L2, L4, 2578 NULL, &L_pop_to_miss); 2579 __ ba(false, L_success); 2580 __ delayed()->restore(); 2581 2582 __ bind(L_pop_to_miss); 2583 __ restore(); 2584 2585 // Fall through on failure! 2586 __ BIND(L_miss); 2587 } 2588 2589 2590 // Generate stub for checked oop copy. 2591 // 2592 // Arguments for generated stub: 2593 // from: O0 2594 // to: O1 2595 // count: O2 treated as signed 2596 // ckoff: O3 (super_check_offset) 2597 // ckval: O4 (super_klass) 2598 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2599 // 2600 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2601 2602 const Register O0_from = O0; // source array address 2603 const Register O1_to = O1; // destination array address 2604 const Register O2_count = O2; // elements count 2605 const Register O3_ckoff = O3; // super_check_offset 2606 const Register O4_ckval = O4; // super_klass 2607 2608 const Register O5_offset = O5; // loop var, with stride wordSize 2609 const Register G1_remain = G1; // loop var, with stride -1 2610 const Register G3_oop = G3; // actual oop copied 2611 const Register G4_klass = G4; // oop._klass 2612 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2613 2614 __ align(CodeEntryAlignment); 2615 StubCodeMark mark(this, "StubRoutines", name); 2616 address start = __ pc(); 2617 2618 #ifdef ASSERT 2619 // We sometimes save a frame (see generate_type_check below). 2620 // If this will cause trouble, let's fail now instead of later. 2621 __ save_frame(0); 2622 __ restore(); 2623 #endif 2624 2625 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 2626 2627 #ifdef ASSERT 2628 // caller guarantees that the arrays really are different 2629 // otherwise, we would have to make conjoint checks 2630 { Label L; 2631 __ mov(O3, G1); // spill: overlap test smashes O3 2632 __ mov(O4, G4); // spill: overlap test smashes O4 2633 array_overlap_test(L, LogBytesPerHeapOop); 2634 __ stop("checkcast_copy within a single array"); 2635 __ bind(L); 2636 __ mov(G1, O3); 2637 __ mov(G4, O4); 2638 } 2639 #endif //ASSERT 2640 2641 if (entry != NULL) { 2642 *entry = __ pc(); 2643 // caller can pass a 64-bit byte count here (from generic stub) 2644 BLOCK_COMMENT("Entry:"); 2645 } 2646 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 2647 2648 Label load_element, store_element, do_card_marks, fail, done; 2649 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2650 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2651 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2652 2653 // Empty array: Nothing to do. 
2654 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2655 __ retl(); 2656 __ delayed()->set(0, O0); // return 0 on (trivial) success 2657 2658 // ======== begin loop ======== 2659 // (Loop is rotated; its entry is load_element.) 2660 // Loop variables: 2661 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2662 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2663 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2664 __ align(OptoLoopAlignment); 2665 2666 __ BIND(store_element); 2667 __ deccc(G1_remain); // decrement the count 2668 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2669 __ inc(O5_offset, heapOopSize); // step to next offset 2670 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 2671 __ delayed()->set(0, O0); // return 0 on success 2672 2673 // ======== loop entry is here ======== 2674 __ BIND(load_element); 2675 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2676 __ br_null(G3_oop, true, Assembler::pt, store_element); 2677 __ delayed()->nop(); 2678 2679 __ load_klass(G3_oop, G4_klass); // query the object klass 2680 2681 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2682 // branch to this on success: 2683 store_element); 2684 // ======== end loop ======== 2685 2686 // It was a real error; we must depend on the caller to finish the job. 2687 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2688 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2689 // and report their number to the caller. 2690 __ BIND(fail); 2691 __ subcc(O2_count, G1_remain, O2_count); 2692 __ brx(Assembler::zero, false, Assembler::pt, done); 2693 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2694 2695 __ BIND(do_card_marks); 2696 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 2697 2698 __ BIND(done); 2699 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2700 __ retl(); 2701 __ delayed()->nop(); // return value in O0 2702 2703 return start; 2704 } 2705 2706 2707 // Generate 'unsafe' array copy stub 2708 // Though just as safe as the other stubs, it takes an unscaled 2709 // size_t argument instead of an element count. 2710 // 2711 // Arguments for generated stub: 2712 // from: O0 2713 // to: O1 2714 // count: O2 byte count, treated as ssize_t, can be zero 2715 // 2716 // Examines the alignment of the operands and dispatches 2717 // to a long, int, short, or byte copy loop.
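// As an informal sketch of that dispatch (helper names are descriptive only,
// not real entry point symbols):
//
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)byte_count;
//   if      ((bits & (BytesPerLong  - 1)) == 0) jlong_copy (from, to, byte_count >> LogBytesPerLong);
//   else if ((bits & (BytesPerInt   - 1)) == 0) jint_copy  (from, to, byte_count >> LogBytesPerInt);
//   else if ((bits & (BytesPerShort - 1)) == 0) jshort_copy(from, to, byte_count >> LogBytesPerShort);
//   else                                        jbyte_copy (from, to, byte_count);
//
// The element-count scaling is done in the branch delay slots and control is
// transferred directly (tail-call style) to the chosen arraycopy entry.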
2718 // 2719 address generate_unsafe_copy(const char* name, 2720 address byte_copy_entry, 2721 address short_copy_entry, 2722 address int_copy_entry, 2723 address long_copy_entry) { 2724 2725 const Register O0_from = O0; // source array address 2726 const Register O1_to = O1; // destination array address 2727 const Register O2_count = O2; // elements count 2728 2729 const Register G1_bits = G1; // test copy of low bits 2730 2731 __ align(CodeEntryAlignment); 2732 StubCodeMark mark(this, "StubRoutines", name); 2733 address start = __ pc(); 2734 2735 // bump this on entry, not on exit: 2736 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2737 2738 __ or3(O0_from, O1_to, G1_bits); 2739 __ or3(O2_count, G1_bits, G1_bits); 2740 2741 __ btst(BytesPerLong-1, G1_bits); 2742 __ br(Assembler::zero, true, Assembler::pt, 2743 long_copy_entry, relocInfo::runtime_call_type); 2744 // scale the count on the way out: 2745 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2746 2747 __ btst(BytesPerInt-1, G1_bits); 2748 __ br(Assembler::zero, true, Assembler::pt, 2749 int_copy_entry, relocInfo::runtime_call_type); 2750 // scale the count on the way out: 2751 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2752 2753 __ btst(BytesPerShort-1, G1_bits); 2754 __ br(Assembler::zero, true, Assembler::pt, 2755 short_copy_entry, relocInfo::runtime_call_type); 2756 // scale the count on the way out: 2757 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2758 2759 __ br(Assembler::always, false, Assembler::pt, 2760 byte_copy_entry, relocInfo::runtime_call_type); 2761 __ delayed()->nop(); 2762 2763 return start; 2764 } 2765 2766 2767 // Perform range checks on the proposed arraycopy. 2768 // Kills the two temps, but nothing else. 2769 // Also, clean the sign bits of src_pos and dst_pos. 2770 void arraycopy_range_checks(Register src, // source array oop (O0) 2771 Register src_pos, // source position (O1) 2772 Register dst, // destination array oo (O2) 2773 Register dst_pos, // destination position (O3) 2774 Register length, // length of copy (O4) 2775 Register temp1, Register temp2, 2776 Label& L_failed) { 2777 BLOCK_COMMENT("arraycopy_range_checks:"); 2778 2779 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2780 2781 const Register array_length = temp1; // scratch 2782 const Register end_pos = temp2; // scratch 2783 2784 // Note: This next instruction may be in the delay slot of a branch: 2785 __ add(length, src_pos, end_pos); // src_pos + length 2786 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2787 __ cmp(end_pos, array_length); 2788 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2789 2790 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2791 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2792 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2793 __ cmp(end_pos, array_length); 2794 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2795 2796 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2797 // Move with sign extension can be used since they are positive. 
2798 __ delayed()->signx(src_pos, src_pos); 2799 __ signx(dst_pos, dst_pos); 2800 2801 BLOCK_COMMENT("arraycopy_range_checks done"); 2802 } 2803 2804 2805 // 2806 // Generate generic array copy stubs 2807 // 2808 // Input: 2809 // O0 - src oop 2810 // O1 - src_pos 2811 // O2 - dst oop 2812 // O3 - dst_pos 2813 // O4 - element count 2814 // 2815 // Output: 2816 // O0 == 0 - success 2817 // O0 == -1 - need to call System.arraycopy 2818 // 2819 address generate_generic_copy(const char *name, 2820 address entry_jbyte_arraycopy, 2821 address entry_jshort_arraycopy, 2822 address entry_jint_arraycopy, 2823 address entry_oop_arraycopy, 2824 address entry_jlong_arraycopy, 2825 address entry_checkcast_arraycopy) { 2826 Label L_failed, L_objArray; 2827 2828 // Input registers 2829 const Register src = O0; // source array oop 2830 const Register src_pos = O1; // source position 2831 const Register dst = O2; // destination array oop 2832 const Register dst_pos = O3; // destination position 2833 const Register length = O4; // elements count 2834 2835 // registers used as temp 2836 const Register G3_src_klass = G3; // source array klass 2837 const Register G4_dst_klass = G4; // destination array klass 2838 const Register G5_lh = G5; // layout handler 2839 const Register O5_temp = O5; 2840 2841 __ align(CodeEntryAlignment); 2842 StubCodeMark mark(this, "StubRoutines", name); 2843 address start = __ pc(); 2844 2845 // bump this on entry, not on exit: 2846 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2847 2848 // In principle, the int arguments could be dirty. 2849 //assert_clean_int(src_pos, G1); 2850 //assert_clean_int(dst_pos, G1); 2851 //assert_clean_int(length, G1); 2852 2853 //----------------------------------------------------------------------- 2854 // Assembler stubs will be used for this call to arraycopy 2855 // if the following conditions are met: 2856 // 2857 // (1) src and dst must not be null. 2858 // (2) src_pos must not be negative. 2859 // (3) dst_pos must not be negative. 2860 // (4) length must not be negative. 2861 // (5) src klass and dst klass should be the same and not NULL. 2862 // (6) src and dst should be arrays. 2863 // (7) src_pos + length must not exceed length of src. 2864 // (8) dst_pos + length must not exceed length of dst. 2865 BLOCK_COMMENT("arraycopy initial argument checks"); 2866 2867 // if (src == NULL) return -1; 2868 __ br_null(src, false, Assembler::pn, L_failed); 2869 2870 // if (src_pos < 0) return -1; 2871 __ delayed()->tst(src_pos); 2872 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2873 __ delayed()->nop(); 2874 2875 // if (dst == NULL) return -1; 2876 __ br_null(dst, false, Assembler::pn, L_failed); 2877 2878 // if (dst_pos < 0) return -1; 2879 __ delayed()->tst(dst_pos); 2880 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2881 2882 // if (length < 0) return -1; 2883 __ delayed()->tst(length); 2884 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2885 2886 BLOCK_COMMENT("arraycopy argument klass checks"); 2887 // get src->klass() 2888 if (UseCompressedOops) { 2889 __ delayed()->nop(); // ??? 
not good 2890 __ load_klass(src, G3_src_klass); 2891 } else { 2892 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 2893 } 2894 2895 #ifdef ASSERT 2896 // assert(src->klass() != NULL); 2897 BLOCK_COMMENT("assert klasses not null"); 2898 { Label L_a, L_b; 2899 __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL 2900 __ delayed()->nop(); 2901 __ bind(L_a); 2902 __ stop("broken null klass"); 2903 __ bind(L_b); 2904 __ load_klass(dst, G4_dst_klass); 2905 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 2906 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 2907 BLOCK_COMMENT("assert done"); 2908 } 2909 #endif 2910 2911 // Load layout helper 2912 // 2913 // |array_tag| | header_size | element_type | |log2_element_size| 2914 // 32 30 24 16 8 2 0 2915 // 2916 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2917 // 2918 2919 int lh_offset = klassOopDesc::header_size() * HeapWordSize + 2920 Klass::layout_helper_offset_in_bytes(); 2921 2922 // Load 32-bits signed value. Use br() instruction with it to check icc. 2923 __ lduw(G3_src_klass, lh_offset, G5_lh); 2924 2925 if (UseCompressedOops) { 2926 __ load_klass(dst, G4_dst_klass); 2927 } 2928 // Handle objArrays completely differently... 2929 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2930 __ set(objArray_lh, O5_temp); 2931 __ cmp(G5_lh, O5_temp); 2932 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 2933 if (UseCompressedOops) { 2934 __ delayed()->nop(); 2935 } else { 2936 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 2937 } 2938 2939 // if (src->klass() != dst->klass()) return -1; 2940 __ cmp(G3_src_klass, G4_dst_klass); 2941 __ brx(Assembler::notEqual, false, Assembler::pn, L_failed); 2942 __ delayed()->nop(); 2943 2944 // if (!src->is_Array()) return -1; 2945 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 2946 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 2947 2948 // At this point, it is known to be a typeArray (array_tag 0x3). 
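// Informal sketch of the layout-helper decoding performed below (field and
// variable names shortened for readability):
//
//   int lh          = klass->layout_helper();                    // G5_lh
//   int hdr_bytes   = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
//   int log2_elsize =  lh & _lh_log2_element_size_mask;
//   from = (address)src + hdr_bytes + ((intptr_t)src_pos << log2_elsize);
//   to   = (address)dst + hdr_bytes + ((intptr_t)dst_pos << log2_elsize);
//
// after which the byte/short/int/long copy stub is chosen purely from
// log2_elsize.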
2949 #ifdef ASSERT 2950 __ delayed()->nop(); 2951 { Label L; 2952 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2953 __ set(lh_prim_tag_in_place, O5_temp); 2954 __ cmp(G5_lh, O5_temp); 2955 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 2956 __ delayed()->nop(); 2957 __ stop("must be a primitive array"); 2958 __ bind(L); 2959 } 2960 #else 2961 __ delayed(); // match next insn to prev branch 2962 #endif 2963 2964 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2965 O5_temp, G4_dst_klass, L_failed); 2966 2967 // typeArrayKlass 2968 // 2969 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2970 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2971 // 2972 2973 const Register G4_offset = G4_dst_klass; // array offset 2974 const Register G3_elsize = G3_src_klass; // log2 element size 2975 2976 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 2977 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 2978 __ add(src, G4_offset, src); // src array offset 2979 __ add(dst, G4_offset, dst); // dst array offset 2980 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 2981 2982 // next registers should be set before the jump to corresponding stub 2983 const Register from = O0; // source array address 2984 const Register to = O1; // destination array address 2985 const Register count = O2; // elements count 2986 2987 // 'from', 'to', 'count' registers should be set in this order 2988 // since they are the same as 'src', 'src_pos', 'dst'. 2989 2990 BLOCK_COMMENT("scale indexes to element size"); 2991 __ sll_ptr(src_pos, G3_elsize, src_pos); 2992 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 2993 __ add(src, src_pos, from); // src_addr 2994 __ add(dst, dst_pos, to); // dst_addr 2995 2996 BLOCK_COMMENT("choose copy loop based on element size"); 2997 __ cmp(G3_elsize, 0); 2998 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 2999 __ delayed()->signx(length, count); // length 3000 3001 __ cmp(G3_elsize, LogBytesPerShort); 3002 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 3003 __ delayed()->signx(length, count); // length 3004 3005 __ cmp(G3_elsize, LogBytesPerInt); 3006 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 3007 __ delayed()->signx(length, count); // length 3008 #ifdef ASSERT 3009 { Label L; 3010 __ cmp(G3_elsize, LogBytesPerLong); 3011 __ br(Assembler::equal, false, Assembler::pt, L); 3012 __ delayed()->nop(); 3013 __ stop("must be long copy, but elsize is wrong"); 3014 __ bind(L); 3015 } 3016 #endif 3017 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 3018 __ delayed()->signx(length, count); // length 3019 3020 // objArrayKlass 3021 __ BIND(L_objArray); 3022 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 3023 3024 Label L_plain_copy, L_checkcast_copy; 3025 // test array classes for subtyping 3026 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 3027 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 3028 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 3029 3030 // Identically typed arrays can be copied without element-wise checks. 
3031 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3032 O5_temp, G5_lh, L_failed); 3033 3034 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3035 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3036 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3037 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3038 __ add(src, src_pos, from); // src_addr 3039 __ add(dst, dst_pos, to); // dst_addr 3040 __ BIND(L_plain_copy); 3041 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 3042 __ delayed()->signx(length, count); // length 3043 3044 __ BIND(L_checkcast_copy); 3045 // live at this point: G3_src_klass, G4_dst_klass 3046 { 3047 // Before looking at dst.length, make sure dst is also an objArray. 3048 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 3049 __ cmp(G5_lh, O5_temp); 3050 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 3051 3052 // It is safe to examine both src.length and dst.length. 3053 __ delayed(); // match next insn to prev branch 3054 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3055 O5_temp, G5_lh, L_failed); 3056 3057 // Marshal the base address arguments now, freeing registers. 3058 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3059 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3060 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3061 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3062 __ add(src, src_pos, from); // src_addr 3063 __ add(dst, dst_pos, to); // dst_addr 3064 __ signx(length, count); // length (reloaded) 3065 3066 Register sco_temp = O3; // this register is free now 3067 assert_different_registers(from, to, count, sco_temp, 3068 G4_dst_klass, G3_src_klass); 3069 3070 // Generate the type check. 3071 int sco_offset = (klassOopDesc::header_size() * HeapWordSize + 3072 Klass::super_check_offset_offset_in_bytes()); 3073 __ lduw(G4_dst_klass, sco_offset, sco_temp); 3074 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 3075 O5_temp, L_plain_copy); 3076 3077 // Fetch destination element klass from the objArrayKlass header. 
3078 int ek_offset = (klassOopDesc::header_size() * HeapWordSize + 3079 objArrayKlass::element_klass_offset_in_bytes()); 3080 3081 // the checkcast_copy loop needs two extra arguments: 3082 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 3083 // lduw(O4, sco_offset, O3); // sco of elem klass 3084 3085 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 3086 __ delayed()->lduw(O4, sco_offset, O3); 3087 } 3088 3089 __ BIND(L_failed); 3090 __ retl(); 3091 __ delayed()->sub(G0, 1, O0); // return -1 3092 return start; 3093 } 3094 3095 void generate_arraycopy_stubs() { 3096 address entry; 3097 address entry_jbyte_arraycopy; 3098 address entry_jshort_arraycopy; 3099 address entry_jint_arraycopy; 3100 address entry_oop_arraycopy; 3101 address entry_jlong_arraycopy; 3102 address entry_checkcast_arraycopy; 3103 3104 //*** jbyte 3105 // Always need aligned and unaligned versions 3106 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3107 "jbyte_disjoint_arraycopy"); 3108 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 3109 &entry_jbyte_arraycopy, 3110 "jbyte_arraycopy"); 3111 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 3112 "arrayof_jbyte_disjoint_arraycopy"); 3113 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 3114 "arrayof_jbyte_arraycopy"); 3115 3116 //*** jshort 3117 // Always need aligned and unaligned versions 3118 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3119 "jshort_disjoint_arraycopy"); 3120 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 3121 &entry_jshort_arraycopy, 3122 "jshort_arraycopy"); 3123 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 3124 "arrayof_jshort_disjoint_arraycopy"); 3125 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 3126 "arrayof_jshort_arraycopy"); 3127 3128 //*** jint 3129 // Aligned versions 3130 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 3131 "arrayof_jint_disjoint_arraycopy"); 3132 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 3133 "arrayof_jint_arraycopy"); 3134 #ifdef _LP64 3135 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 3136 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it). 3137 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 3138 "jint_disjoint_arraycopy"); 3139 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 3140 &entry_jint_arraycopy, 3141 "jint_arraycopy"); 3142 #else 3143 // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version 3144 // (in fact in 32bit we always have a pre-loop part even in the aligned version, 3145 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored). 
3146 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; 3147 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; 3148 #endif 3149 3150 3151 //*** jlong 3152 // It is always aligned 3153 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 3154 "arrayof_jlong_disjoint_arraycopy"); 3155 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 3156 "arrayof_jlong_arraycopy"); 3157 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 3158 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 3159 3160 3161 //*** oops 3162 // Aligned versions 3163 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 3164 "arrayof_oop_disjoint_arraycopy"); 3165 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 3166 "arrayof_oop_arraycopy"); 3167 // Aligned versions without pre-barriers 3168 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 3169 "arrayof_oop_disjoint_arraycopy_uninit", 3170 /*dest_uninitialized*/true); 3171 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 3172 "arrayof_oop_arraycopy_uninit", 3173 /*dest_uninitialized*/true); 3174 #ifdef _LP64 3175 if (UseCompressedOops) { 3176 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 3177 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 3178 "oop_disjoint_arraycopy"); 3179 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 3180 "oop_arraycopy"); 3181 // Unaligned versions without pre-barriers 3182 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 3183 "oop_disjoint_arraycopy_uninit", 3184 /*dest_uninitialized*/true); 3185 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3186 "oop_arraycopy_uninit", 3187 /*dest_uninitialized*/true); 3188 } else 3189 #endif 3190 { 3191 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3192 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3193 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3194 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3195 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3196 } 3197 3198 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3199 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3200 /*dest_uninitialized*/true); 3201 3202 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3203 entry_jbyte_arraycopy, 3204 entry_jshort_arraycopy, 3205 entry_jint_arraycopy, 3206 entry_jlong_arraycopy); 3207 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3208 entry_jbyte_arraycopy, 3209 entry_jshort_arraycopy, 3210 entry_jint_arraycopy, 3211 entry_oop_arraycopy, 3212 entry_jlong_arraycopy, 3213 entry_checkcast_arraycopy); 3214 3215 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3216 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3217 
StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3218 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3219 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3220 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3221 } 3222 3223 void generate_initial() { 3224 // Generates all stubs and initializes the entry points 3225 3226 //------------------------------------------------------------------------------------------------------------------------ 3227 // entry points that exist in all platforms 3228 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 3229 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 3230 StubRoutines::_forward_exception_entry = generate_forward_exception(); 3231 3232 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address); 3233 StubRoutines::_catch_exception_entry = generate_catch_exception(); 3234 3235 //------------------------------------------------------------------------------------------------------------------------ 3236 // entry points that are platform specific 3237 StubRoutines::Sparc::_test_stop_entry = generate_test_stop(); 3238 3239 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine(); 3240 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows(); 3241 3242 #if !defined(COMPILER2) && !defined(_LP64) 3243 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 3244 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 3245 StubRoutines::_atomic_add_entry = generate_atomic_add(); 3246 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry; 3247 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry; 3248 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 3249 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry; 3250 #endif // COMPILER2 !=> _LP64 3251 3252 // Build this early so it's available for the interpreter. The 3253 // stub expects the required and actual type to already be in O1 3254 // and O2 respectively. 3255 StubRoutines::_throw_WrongMethodTypeException_entry = 3256 generate_throw_exception("WrongMethodTypeException throw_exception", 3257 CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException), 3258 false, G5_method_type, G3_method_handle); 3259 } 3260 3261 3262 void generate_all() { 3263 // Generates all stubs and initializes the entry points 3264 3265 // Generate partial_subtype_check first here since its code depends on 3266 // UseZeroBaseCompressedOops which is defined after heap initialization. 
3267 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check(); 3268 // These entry points require SharedInfo::stack0 to be set up in non-core builds 3269 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false); 3270 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false); 3271 StubRoutines::_throw_ArithmeticException_entry = generate_throw_exception("ArithmeticException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException), true); 3272 StubRoutines::_throw_NullPointerException_entry = generate_throw_exception("NullPointerException throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true); 3273 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false); 3274 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false); 3275 3276 StubRoutines::_handler_for_unsafe_access_entry = 3277 generate_handler_for_unsafe_access(); 3278 3279 // support for verify_oop (must happen after universe_init) 3280 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine(); 3281 3282 // arraycopy stubs used by compilers 3283 generate_arraycopy_stubs(); 3284 3285 // Don't initialize the platform math functions since sparc 3286 // doesn't have intrinsics for these operations. 3287 } 3288 3289 3290 public: 3291 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 3292 // replace the standard masm with a special one: 3293 _masm = new MacroAssembler(code); 3294 3295 _stub_count = !all ? 0x100 : 0x200; 3296 if (all) { 3297 generate_all(); 3298 } else { 3299 generate_initial(); 3300 } 3301 3302 // make sure this stub is available for all local calls 3303 if (_atomic_add_stub.is_unbound()) { 3304 // generate a second time, if necessary 3305 (void) generate_atomic_add(); 3306 } 3307 } 3308 3309 3310 private: 3311 int _stub_count; 3312 void stub_prolog(StubCodeDesc* cdesc) { 3313 # ifdef ASSERT 3314 // put extra information in the stub code, to make it more readable 3315 #ifdef _LP64 3316 // Write the high part of the address 3317 // [RGV] Check if there is a dependency on the size of this prolog 3318 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none); 3319 #endif 3320 __ emit_data((intptr_t)cdesc, relocInfo::none); 3321 __ emit_data(++_stub_count, relocInfo::none); 3322 # endif 3323 align(true); 3324 } 3325 3326 void align(bool at_header = false) { 3327 // %%%%% move this constant somewhere else 3328 // UltraSPARC cache line size is 8 instructions: 3329 const unsigned int icache_line_size = 32; 3330 const unsigned int icache_half_line_size = 16; 3331 3332 if (at_header) { 3333 while ((intptr_t)(__ pc()) % icache_line_size != 0) { 3334 __ emit_data(0, relocInfo::none); 3335 } 3336 } else { 3337 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) { 3338 __ nop(); 3339 } 3340 } 3341 } 3342 3343 }; // end class declaration 3344 3345 void StubGenerator_generate(CodeBuffer* code, bool all) { 3346 StubGenerator g(code, all); 3347 }
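// Footnote (informal): the padding loops in align() above just round the
// current code position up to a 16- or 32-byte boundary, roughly
//
//   while (((uintptr_t)pc % boundary) != 0) emit_padding();   // nop or 0 word
//
// so that stub headers start on a full UltraSPARC icache line and other
// entry points start on a half line.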