/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_sparc.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/top.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "thread_linux.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_solaris
# include "thread_solaris.inline.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note: The register L7 is used as L7_thread_cache, and may not be used
//       any other way within this module.
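
// For reference, a minimal sketch of how the shorthand above reads in the
// generators below (illustration only, nothing here is compiled):
//
//   __ save_frame(0);    // expands to: _masm->save_frame(0);
//   __ BIND(L_exit);     // expands to: _masm->bind(L_exit);
//                        //             _masm->block_comment("L_exit:");
//
// so each stub generator is just a sequence of MacroAssembler calls that
// emit SPARC instructions into the current CodeBuffer.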


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = thread->saved_exception_npc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine: return garbage from the load

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c) (0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  param. size  |
    // +---------------+ <--- sp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is a multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // |  param. size  |
    // +---------------+ <--- fp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);           // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t);  // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);     // compute number of bytes
    __ sub(FP, t, Gargs);                               // setup parameter pointer
#ifdef _LP64
    __ add( Gargs, STACK_BIAS, Gargs );                 // Account for LP64 stack bias
#endif
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert that SP is unchanged.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
#ifdef _LP64
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);      // store entire long
#else
#if defined(COMPILER2)
      // All return values are where we want them, except for Longs.  C2 returns
      // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
      // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
      // build we simply always use G1.
      // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
      // do this here.  Unfortunately if we did a rethrow we'd see a MachEpilog node
      // first which would move g1 -> O0/O1 and destroy the exception we were throwing.

      __ ba(exit);
      __ delayed()->stx(G1, addr, G0);  // store entire long
#else
      __ st(O1, addr, BytesPerInt);
      __ ba(exit);
      __ delayed()->st(O0, addr, G0);
#endif /* COMPILER2 */
#endif /* _LP64 */
    }
    return start;
  }
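
  // For orientation, the C++ side enters the stub above through a function
  // pointer whose shape mirrors the "incoming arguments" list at the top of
  // generate_call_stub().  A minimal sketch (see stubRoutines.hpp for the
  // authoritative typedef; the parameter names here are illustrative):
  //
  //   typedef void (*CallStub)(address   link,          // call wrapper (O0)
  //                            intptr_t* result,        // result buffer (O1)
  //                            BasicType result_type,   // tag dispatched above (O2)
  //                            Method*   method,        // callee (O3)
  //                            address   entry_point,   // interpreter entry (O4)
  //                            intptr_t* parameters,    // argument words (O5)
  //                            int       size_of_parameters,
  //                            TRAPS);                  // current thread
  //
  // JavaCalls::call_helper() invokes StubRoutines::call_stub() with these
  // arguments, and the stub stores the Java result through 'result' using
  // the type dispatch generated above.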


  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers are assumed volatile by the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size = 32;

    CodeBuffer code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();               // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }

#undef __
#define __ _masm->
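
  // How this generator is typically wired up (a sketch of the
  // generate_all()-style initialization used elsewhere in this file; the
  // exact set of throw stubs registered may differ by JDK version):
  //
  //   StubRoutines::_throw_AbstractMethodError_entry =
  //       generate_throw_exception("AbstractMethodError throw_exception",
  //                                CAST_FROM_FN_PTR(address,
  //                                    SharedRuntime::throw_AbstractMethodError));
  //
  // Each call produces a small RuntimeStub that enters the VM, asks the
  // runtime entry to post the exception, and then falls into
  // forward_exception_entry to dispatch it.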

  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20;  i < 32;  i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0;  i < 8;  ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flush_windows();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Helper functions for v8 atomic operations.
  //
  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
    if (mark_oop_reg == noreg) {
      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
    } else {
      assert(scratch_reg != noreg, "just checking");
      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
    }
  }

  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {

    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
    __ set(StubRoutines::Sparc::locked, lock_reg);
    // Initialize yield counter
    __ mov(G0, yield_reg);

    __ BIND(retry);
    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);

    // This code can only be called from inside the VM; this
    // stub is only invoked from Atomic::add().  We do not
    // want to use call_VM, because _last_java_sp and such
    // must already be set.
    //
    // Save the regs and make space for a C call
    __ save(SP, -96, SP);
    __ save_all_globals_into_locals();
    BLOCK_COMMENT("call os::naked_sleep");
    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
    __ delayed()->nop();
    __ restore_globals_from_locals();
    __ restore();
    // reset the counter
    __ mov(G0, yield_reg);

    __ BIND(dontyield);

    // try to get lock
    __ swap(lock_ptr_reg, 0, lock_reg);

    // did we get the lock?
    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
    __ br(Assembler::notEqual, true, Assembler::pn, retry);
    __ delayed()->add(yield_reg, 1, yield_reg);

    // yes, got lock.  do the operation here.
  }

  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
    __ st(lock_reg, lock_ptr_reg, 0);  // unlock
  }

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas_under_lock(O1, O2, O3,
          (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller

    } else {
      if (VM_Version::v9_instructions_work()) {
        __ retl(false);
        __ delayed()->swap(O1, 0, O0);
      } else {
        const Register& lock_reg     = O2;
        const Register& lock_ptr_reg = O3;
        const Register& yield_reg    = O4;

        Label retry;
        Label dontyield;

        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        // got the lock, do the swap
        __ swap(O1, 0, O0);

        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        __ retl(false);
        __ delayed()->nop();
      }
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas_under_lock(O1, O2, O0,
        (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

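  // Semantics of the stub above, as C pseudocode (a descriptive sketch only;
  // the real callers are the Atomic:: wrappers in the runtime):
  //
  //   jint atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                       jint compare_value) {
  //     jint old = *dest;                   // performed atomically
  //     if (old == compare_value) *dest = exchange_value;
  //     return old;                         // caller compares to compare_value
  //   }
  //
  // On V9 hardware cas_under_lock can use a single compare-and-swap
  // instruction; on V8 it falls back to the spin lock built by
  // generate_v8_lock_prologue() above.
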
  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // This only works on V9; on V8 we don't generate any
  // code and just return NULL.
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    if (!VM_Version::supports_cx8())
      return NULL;
    __ sllx(O0, 32, O0);        // high word of exchange_value into bits 63..32
    __ srl(O1, 0, O1);          // zero-extend its low word
    __ or3(O0, O1, O0);         // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);         // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);        // compare *dest with O3; if equal swap in O0
    __ srl(O0, 0, O1);          // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }


  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites (v9): O3
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    if (VM_Version::v9_instructions_work()) {
      Label retry;
      __ BIND(retry);

      __ lduw(O1, 0, O2);
      __ add(O0, O2, O3);
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
      __ retl(false);
      __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
    } else {
      const Register& lock_reg     = O2;
      const Register& lock_ptr_reg = O3;
      const Register& value_reg    = O4;
      const Register& yield_reg    = O5;

      Label retry;
      Label dontyield;

      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
      // got lock, do the increment
      __ ld(O1, 0, value_reg);
      __ add(O0, value_reg, value_reg);
      __ st(value_reg, O1, 0);

      // %%% only for RMO and PSO
      __ membar(Assembler::StoreStore);

      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);

      __ retl(false);
      __ delayed()->mov(value_reg, O0);
    }

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs


  //------------------------------------------------------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
  //
  // Arguments :
  //
  //      trapping PC:    O7
  //
  // Results:
  //     posts an asynchronous exception, skips the trapping instruction
  //

  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    const int preserve_register_words = (64 * 2);
    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);

    Register Lthread = L7_thread_cache;
    int i;

    __ save_frame(0);
    __ mov(G1, L1);
    __ mov(G2, L2);
    __ mov(G3, L3);
    __ mov(G4, L4);
    __ mov(G5, L5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
    }

    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(entry_point, relocInfo::runtime_call_type);
    __ delayed()->nop();

    __ mov(L1, G1);
    __ mov(L2, G2);
    __ mov(L3, G3);
    __ mov(L4, G4);
    __ mov(L5, G5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
    }

    __ verify_thread();

    __ jmp(O0, 0);
    __ delayed()->restore();

    return start;
  }


  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

#if defined(COMPILER2) && !defined(_LP64)
    // Do not use a 'save' because it blows the 64-bit O registers.
    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
    Register Rret   = O0;
    Register Rsub   = O1;
    Register Rsuper = O2;
#else
    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;
#endif

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is zero; flags set to Z
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();
#endif

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is != 0; flags set to NZ
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();
#endif

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it could be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT) && defined(_LP64)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  // Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
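
  // A note on the overlap test above (added commentary, not in the original
  // comments): with to_from = to - from, the two unsigned branches encode
  //
  //   to <= from                 -- forward copy cannot clobber unread source
  //   to - from >= byte_count    -- regions are disjoint (to >= from + size)
  //
  // Only when  from < to < from + byte_count  does control fall through to
  // the backward (conjoint) copy code.  For example, copying 8 ints from
  // address 0x100 to 0x108 gives to_from = 8 < byte_count = 32, so the
  // overlapping tail must be copied from the end.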

  //
  //  Generate pre-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  //
  //  Generate post-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
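          // (Added commentary:) at this point 'addr' is the address of the
          // first oop stored and 'count' the address of the last one.  The
          // two shifts below turn both addresses into card indices, so
          // count - addr becomes the number of cards to dirty minus one, and
          // the loop stores a zero byte ("dirty") into each entry of
          // byte_map_base[first_card .. last_card].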
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
          __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }

  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for imm.
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes+iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, prefetch_count, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ add(count, prefetch_count, count); // restore count

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, prefetch_count, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ add(count, prefetch_count, count); // restore count

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift,  O3);
  }

  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source arrays
  //   to        - destination array aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift,  O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source arrays end address
  //   end_to    - destination array end address aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                     Register count, int count_dec,
                     Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ andn(end_from, 7, end_from);     // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }

  //
  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4 bytes alignment in 32-bit VMs
      // and 8 bytes in 64-bit VMs.
      // So we do it only for the 32-bit VM.
      //
#ifndef _LP64
      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 4);
      __ st(O3, to, -4);
      __ BIND(L_skip_alignment);
#endif
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
    }

    // Both arrays are 8 bytes aligned, copy 16 bytes at a time
    __ and3(count, 7, G4); // Save count
    __ srl(count, 3, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ ldub(from, offset, O3);
    __ deccc(count);
    __ stb(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->inc(offset);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
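
    // (Added note:) a conjoint copy must handle overlapping regions.  When
    // 'to' lies inside [from, from + count), a forward loop would overwrite
    // source bytes before reading them -- e.g. copying bytes 0..7 to 1..8
    // forward would propagate byte 0 through the whole range.  The stub
    // therefore tests for overlap below and, in the overlapping case,
    // copies from the last element down to the first.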

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align end of arrays since they could be not aligned even
      // when the arrays themselves are aligned.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (aligned) {
      // Both arrays are aligned to 8-bytes in 64-bit VMs.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // only in the unaligned case.
      __ dec(count, 16);
    } else
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtracting 16 from 'count' before jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 16 elements (16 bytes) at a time
    __ align(OptoLoopAlignment);
    __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for disjoint short copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4 bytes alignment in 32-bit VMs
      // and 8 bytes in 64-bit VMs.
      //
#ifndef _LP64
      // copy a 2-element word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 2);
      __ st(O3, to, -4);
      __ BIND(L_skip_alignment);
#endif
    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
      __ andcc(to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(from, 0, O3);
      __ inc(from, 2);
      __ inc(to, 2);
      __ dec(count);
      __ sth(O3, to, -2);
      __ BIND(L_skip_alignment);

      // copy 2 elements to align 'to' on an 8 byte boundary
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(from, 0, O3);
      __ dec(count, 2);
      __ lduh(from, 2, O4);
      __ inc(from, 4);
      __ inc(to, 4);
      __ sth(O3, to, -4);
      __ sth(O4, to, -2);
      __ BIND(L_skip_alignment2);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.
1680 1681 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 1682 } 1683 1684 // Both arrays are 8-byte aligned; copy 16 bytes at a time 1685 __ and3(count, 3, G4); // Save 1686 __ srl(count, 2, count); 1687 generate_disjoint_long_copy_core(aligned); 1688 __ mov(G4, count); // restore 1689 1690 // copy 1 element at a time 1691 __ BIND(L_copy_2_bytes); 1692 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1693 __ align(OptoLoopAlignment); 1694 __ BIND(L_copy_2_bytes_loop); 1695 __ lduh(from, offset, O3); 1696 __ deccc(count); 1697 __ sth(O3, to, offset); 1698 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); 1699 __ delayed()->inc(offset, 2); 1700 1701 __ BIND(L_exit); 1702 // O3, O4 are used as temp registers 1703 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1704 __ retl(); 1705 __ delayed()->mov(G0, O0); // return 0 1706 return start; 1707 } 1708 1709 // 1710 // Generate stub for array fill (byte, short, or int). If "aligned" is true, the 1711 // "to" address is assumed to be heapword aligned. 1712 // 1713 // Arguments for generated stub: 1714 // to: O0 1715 // value: O1 1716 // count: O2 treated as signed 1717 // 1718 address generate_fill(BasicType t, bool aligned, const char* name) { 1719 __ align(CodeEntryAlignment); 1720 StubCodeMark mark(this, "StubRoutines", name); 1721 address start = __ pc(); 1722 1723 const Register to = O0; // destination array address 1724 const Register value = O1; // fill value 1725 const Register count = O2; // elements count 1726 // O3 is used as a temp register 1727 1728 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1729 1730 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; 1731 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes; 1732 1733 int shift = -1; 1734 switch (t) { 1735 case T_BYTE: 1736 shift = 2; 1737 break; 1738 case T_SHORT: 1739 shift = 1; 1740 break; 1741 case T_INT: 1742 shift = 0; 1743 break; 1744 default: ShouldNotReachHere(); 1745 } 1746 1747 BLOCK_COMMENT("Entry:"); 1748 1749 if (t == T_BYTE) { 1750 // Zero extend value 1751 __ and3(value, 0xff, value); 1752 __ sllx(value, 8, O3); 1753 __ or3(value, O3, value); 1754 } 1755 if (t == T_SHORT) { 1756 // Zero extend value 1757 __ sllx(value, 48, value); 1758 __ srlx(value, 48, value); 1759 } 1760 if (t == T_BYTE || t == T_SHORT) { 1761 __ sllx(value, 16, O3); 1762 __ or3(value, O3, value); 1763 } 1764 1765 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 1766 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp 1767 __ delayed()->andcc(count, 1, G0); 1768 1769 if (!aligned && (t == T_BYTE || t == T_SHORT)) { 1770 // align destination address on a 4-byte boundary 1771 if (t == T_BYTE) { 1772 // One-byte misalignment happens only for byte arrays 1773 __ andcc(to, 1, G0); 1774 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1); 1775 __ delayed()->nop(); 1776 __ stb(value, to, 0); 1777 __ inc(to, 1); 1778 __ dec(count, 1); 1779 __ BIND(L_skip_align1); 1780 } 1781 // Two-byte misalignment happens only for byte and short (char) arrays 1782 __ andcc(to, 2, G0); 1783 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); 1784 __ delayed()->nop(); 1785 __ sth(value, to, 0); 1786 __ inc(to, 2); 1787 __ dec(count, 1 << (shift - 1)); 1788 __ BIND(L_skip_align2); 1789 } 1790 #ifdef _LP64 1791 if (!aligned) { 1792 #endif 1793 // align to 8 bytes; we know we are 4-byte aligned to start 1794 __ andcc(to, 7, G0); 1795 __ br(Assembler::zero,
false, Assembler::pt, L_fill_32_bytes); 1796 __ delayed()->nop(); 1797 __ stw(value, to, 0); 1798 __ inc(to, 4); 1799 __ dec(count, 1 << shift); 1800 __ BIND(L_fill_32_bytes); 1801 #ifdef _LP64 1802 } 1803 #endif 1804 1805 if (t == T_INT) { 1806 // Zero extend value 1807 __ srl(value, 0, value); 1808 } 1809 if (t == T_BYTE || t == T_SHORT || t == T_INT) { 1810 __ sllx(value, 32, O3); 1811 __ or3(value, O3, value); 1812 } 1813 1814 Label L_check_fill_8_bytes; 1815 // Fill 32-byte chunks 1816 __ subcc(count, 8 << shift, count); 1817 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); 1818 __ delayed()->nop(); 1819 1820 Label L_fill_32_bytes_loop, L_fill_4_bytes; 1821 __ align(16); 1822 __ BIND(L_fill_32_bytes_loop); 1823 1824 __ stx(value, to, 0); 1825 __ stx(value, to, 8); 1826 __ stx(value, to, 16); 1827 __ stx(value, to, 24); 1828 1829 __ subcc(count, 8 << shift, count); 1830 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); 1831 __ delayed()->add(to, 32, to); 1832 1833 __ BIND(L_check_fill_8_bytes); 1834 __ addcc(count, 8 << shift, count); 1835 __ brx(Assembler::zero, false, Assembler::pn, L_exit); 1836 __ delayed()->subcc(count, 1 << (shift + 1), count); 1837 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); 1838 __ delayed()->andcc(count, 1<<shift, G0); 1839 1840 // 1841 // length is too short, just fill 8 bytes at a time 1842 // 1843 Label L_fill_8_bytes_loop; 1844 __ BIND(L_fill_8_bytes_loop); 1845 __ stx(value, to, 0); 1846 __ subcc(count, 1 << (shift + 1), count); 1847 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop); 1848 __ delayed()->add(to, 8, to); 1849 1850 // fill trailing 4 bytes 1851 __ andcc(count, 1<<shift, G0); // in delay slot of branches 1852 if (t == T_INT) { 1853 __ BIND(L_fill_elements); 1854 } 1855 __ BIND(L_fill_4_bytes); 1856 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes); 1857 if (t == T_BYTE || t == T_SHORT) { 1858 __ delayed()->andcc(count, 1<<(shift-1), G0); 1859 } else { 1860 __ delayed()->nop(); 1861 } 1862 __ stw(value, to, 0); 1863 if (t == T_BYTE || t == T_SHORT) { 1864 __ inc(to, 4); 1865 // fill trailing 2 bytes 1866 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches 1867 __ BIND(L_fill_2_bytes); 1868 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); 1869 __ delayed()->andcc(count, 1, count); 1870 __ sth(value, to, 0); 1871 if (t == T_BYTE) { 1872 __ inc(to, 2); 1873 // fill trailing byte 1874 __ andcc(count, 1, count); // in delay slot of branches 1875 __ BIND(L_fill_byte); 1876 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1877 __ delayed()->nop(); 1878 __ stb(value, to, 0); 1879 } else { 1880 __ BIND(L_fill_byte); 1881 } 1882 } else { 1883 __ BIND(L_fill_2_bytes); 1884 } 1885 __ BIND(L_exit); 1886 __ retl(); 1887 __ delayed()->nop(); 1888 1889 // Handle fills of fewer than 8 bytes. Int is handled elsewhere.
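    // The per-type element fills below follow this outline (an illustrative
    // C-like sketch only; 'count' is in elements and 'value' has already
    // been replicated across the word):
    //
    //   if (count & 1) { store one element;   advance 'to'; }
    //   if (count & 2) { store two elements;  advance 'to'; }
    //   if (count & 4) { store four elements; }               // byte fill only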
1890 if (t == T_BYTE) { 1891 __ BIND(L_fill_elements); 1892 Label L_fill_2, L_fill_4; 1893 // in delay slot __ andcc(count, 1, G0); 1894 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1895 __ delayed()->andcc(count, 2, G0); 1896 __ stb(value, to, 0); 1897 __ inc(to, 1); 1898 __ BIND(L_fill_2); 1899 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4); 1900 __ delayed()->andcc(count, 4, G0); 1901 __ stb(value, to, 0); 1902 __ stb(value, to, 1); 1903 __ inc(to, 2); 1904 __ BIND(L_fill_4); 1905 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1906 __ delayed()->nop(); 1907 __ stb(value, to, 0); 1908 __ stb(value, to, 1); 1909 __ stb(value, to, 2); 1910 __ retl(); 1911 __ delayed()->stb(value, to, 3); 1912 } 1913 1914 if (t == T_SHORT) { 1915 Label L_fill_2; 1916 __ BIND(L_fill_elements); 1917 // in delay slot __ andcc(count, 1, G0); 1918 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1919 __ delayed()->andcc(count, 2, G0); 1920 __ sth(value, to, 0); 1921 __ inc(to, 2); 1922 __ BIND(L_fill_2); 1923 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1924 __ delayed()->nop(); 1925 __ sth(value, to, 0); 1926 __ retl(); 1927 __ delayed()->sth(value, to, 2); 1928 } 1929 return start; 1930 } 1931 1932 // 1933 // Generate stub for conjoint short copy. If "aligned" is true, the 1934 // "from" and "to" addresses are assumed to be heapword aligned. 1935 // 1936 // Arguments for generated stub: 1937 // from: O0 1938 // to: O1 1939 // count: O2 treated as signed 1940 // 1941 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1942 address *entry, const char *name) { 1943 // Do reverse copy. 1944 1945 __ align(CodeEntryAlignment); 1946 StubCodeMark mark(this, "StubRoutines", name); 1947 address start = __ pc(); 1948 1949 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1950 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1951 1952 const Register from = O0; // source array address 1953 const Register to = O1; // destination array address 1954 const Register count = O2; // elements count 1955 const Register end_from = from; // source array end address 1956 const Register end_to = to; // destination array end address 1957 1958 const Register byte_count = O3; // byte count to copy 1959 1960 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1961 1962 if (entry != NULL) { 1963 *entry = __ pc(); 1964 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1965 BLOCK_COMMENT("Entry:"); 1966 } 1967 1968 array_overlap_test(nooverlap_target, 1); 1969 1970 __ sllx(count, LogBytesPerShort, byte_count); 1971 __ add(to, byte_count, end_to); // offset after last copied element 1972 1973 // for short arrays, just do single element copy 1974 __ cmp(count, 11); // 8 + 3 (22 bytes) 1975 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1976 __ delayed()->add(from, byte_count, end_from); 1977 1978 { 1979 // Align the ends of the arrays, since they may not be aligned even 1980 // when the arrays themselves are aligned.
1981 1982 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary 1983 __ andcc(end_to, 3, G0); 1984 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1985 __ delayed()->lduh(end_from, -2, O3); 1986 __ dec(end_from, 2); 1987 __ dec(end_to, 2); 1988 __ dec(count); 1989 __ sth(O3, end_to, 0); 1990 __ BIND(L_skip_alignment); 1991 1992 // copy 2 elements to align 'end_to' on an 8-byte boundary 1993 __ andcc(end_to, 7, G0); 1994 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1995 __ delayed()->lduh(end_from, -2, O3); 1996 __ dec(count, 2); 1997 __ lduh(end_from, -4, O4); 1998 __ dec(end_from, 4); 1999 __ dec(end_to, 4); 2000 __ sth(O3, end_to, 2); 2001 __ sth(O4, end_to, 0); 2002 __ BIND(L_skip_alignment2); 2003 } 2004 #ifdef _LP64 2005 if (aligned) { 2006 // Both arrays are aligned to 8 bytes in the 64-bit VM. 2007 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 2008 // in the unaligned case. 2009 __ dec(count, 8); 2010 } else 2011 #endif 2012 { 2013 // Copy with shift 16 bytes per iteration if the arrays do not have 2014 // the same alignment mod 8; otherwise jump to the aligned-copy code 2015 // below (subtracting 8 from 'count' before the jump). 2016 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 2017 // The shifted copy also jumps over the aligned copy once it completes. 2018 2019 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 2020 L_aligned_copy, L_copy_2_bytes); 2021 } 2022 // copy 8 elements (16 bytes) at a time 2023 __ align(OptoLoopAlignment); 2024 __ BIND(L_aligned_copy); 2025 __ dec(end_from, 16); 2026 __ ldx(end_from, 8, O3); 2027 __ ldx(end_from, 0, O4); 2028 __ dec(end_to, 16); 2029 __ deccc(count, 8); 2030 __ stx(O3, end_to, 8); 2031 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2032 __ delayed()->stx(O4, end_to, 0); 2033 __ inc(count, 8); 2034 2035 // copy 1 element (2 bytes) at a time 2036 __ BIND(L_copy_2_bytes); 2037 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2038 __ BIND(L_copy_2_bytes_loop); 2039 __ dec(end_from, 2); 2040 __ dec(end_to, 2); 2041 __ lduh(end_from, 0, O4); 2042 __ deccc(count); 2043 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 2044 __ delayed()->sth(O4, end_to, 0); 2045 2046 __ BIND(L_exit); 2047 // O3, O4 are used as temp registers 2048 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 2049 __ retl(); 2050 __ delayed()->mov(G0, O0); // return 0 2051 return start; 2052 } 2053 2054 // 2055 // Helper methods for generate_disjoint_int_copy_core() 2056 // 2057 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 2058 Label& L_loop, bool use_prefetch, bool use_bis) { 2059 2060 __ align(OptoLoopAlignment); 2061 __ BIND(L_loop); 2062 if (use_prefetch) { 2063 if (ArraycopySrcPrefetchDistance > 0) { 2064 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 2065 } 2066 if (ArraycopyDstPrefetchDistance > 0) { 2067 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 2068 } 2069 } 2070 __ ldx(from, 4, O4); 2071 __ ldx(from, 12, G4); 2072 __ inc(to, 16); 2073 __ inc(from, 16); 2074 __ deccc(count, 4); // Can we do next iteration after this one?
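    // The next six instructions merge halves of the two fresh 8-byte loads
    // with the half carried over in O3 from the previous iteration (SPARC is
    // big-endian). Roughly:
    //   O3 = O3 | (O4 >> 32);   O4 = (O4 << 32) | (G4 >> 32);
    // and the delay slot of the branch below reloads O3 = G4 << 32 for the
    // next pass.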
2075 2076 __ srlx(O4, 32, G3); 2077 __ bset(G3, O3); 2078 __ sllx(O4, 32, O4); 2079 __ srlx(G4, 32, G3); 2080 __ bset(G3, O4); 2081 if (use_bis) { 2082 __ stxa(O3, to, -16); 2083 __ stxa(O4, to, -8); 2084 } else { 2085 __ stx(O3, to, -16); 2086 __ stx(O4, to, -8); 2087 } 2088 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2089 __ delayed()->sllx(G4, 32, O3); 2090 2091 } 2092 2093 // 2094 // Generate core code for disjoint int copy (and oop copy on 32-bit). 2095 // If "aligned" is true, the "from" and "to" addresses are assumed 2096 // to be heapword aligned. 2097 // 2098 // Arguments: 2099 // from: O0 2100 // to: O1 2101 // count: O2 treated as signed 2102 // 2103 void generate_disjoint_int_copy_core(bool aligned) { 2104 2105 Label L_skip_alignment, L_aligned_copy; 2106 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2107 2108 const Register from = O0; // source array address 2109 const Register to = O1; // destination array address 2110 const Register count = O2; // elements count 2111 const Register offset = O5; // offset from start of arrays 2112 // O3, O4, G3, G4 are used as temp registers 2113 2114 // 'aligned' == true when it is known statically during compilation 2115 // of this arraycopy call site that both 'from' and 'to' addresses 2116 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 2117 // 2118 // Aligned arrays have 4-byte alignment in the 32-bit VM 2119 // and 8-byte alignment in the 64-bit VM. 2120 // 2121 #ifdef _LP64 2122 if (!aligned) 2123 #endif 2124 { 2125 // The next check could be put under 'ifndef' since the code in 2126 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'. 2127 2128 // for short arrays, just do single element copy 2129 __ cmp(count, 5); // 4 + 1 (20 bytes) 2130 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2131 __ delayed()->mov(G0, offset); 2132 2133 // copy 1 element to align 'to' on an 8-byte boundary 2134 __ andcc(to, 7, G0); 2135 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2136 __ delayed()->ld(from, 0, O3); 2137 __ inc(from, 4); 2138 __ inc(to, 4); 2139 __ dec(count); 2140 __ st(O3, to, -4); 2141 __ BIND(L_skip_alignment); 2142 2143 // if the arrays have the same alignment mod 8, do a 4-element copy 2144 __ andcc(from, 7, G0); 2145 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2146 __ delayed()->ld(from, 0, O3); 2147 2148 // 2149 // Load 2 aligned 8-byte chunks and use one from the previous iteration 2150 // to form 2 aligned 8-byte chunks to store. 2151 // 2152 // copy_16_bytes_forward_with_shift() is not used here since this 2153 // code is more efficient.
2154 2155 // copy with shift 4 elements (16 bytes) at a time 2156 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 2157 __ sllx(O3, 32, O3); 2158 2159 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); 2160 2161 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2162 __ delayed()->inc(count, 4); // restore 'count' 2163 2164 __ BIND(L_aligned_copy); 2165 } // !aligned 2166 2167 // copy 4 elements (16 bytes) at a time 2168 __ and3(count, 1, G4); // Save 2169 __ srl(count, 1, count); 2170 generate_disjoint_long_copy_core(aligned); 2171 __ mov(G4, count); // Restore 2172 2173 // copy 1 element at a time 2174 __ BIND(L_copy_4_bytes); 2175 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2176 __ BIND(L_copy_4_bytes_loop); 2177 __ ld(from, offset, O3); 2178 __ deccc(count); 2179 __ st(O3, to, offset); 2180 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 2181 __ delayed()->inc(offset, 4); 2182 __ BIND(L_exit); 2183 } 2184 2185 // 2186 // Generate stub for disjoint int copy. If "aligned" is true, the 2187 // "from" and "to" addresses are assumed to be heapword aligned. 2188 // 2189 // Arguments for generated stub: 2190 // from: O0 2191 // to: O1 2192 // count: O2 treated as signed 2193 // 2194 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 2195 __ align(CodeEntryAlignment); 2196 StubCodeMark mark(this, "StubRoutines", name); 2197 address start = __ pc(); 2198 2199 const Register count = O2; 2200 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2201 2202 if (entry != NULL) { 2203 *entry = __ pc(); 2204 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2205 BLOCK_COMMENT("Entry:"); 2206 } 2207 2208 generate_disjoint_int_copy_core(aligned); 2209 2210 // O3, O4 are used as temp registers 2211 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2212 __ retl(); 2213 __ delayed()->mov(G0, O0); // return 0 2214 return start; 2215 } 2216 2217 // 2218 // Generate core code for conjoint int copy (and oop copy on 32-bit). 2219 // If "aligned" is true, the "from" and "to" addresses are assumed 2220 // to be heapword aligned. 2221 // 2222 // Arguments: 2223 // from: O0 2224 // to: O1 2225 // count: O2 treated as signed 2226 // 2227 void generate_conjoint_int_copy_core(bool aligned) { 2228 // Do reverse copy.
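    // In outline (a sketch of what the code below does):
    //   end_from = from + count*4;  end_to = to + count*4;
    //   copy single elements backward until 'end_to' is 8-byte aligned;
    //   if 'end_from' is then 8-byte aligned too, copy 16-byte chunks backward;
    //   otherwise copy 16-byte chunks backward, merging shifted word halves;
    //   finish with single 4-byte elements.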
2229 2230 Label L_skip_alignment, L_aligned_copy; 2231 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2232 2233 const Register from = O0; // source array address 2234 const Register to = O1; // destination array address 2235 const Register count = O2; // elements count 2236 const Register end_from = from; // source array end address 2237 const Register end_to = to; // destination array end address 2238 // O3, O4, O5, G3 are used as temp registers 2239 2240 const Register byte_count = O3; // byte count to copy 2241 2242 __ sllx(count, LogBytesPerInt, byte_count); 2243 __ add(to, byte_count, end_to); // offset after last copied element 2244 2245 __ cmp(count, 5); // for short arrays, just do single element copy 2246 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2247 __ delayed()->add(from, byte_count, end_from); 2248 2249 // copy 1 element to align 'to' on an 8-byte boundary 2250 __ andcc(end_to, 7, G0); 2251 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2252 __ delayed()->nop(); 2253 __ dec(count); 2254 __ dec(end_from, 4); 2255 __ dec(end_to, 4); 2256 __ ld(end_from, 0, O4); 2257 __ st(O4, end_to, 0); 2258 __ BIND(L_skip_alignment); 2259 2260 // Check if 'end_from' and 'end_to' have the same alignment. 2261 __ andcc(end_from, 7, G0); 2262 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2263 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4 2264 2265 // copy with shift 4 elements (16 bytes) at a time 2266 // 2267 // Load 2 aligned 8-byte chunks and use one from the previous iteration 2268 // to form 2 aligned 8-byte chunks to store. 2269 // 2270 __ ldx(end_from, -4, O3); 2271 __ align(OptoLoopAlignment); 2272 __ BIND(L_copy_16_bytes); 2273 __ ldx(end_from, -12, O4); 2274 __ deccc(count, 4); 2275 __ ldx(end_from, -20, O5); 2276 __ dec(end_to, 16); 2277 __ dec(end_from, 16); 2278 __ srlx(O3, 32, O3); 2279 __ sllx(O4, 32, G3); 2280 __ bset(G3, O3); 2281 __ stx(O3, end_to, 8); 2282 __ srlx(O4, 32, O4); 2283 __ sllx(O5, 32, G3); 2284 __ bset(O4, G3); 2285 __ stx(G3, end_to, 0); 2286 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2287 __ delayed()->mov(O5, O3); 2288 2289 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2290 __ delayed()->inc(count, 4); 2291 2292 // copy 4 elements (16 bytes) at a time 2293 __ align(OptoLoopAlignment); 2294 __ BIND(L_aligned_copy); 2295 __ dec(end_from, 16); 2296 __ ldx(end_from, 8, O3); 2297 __ ldx(end_from, 0, O4); 2298 __ dec(end_to, 16); 2299 __ deccc(count, 4); 2300 __ stx(O3, end_to, 8); 2301 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2302 __ delayed()->stx(O4, end_to, 0); 2303 __ inc(count, 4); 2304 2305 // copy 1 element (4 bytes) at a time 2306 __ BIND(L_copy_4_bytes); 2307 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2308 __ BIND(L_copy_4_bytes_loop); 2309 __ dec(end_from, 4); 2310 __ dec(end_to, 4); 2311 __ ld(end_from, 0, O4); 2312 __ deccc(count); 2313 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 2314 __ delayed()->st(O4, end_to, 0); 2315 __ BIND(L_exit); 2316 } 2317 2318 // 2319 // Generate stub for conjoint int copy. If "aligned" is true, the 2320 // "from" and "to" addresses are assumed to be heapword aligned.
2321 // 2322 // Arguments for generated stub: 2323 // from: O0 2324 // to: O1 2325 // count: O2 treated as signed 2326 // 2327 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 2328 address *entry, const char *name) { 2329 __ align(CodeEntryAlignment); 2330 StubCodeMark mark(this, "StubRoutines", name); 2331 address start = __ pc(); 2332 2333 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2334 2335 if (entry != NULL) { 2336 *entry = __ pc(); 2337 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2338 BLOCK_COMMENT("Entry:"); 2339 } 2340 2341 array_overlap_test(nooverlap_target, 2); 2342 2343 generate_conjoint_int_copy_core(aligned); 2344 2345 // O3, O4 are used as temp registers 2346 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2347 __ retl(); 2348 __ delayed()->mov(G0, O0); // return 0 2349 return start; 2350 } 2351 2352 // 2353 // Helper methods for generate_disjoint_long_copy_core() 2354 // 2355 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2356 Label& L_loop, bool use_prefetch, bool use_bis) { 2357 __ align(OptoLoopAlignment); 2358 __ BIND(L_loop); 2359 for (int off = 0; off < 64; off += 16) { 2360 if (use_prefetch && (off & 31) == 0) { 2361 if (ArraycopySrcPrefetchDistance > 0) { 2362 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 2363 } 2364 if (ArraycopyDstPrefetchDistance > 0) { 2365 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 2366 } 2367 } 2368 __ ldx(from, off+0, O4); 2369 __ ldx(from, off+8, O5); 2370 if (use_bis) { 2371 __ stxa(O4, to, off+0); 2372 __ stxa(O5, to, off+8); 2373 } else { 2374 __ stx(O4, to, off+0); 2375 __ stx(O5, to, off+8); 2376 } 2377 } 2378 __ deccc(count, 8); 2379 __ inc(from, 64); 2380 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2381 __ delayed()->inc(to, 64); 2382 } 2383 2384 // 2385 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2386 // "aligned" is ignored, because we must make the stronger 2387 // assumption that both addresses are always 64-bit aligned. 
2388 // 2389 // Arguments: 2390 // from: O0 2391 // to: O1 2392 // count: O2 treated as signed 2393 // 2394 // count -= 2; 2395 // if ( count >= 0 ) { // >= 2 elements 2396 // if ( count > 6) { // >= 8 elements 2397 // count -= 6; // original count - 8 2398 // do { 2399 // copy_8_elements; 2400 // count -= 8; 2401 // } while ( count >= 0 ); 2402 // count += 6; 2403 // } 2404 // if ( count >= 0 ) { // >= 2 elements 2405 // do { 2406 // copy_2_elements; 2407 // } while ( (count=count-2) >= 0 ); 2408 // } 2409 // } 2410 // count += 2; 2411 // if ( count != 0 ) { // 1 element left 2412 // copy_1_element; 2413 // } 2414 // 2415 void generate_disjoint_long_copy_core(bool aligned) { 2416 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2417 const Register from = O0; // source array address 2418 const Register to = O1; // destination array address 2419 const Register count = O2; // elements count 2420 const Register offset0 = O4; // element offset 2421 const Register offset8 = O5; // next element offset 2422 2423 __ deccc(count, 2); 2424 __ mov(G0, offset0); // offset from start of arrays (0) 2425 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2426 __ delayed()->add(offset0, 8, offset8); 2427 2428 // Copy by 64 bytes chunks 2429 2430 const Register from64 = O3; // source address 2431 const Register to64 = G3; // destination address 2432 __ subcc(count, 6, O3); 2433 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2434 __ delayed()->mov(to, to64); 2435 // Now we can use O4(offset0), O5(offset8) as temps 2436 __ mov(O3, count); 2437 // count >= 0 (original count - 8) 2438 __ mov(from, from64); 2439 2440 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); 2441 2442 // Restore O4(offset0), O5(offset8) 2443 __ sub(from64, from, offset0); 2444 __ inccc(count, 6); // restore count 2445 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2446 __ delayed()->add(offset0, 8, offset8); 2447 2448 // Copy by 16 bytes chunks 2449 __ align(OptoLoopAlignment); 2450 __ BIND(L_copy_16_bytes); 2451 __ ldx(from, offset0, O3); 2452 __ ldx(from, offset8, G3); 2453 __ deccc(count, 2); 2454 __ stx(O3, to, offset0); 2455 __ inc(offset0, 16); 2456 __ stx(G3, to, offset8); 2457 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2458 __ delayed()->inc(offset8, 16); 2459 2460 // Copy last 8 bytes 2461 __ BIND(L_copy_8_bytes); 2462 __ inccc(count, 2); 2463 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2464 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 2465 __ ldx(from, offset0, O3); 2466 __ stx(O3, to, offset0); 2467 __ BIND(L_exit); 2468 } 2469 2470 // 2471 // Generate stub for disjoint long copy. 2472 // "aligned" is ignored, because we must make the stronger 2473 // assumption that both addresses are always 64-bit aligned. 2474 // 2475 // Arguments for generated stub: 2476 // from: O0 2477 // to: O1 2478 // count: O2 treated as signed 2479 // 2480 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2481 __ align(CodeEntryAlignment); 2482 StubCodeMark mark(this, "StubRoutines", name); 2483 address start = __ pc(); 2484 2485 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 
2486 2487 if (entry != NULL) { 2488 *entry = __ pc(); 2489 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2490 BLOCK_COMMENT("Entry:"); 2491 } 2492 2493 generate_disjoint_long_copy_core(aligned); 2494 2495 // O3, O4 are used as temp registers 2496 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2497 __ retl(); 2498 __ delayed()->mov(G0, O0); // return 0 2499 return start; 2500 } 2501 2502 // 2503 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2504 // "aligned" is ignored, because we must make the stronger 2505 // assumption that both addresses are always 64-bit aligned. 2506 // 2507 // Arguments: 2508 // from: O0 2509 // to: O1 2510 // count: O2 treated as signed 2511 // 2512 void generate_conjoint_long_copy_core(bool aligned) { 2513 // Do reverse copy. 2514 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2515 const Register from = O0; // source array address 2516 const Register to = O1; // destination array address 2517 const Register count = O2; // elements count 2518 const Register offset8 = O4; // element offset 2519 const Register offset0 = O5; // previous element offset 2520 2521 __ subcc(count, 1, count); 2522 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2523 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2524 __ sub(offset8, 8, offset0); 2525 __ align(OptoLoopAlignment); 2526 __ BIND(L_copy_16_bytes); 2527 __ ldx(from, offset8, O2); 2528 __ ldx(from, offset0, O3); 2529 __ stx(O2, to, offset8); 2530 __ deccc(offset8, 16); // use offset8 as counter 2531 __ stx(O3, to, offset0); 2532 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2533 __ delayed()->dec(offset0, 16); 2534 2535 __ BIND(L_copy_8_bytes); 2536 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2537 __ delayed()->nop(); 2538 __ ldx(from, 0, O3); 2539 __ stx(O3, to, 0); 2540 __ BIND(L_exit); 2541 } 2542 2543 // Generate stub for conjoint long copy. 2544 // "aligned" is ignored, because we must make the stronger 2545 // assumption that both addresses are always 64-bit aligned. 2546 // 2547 // Arguments for generated stub: 2548 // from: O0 2549 // to: O1 2550 // count: O2 treated as signed 2551 // 2552 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2553 address *entry, const char *name) { 2554 __ align(CodeEntryAlignment); 2555 StubCodeMark mark(this, "StubRoutines", name); 2556 address start = __ pc(); 2557 2558 assert(aligned, "Should always be aligned"); 2559 2560 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2561 2562 if (entry != NULL) { 2563 *entry = __ pc(); 2564 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2565 BLOCK_COMMENT("Entry:"); 2566 } 2567 2568 array_overlap_test(nooverlap_target, 3); 2569 2570 generate_conjoint_long_copy_core(aligned); 2571 2572 // O3, O4 are used as temp registers 2573 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2574 __ retl(); 2575 __ delayed()->mov(G0, O0); // return 0 2576 return start; 2577 } 2578 2579 // Generate stub for disjoint oop copy. If "aligned" is true, the 2580 // "from" and "to" addresses are assumed to be heapword aligned. 
2581 // 2582 // Arguments for generated stub: 2583 // from: O0 2584 // to: O1 2585 // count: O2 treated as signed 2586 // 2587 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2588 bool dest_uninitialized = false) { 2589 2590 const Register from = O0; // source array address 2591 const Register to = O1; // destination array address 2592 const Register count = O2; // elements count 2593 2594 __ align(CodeEntryAlignment); 2595 StubCodeMark mark(this, "StubRoutines", name); 2596 address start = __ pc(); 2597 2598 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2599 2600 if (entry != NULL) { 2601 *entry = __ pc(); 2602 // caller can pass a 64-bit byte count here 2603 BLOCK_COMMENT("Entry:"); 2604 } 2605 2606 // save arguments for barrier generation 2607 __ mov(to, G1); 2608 __ mov(count, G5); 2609 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2610 #ifdef _LP64 2611 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2612 if (UseCompressedOops) { 2613 generate_disjoint_int_copy_core(aligned); 2614 } else { 2615 generate_disjoint_long_copy_core(aligned); 2616 } 2617 #else 2618 generate_disjoint_int_copy_core(aligned); 2619 #endif 2620 // O0 is used as temp register 2621 gen_write_ref_array_post_barrier(G1, G5, O0); 2622 2623 // O3, O4 are used as temp registers 2624 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2625 __ retl(); 2626 __ delayed()->mov(G0, O0); // return 0 2627 return start; 2628 } 2629 2630 // Generate stub for conjoint oop copy. If "aligned" is true, the 2631 // "from" and "to" addresses are assumed to be heapword aligned. 2632 // 2633 // Arguments for generated stub: 2634 // from: O0 2635 // to: O1 2636 // count: O2 treated as signed 2637 // 2638 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2639 address *entry, const char *name, 2640 bool dest_uninitialized = false) { 2641 2642 const Register from = O0; // source array address 2643 const Register to = O1; // destination array address 2644 const Register count = O2; // elements count 2645 2646 __ align(CodeEntryAlignment); 2647 StubCodeMark mark(this, "StubRoutines", name); 2648 address start = __ pc(); 2649 2650 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2651 2652 if (entry != NULL) { 2653 *entry = __ pc(); 2654 // caller can pass a 64-bit byte count here 2655 BLOCK_COMMENT("Entry:"); 2656 } 2657 2658 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2659 2660 // save arguments for barrier generation 2661 __ mov(to, G1); 2662 __ mov(count, G5); 2663 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2664 2665 #ifdef _LP64 2666 if (UseCompressedOops) { 2667 generate_conjoint_int_copy_core(aligned); 2668 } else { 2669 generate_conjoint_long_copy_core(aligned); 2670 } 2671 #else 2672 generate_conjoint_int_copy_core(aligned); 2673 #endif 2674 2675 // O0 is used as temp register 2676 gen_write_ref_array_post_barrier(G1, G5, O0); 2677 2678 // O3, O4 are used as temp registers 2679 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2680 __ retl(); 2681 __ delayed()->mov(G0, O0); // return 0 2682 return start; 2683 } 2684 2685 2686 // Helper for generating a dynamic type check. 2687 // Smashes only the given temp registers. 
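  // In outline (a hedged sketch of the check emitted below; see
  // MacroAssembler::check_klass_subtype_fast_path/_slow_path for the real
  // protocol):
  //   if (sub_klass == super_klass) goto L_success;                // exact match
  //   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;
  //   otherwise scan sub_klass's secondary supers in a fresh register
  //   window (slow path); fall through on a miss.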
2688 void generate_type_check(Register sub_klass, 2689 Register super_check_offset, 2690 Register super_klass, 2691 Register temp, 2692 Label& L_success) { 2693 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2694 2695 BLOCK_COMMENT("type_check:"); 2696 2697 Label L_miss, L_pop_to_miss; 2698 2699 assert_clean_int(super_check_offset, temp); 2700 2701 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2702 &L_success, &L_miss, NULL, 2703 super_check_offset); 2704 2705 BLOCK_COMMENT("type_check_slow_path:"); 2706 __ save_frame(0); 2707 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2708 super_klass->after_save(), 2709 L0, L1, L2, L4, 2710 NULL, &L_pop_to_miss); 2711 __ ba(L_success); 2712 __ delayed()->restore(); 2713 2714 __ bind(L_pop_to_miss); 2715 __ restore(); 2716 2717 // Fall through on failure! 2718 __ BIND(L_miss); 2719 } 2720 2721 2722 // Generate stub for checked oop copy. 2723 // 2724 // Arguments for generated stub: 2725 // from: O0 2726 // to: O1 2727 // count: O2 treated as signed 2728 // ckoff: O3 (super_check_offset) 2729 // ckval: O4 (super_klass) 2730 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2731 // 2732 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2733 2734 const Register O0_from = O0; // source array address 2735 const Register O1_to = O1; // destination array address 2736 const Register O2_count = O2; // elements count 2737 const Register O3_ckoff = O3; // super_check_offset 2738 const Register O4_ckval = O4; // super_klass 2739 2740 const Register O5_offset = O5; // loop var, with stride wordSize 2741 const Register G1_remain = G1; // loop var, with stride -1 2742 const Register G3_oop = G3; // actual oop copied 2743 const Register G4_klass = G4; // oop._klass 2744 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2745 2746 __ align(CodeEntryAlignment); 2747 StubCodeMark mark(this, "StubRoutines", name); 2748 address start = __ pc(); 2749 2750 #ifdef ASSERT 2751 // We sometimes save a frame (see generate_type_check below). 2752 // If this will cause trouble, let's fail now instead of later. 2753 __ save_frame(0); 2754 __ restore(); 2755 #endif 2756 2757 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 2758 2759 #ifdef ASSERT 2760 // caller guarantees that the arrays really are different 2761 // otherwise, we would have to make conjoint checks 2762 { Label L; 2763 __ mov(O3, G1); // spill: overlap test smashes O3 2764 __ mov(O4, G4); // spill: overlap test smashes O4 2765 array_overlap_test(L, LogBytesPerHeapOop); 2766 __ stop("checkcast_copy within a single array"); 2767 __ bind(L); 2768 __ mov(G1, O3); 2769 __ mov(G4, O4); 2770 } 2771 #endif //ASSERT 2772 2773 if (entry != NULL) { 2774 *entry = __ pc(); 2775 // caller can pass a 64-bit byte count here (from generic stub) 2776 BLOCK_COMMENT("Entry:"); 2777 } 2778 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 2779 2780 Label load_element, store_element, do_card_marks, fail, done; 2781 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2782 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2783 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2784 2785 // Empty array: Nothing to do. 
2786 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2787 __ retl(); 2788 __ delayed()->set(0, O0); // return 0 on (trivial) success 2789 2790 // ======== begin loop ======== 2791 // (Loop is rotated; its entry is load_element.) 2792 // Loop variables: 2793 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2794 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2795 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2796 __ align(OptoLoopAlignment); 2797 2798 __ BIND(store_element); 2799 __ deccc(G1_remain); // decrement the count 2800 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2801 __ inc(O5_offset, heapOopSize); // step to next offset 2802 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 2803 __ delayed()->set(0, O0); // return 0 on success 2804 2805 // ======== loop entry is here ======== 2806 __ BIND(load_element); 2807 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2808 __ br_null_short(G3_oop, Assembler::pt, store_element); 2809 2810 __ load_klass(G3_oop, G4_klass); // query the object klass 2811 2812 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2813 // branch to this on success: 2814 store_element); 2815 // ======== end loop ======== 2816 2817 // It was a real error; we must depend on the caller to finish the job. 2818 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2819 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2820 // and report their number to the caller. 2821 __ BIND(fail); 2822 __ subcc(O2_count, G1_remain, O2_count); 2823 __ brx(Assembler::zero, false, Assembler::pt, done); 2824 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2825 2826 __ BIND(do_card_marks); 2827 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 2828 2829 __ BIND(done); 2830 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2831 __ retl(); 2832 __ delayed()->nop(); // return value in O0 2833 2834 return start; 2835 } 2836 2837 2838 // Generate 'unsafe' array copy stub 2839 // Though just as safe as the other stubs, it takes an unscaled 2840 // size_t argument instead of an element count. 2841 // 2842 // Arguments for generated stub: 2843 // from: O0 2844 // to: O1 2845 // count: O2 byte count, treated as ssize_t, can be zero 2846 // 2847 // Examines the alignment of the operands and dispatches 2848 // to a long, int, short, or byte copy loop.
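  // Dispatch outline (a C-like sketch of the stub below):
  //   bits = (uintptr_t)from | (uintptr_t)to | (size_t)count;
  //   if ((bits & (BytesPerLong-1))  == 0) tail-call the jlong copy (count >> 3);
  //   if ((bits & (BytesPerInt-1))   == 0) tail-call the jint copy  (count >> 2);
  //   if ((bits & (BytesPerShort-1)) == 0) tail-call the jshort copy(count >> 1);
  //   otherwise                            tail-call the jbyte copy (count).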
2849 // 2850 address generate_unsafe_copy(const char* name, 2851 address byte_copy_entry, 2852 address short_copy_entry, 2853 address int_copy_entry, 2854 address long_copy_entry) { 2855 2856 const Register O0_from = O0; // source array address 2857 const Register O1_to = O1; // destination array address 2858 const Register O2_count = O2; // byte count (scaled to elements on dispatch) 2859 2860 const Register G1_bits = G1; // test copy of low bits 2861 2862 __ align(CodeEntryAlignment); 2863 StubCodeMark mark(this, "StubRoutines", name); 2864 address start = __ pc(); 2865 2866 // bump this on entry, not on exit: 2867 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2868 2869 __ or3(O0_from, O1_to, G1_bits); 2870 __ or3(O2_count, G1_bits, G1_bits); 2871 2872 __ btst(BytesPerLong-1, G1_bits); 2873 __ br(Assembler::zero, true, Assembler::pt, 2874 long_copy_entry, relocInfo::runtime_call_type); 2875 // scale the count on the way out: 2876 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2877 2878 __ btst(BytesPerInt-1, G1_bits); 2879 __ br(Assembler::zero, true, Assembler::pt, 2880 int_copy_entry, relocInfo::runtime_call_type); 2881 // scale the count on the way out: 2882 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2883 2884 __ btst(BytesPerShort-1, G1_bits); 2885 __ br(Assembler::zero, true, Assembler::pt, 2886 short_copy_entry, relocInfo::runtime_call_type); 2887 // scale the count on the way out: 2888 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2889 2890 __ br(Assembler::always, false, Assembler::pt, 2891 byte_copy_entry, relocInfo::runtime_call_type); 2892 __ delayed()->nop(); 2893 2894 return start; 2895 } 2896 2897 2898 // Perform range checks on the proposed arraycopy. 2899 // Kills the two temps, but nothing else. 2900 // Also, clean the sign bits of src_pos and dst_pos. 2901 void arraycopy_range_checks(Register src, // source array oop (O0) 2902 Register src_pos, // source position (O1) 2903 Register dst, // destination array oop (O2) 2904 Register dst_pos, // destination position (O3) 2905 Register length, // length of copy (O4) 2906 Register temp1, Register temp2, 2907 Label& L_failed) { 2908 BLOCK_COMMENT("arraycopy_range_checks:"); 2909 2910 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2911 2912 const Register array_length = temp1; // scratch 2913 const Register end_pos = temp2; // scratch 2914 2915 // Note: This next instruction may be in the delay slot of a branch: 2916 __ add(length, src_pos, end_pos); // src_pos + length 2917 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2918 __ cmp(end_pos, array_length); 2919 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2920 2921 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2922 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2923 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2924 __ cmp(end_pos, array_length); 2925 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2926 2927 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2928 // Move with sign extension can be used since they are positive.
2929 __ delayed()->signx(src_pos, src_pos); 2930 __ signx(dst_pos, dst_pos); 2931 2932 BLOCK_COMMENT("arraycopy_range_checks done"); 2933 } 2934 2935 2936 // 2937 // Generate generic array copy stubs 2938 // 2939 // Input: 2940 // O0 - src oop 2941 // O1 - src_pos 2942 // O2 - dst oop 2943 // O3 - dst_pos 2944 // O4 - element count 2945 // 2946 // Output: 2947 // O0 == 0 - success 2948 // O0 == -1 - need to call System.arraycopy 2949 // 2950 address generate_generic_copy(const char *name, 2951 address entry_jbyte_arraycopy, 2952 address entry_jshort_arraycopy, 2953 address entry_jint_arraycopy, 2954 address entry_oop_arraycopy, 2955 address entry_jlong_arraycopy, 2956 address entry_checkcast_arraycopy) { 2957 Label L_failed, L_objArray; 2958 2959 // Input registers 2960 const Register src = O0; // source array oop 2961 const Register src_pos = O1; // source position 2962 const Register dst = O2; // destination array oop 2963 const Register dst_pos = O3; // destination position 2964 const Register length = O4; // elements count 2965 2966 // registers used as temp 2967 const Register G3_src_klass = G3; // source array klass 2968 const Register G4_dst_klass = G4; // destination array klass 2969 const Register G5_lh = G5; // layout helper 2970 const Register O5_temp = O5; 2971 2972 __ align(CodeEntryAlignment); 2973 StubCodeMark mark(this, "StubRoutines", name); 2974 address start = __ pc(); 2975 2976 // bump this on entry, not on exit: 2977 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2978 2979 // In principle, the int arguments could be dirty. 2980 //assert_clean_int(src_pos, G1); 2981 //assert_clean_int(dst_pos, G1); 2982 //assert_clean_int(length, G1); 2983 2984 //----------------------------------------------------------------------- 2985 // Assembler stubs will be used for this call to arraycopy 2986 // if the following conditions are met: 2987 // 2988 // (1) src and dst must not be null. 2989 // (2) src_pos must not be negative. 2990 // (3) dst_pos must not be negative. 2991 // (4) length must not be negative. 2992 // (5) src klass and dst klass should be the same and not NULL. 2993 // (6) src and dst should be arrays. 2994 // (7) src_pos + length must not exceed length of src. 2995 // (8) dst_pos + length must not exceed length of dst. 2996 BLOCK_COMMENT("arraycopy initial argument checks"); 2997 2998 // if (src == NULL) return -1; 2999 __ br_null(src, false, Assembler::pn, L_failed); 3000 3001 // if (src_pos < 0) return -1; 3002 __ delayed()->tst(src_pos); 3003 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3004 __ delayed()->nop(); 3005 3006 // if (dst == NULL) return -1; 3007 __ br_null(dst, false, Assembler::pn, L_failed); 3008 3009 // if (dst_pos < 0) return -1; 3010 __ delayed()->tst(dst_pos); 3011 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3012 3013 // if (length < 0) return -1; 3014 __ delayed()->tst(length); 3015 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3016 3017 BLOCK_COMMENT("arraycopy argument klass checks"); 3018 // get src->klass() 3019 if (UseCompressedKlassPointers) { 3020 __ delayed()->nop(); // load_klass expands to several instructions, so it cannot go in the delay slot 3021 __ load_klass(src, G3_src_klass); 3022 } else { 3023 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 3024 } 3025 3026 #ifdef ASSERT 3027 // assert(src->klass() != NULL); 3028 BLOCK_COMMENT("assert klasses not null"); 3029 { Label L_a, L_b; 3030 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 3031 __ bind(L_a); 3032 __ stop("broken null klass"); 3033 __ bind(L_b); 3034 __ load_klass(dst, G4_dst_klass); 3035 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 3036 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 3037 BLOCK_COMMENT("assert done"); 3038 } 3039 #endif 3040 3041 // Load layout helper 3042 // 3043 // |array_tag| | header_size | element_type | |log2_element_size| 3044 // 32 30 24 16 8 2 0 3045 // 3046 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 3047 // 3048 3049 int lh_offset = in_bytes(Klass::layout_helper_offset()); 3050 3051 // Load the 32-bit signed value. Use the br() instruction with it to check icc. 3052 __ lduw(G3_src_klass, lh_offset, G5_lh); 3053 3054 if (UseCompressedKlassPointers) { 3055 __ load_klass(dst, G4_dst_klass); 3056 } 3057 // Handle objArrays completely differently... 3058 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 3059 __ set(objArray_lh, O5_temp); 3060 __ cmp(G5_lh, O5_temp); 3061 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 3062 if (UseCompressedKlassPointers) { 3063 __ delayed()->nop(); 3064 } else { 3065 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 3066 } 3067 3068 // if (src->klass() != dst->klass()) return -1; 3069 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 3070 3071 // if (!src->is_Array()) return -1; 3072 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 3073 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 3074 3075 // At this point, it is known to be a typeArray (array_tag 0x3).
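    // The typeArray path below decodes the layout helper roughly as follows
    // (an outline of the code that comes next, not new logic):
    //   offset = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   elsize = lh & _lh_log2_element_size_mask;     // log2(element size)
    //   from = src + offset + (src_pos << elsize);
    //   to   = dst + offset + (dst_pos << elsize);
    // then dispatches on 'elsize' to the byte/short/int/long copy entry.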
3076 #ifdef ASSERT 3077 __ delayed()->nop(); 3078 { Label L; 3079 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 3080 __ set(lh_prim_tag_in_place, O5_temp); 3081 __ cmp(G5_lh, O5_temp); 3082 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 3083 __ delayed()->nop(); 3084 __ stop("must be a primitive array"); 3085 __ bind(L); 3086 } 3087 #else 3088 __ delayed(); // match next insn to prev branch 3089 #endif 3090 3091 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3092 O5_temp, G4_dst_klass, L_failed); 3093 3094 // TypeArrayKlass 3095 // 3096 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3097 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3098 // 3099 3100 const Register G4_offset = G4_dst_klass; // array offset 3101 const Register G3_elsize = G3_src_klass; // log2 element size 3102 3103 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 3104 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 3105 __ add(src, G4_offset, src); // src array offset 3106 __ add(dst, G4_offset, dst); // dst array offset 3107 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 3108 3109 // next registers should be set before the jump to corresponding stub 3110 const Register from = O0; // source array address 3111 const Register to = O1; // destination array address 3112 const Register count = O2; // elements count 3113 3114 // 'from', 'to', 'count' registers should be set in this order 3115 // since they are the same as 'src', 'src_pos', 'dst'. 3116 3117 BLOCK_COMMENT("scale indexes to element size"); 3118 __ sll_ptr(src_pos, G3_elsize, src_pos); 3119 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 3120 __ add(src, src_pos, from); // src_addr 3121 __ add(dst, dst_pos, to); // dst_addr 3122 3123 BLOCK_COMMENT("choose copy loop based on element size"); 3124 __ cmp(G3_elsize, 0); 3125 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 3126 __ delayed()->signx(length, count); // length 3127 3128 __ cmp(G3_elsize, LogBytesPerShort); 3129 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 3130 __ delayed()->signx(length, count); // length 3131 3132 __ cmp(G3_elsize, LogBytesPerInt); 3133 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 3134 __ delayed()->signx(length, count); // length 3135 #ifdef ASSERT 3136 { Label L; 3137 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 3138 __ stop("must be long copy, but elsize is wrong"); 3139 __ bind(L); 3140 } 3141 #endif 3142 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 3143 __ delayed()->signx(length, count); // length 3144 3145 // ObjArrayKlass 3146 __ BIND(L_objArray); 3147 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 3148 3149 Label L_plain_copy, L_checkcast_copy; 3150 // test array classes for subtyping 3151 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 3152 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 3153 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 3154 3155 // Identically typed arrays can be copied without element-wise checks. 
3156 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3157 O5_temp, G5_lh, L_failed); 3158 3159 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3160 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3161 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3162 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3163 __ add(src, src_pos, from); // src_addr 3164 __ add(dst, dst_pos, to); // dst_addr 3165 __ BIND(L_plain_copy); 3166 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 3167 __ delayed()->signx(length, count); // length 3168 3169 __ BIND(L_checkcast_copy); 3170 // live at this point: G3_src_klass, G4_dst_klass 3171 { 3172 // Before looking at dst.length, make sure dst is also an objArray. 3173 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 3174 __ cmp(G5_lh, O5_temp); 3175 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 3176 3177 // It is safe to examine both src.length and dst.length. 3178 __ delayed(); // match next insn to prev branch 3179 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3180 O5_temp, G5_lh, L_failed); 3181 3182 // Marshal the base address arguments now, freeing registers. 3183 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3184 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3185 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3186 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3187 __ add(src, src_pos, from); // src_addr 3188 __ add(dst, dst_pos, to); // dst_addr 3189 __ signx(length, count); // length (reloaded) 3190 3191 Register sco_temp = O3; // this register is free now 3192 assert_different_registers(from, to, count, sco_temp, 3193 G4_dst_klass, G3_src_klass); 3194 3195 // Generate the type check. 3196 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3197 __ lduw(G4_dst_klass, sco_offset, sco_temp); 3198 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 3199 O5_temp, L_plain_copy); 3200 3201 // Fetch destination element klass from the ObjArrayKlass header. 3202 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3203 3204 // the checkcast_copy loop needs two extra arguments: 3205 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 3206 // lduw(O4, sco_offset, O3); // sco of elem klass 3207 3208 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 3209 __ delayed()->lduw(O4, sco_offset, O3); 3210 } 3211 3212 __ BIND(L_failed); 3213 __ retl(); 3214 __ delayed()->sub(G0, 1, O0); // return -1 3215 return start; 3216 } 3217 3218 // 3219 // Generate stub for heap zeroing. 3220 // "to" address is aligned to jlong (8 bytes). 
3221 // 3222 // Arguments for generated stub: 3223 // to: O0 3224 // count: O1 treated as signed (count of HeapWords) 3225 // count could be 0 3226 // 3227 address generate_zero_aligned_words(const char* name) { 3228 __ align(CodeEntryAlignment); 3229 StubCodeMark mark(this, "StubRoutines", name); 3230 address start = __ pc(); 3231 3232 const Register to = O0; // destination address 3233 const Register count = O1; // HeapWords count 3234 const Register temp = O2; // scratch 3235 3236 Label Ldone; 3237 __ sllx(count, LogHeapWordSize, count); // to bytes count 3238 // Use BIS for zeroing 3239 __ bis_zeroing(to, count, temp, Ldone); 3240 __ bind(Ldone); 3241 __ retl(); 3242 __ delayed()->nop(); 3243 return start; 3244 } 3245 3246 void generate_arraycopy_stubs() { 3247 address entry; 3248 address entry_jbyte_arraycopy; 3249 address entry_jshort_arraycopy; 3250 address entry_jint_arraycopy; 3251 address entry_oop_arraycopy; 3252 address entry_jlong_arraycopy; 3253 address entry_checkcast_arraycopy; 3254 3255 //*** jbyte 3256 // Always need aligned and unaligned versions 3257 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3258 "jbyte_disjoint_arraycopy"); 3259 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 3260 &entry_jbyte_arraycopy, 3261 "jbyte_arraycopy"); 3262 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 3263 "arrayof_jbyte_disjoint_arraycopy"); 3264 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 3265 "arrayof_jbyte_arraycopy"); 3266 3267 //*** jshort 3268 // Always need aligned and unaligned versions 3269 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3270 "jshort_disjoint_arraycopy"); 3271 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 3272 &entry_jshort_arraycopy, 3273 "jshort_arraycopy"); 3274 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 3275 "arrayof_jshort_disjoint_arraycopy"); 3276 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 3277 "arrayof_jshort_arraycopy"); 3278 3279 //*** jint 3280 // Aligned versions 3281 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 3282 "arrayof_jint_disjoint_arraycopy"); 3283 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 3284 "arrayof_jint_arraycopy"); 3285 #ifdef _LP64 3286 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 3287 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it). 3288 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 3289 "jint_disjoint_arraycopy"); 3290 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 3291 &entry_jint_arraycopy, 3292 "jint_arraycopy"); 3293 #else 3294 // In the 32-bit VM jints are always HeapWordSize aligned, so always use the aligned version 3295 // (in fact in the 32-bit VM we always have a pre-loop part even in the aligned version, 3296 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3297 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; 3298 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; 3299 #endif 3300 3301 3302 //*** jlong 3303 // It is always aligned 3304 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 3305 "arrayof_jlong_disjoint_arraycopy"); 3306 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 3307 "arrayof_jlong_arraycopy"); 3308 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 3309 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 3310 3311 3312 //*** oops 3313 // Aligned versions 3314 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 3315 "arrayof_oop_disjoint_arraycopy"); 3316 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 3317 "arrayof_oop_arraycopy"); 3318 // Aligned versions without pre-barriers 3319 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 3320 "arrayof_oop_disjoint_arraycopy_uninit", 3321 /*dest_uninitialized*/true); 3322 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 3323 "arrayof_oop_arraycopy_uninit", 3324 /*dest_uninitialized*/true); 3325 #ifdef _LP64 3326 if (UseCompressedOops) { 3327 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 3328 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 3329 "oop_disjoint_arraycopy"); 3330 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 3331 "oop_arraycopy"); 3332 // Unaligned versions without pre-barriers 3333 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 3334 "oop_disjoint_arraycopy_uninit", 3335 /*dest_uninitialized*/true); 3336 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3337 "oop_arraycopy_uninit", 3338 /*dest_uninitialized*/true); 3339 } else 3340 #endif 3341 { 3342 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3343 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3344 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3345 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3346 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3347 } 3348 3349 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3350 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3351 /*dest_uninitialized*/true); 3352 3353 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3354 entry_jbyte_arraycopy, 3355 entry_jshort_arraycopy, 3356 entry_jint_arraycopy, 3357 entry_jlong_arraycopy); 3358 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3359 entry_jbyte_arraycopy, 3360 entry_jshort_arraycopy, 3361 entry_jint_arraycopy, 3362 entry_oop_arraycopy, 3363 entry_jlong_arraycopy, 3364 entry_checkcast_arraycopy); 3365 3366 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3367 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3368 

  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
    // than the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
    StubRoutines::_atomic_add_entry          = generate_atomic_add();
    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
#endif  // COMPILER2 !=> _LP64

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  }


  void generate_all() {
    // Generates the remaining stubs and initializes their entry points

    // Generate partial_subtype_check first here since its code depends on
    // UseZeroBaseCompressedOops which is defined after heap initialization.
    StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Don't initialize the platform math functions since sparc
    // doesn't have intrinsics for these operations.
  }
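
  // Editor's note: a sketch of the two-phase generation protocol, assuming
  // the standard driver in stubRoutines.cpp (see also the comment in
  // stubRoutines.hpp). The sequence shown is illustrative:
  //
  //   // early in VM startup, before universe/heap initialization:
  //   StubGenerator_generate(code1, /*all=*/false);   // runs generate_initial()
  //   // after universe_init(), once flags like UseCompressedOops and
  //   // UseZeroBaseCompressedOops are finalized:
  //   StubGenerator_generate(code2, /*all=*/true);    // runs generate_all()
  //
  // This ordering is why generate_all() can safely emit stubs, such as
  // partial_subtype_check and the oop arraycopy family, that depend on
  // heap-dependent flags.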

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
  }


 private:
  int _stub_count;
  void stub_prolog(StubCodeDesc* cdesc) {
# ifdef ASSERT
    // put extra information in the stub code, to make it more readable
#ifdef _LP64
    // Write the high part of the address
    // [RGV] Check if there is a dependency on the size of this prolog
    __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
#endif
    __ emit_data((intptr_t)cdesc, relocInfo::none);
    __ emit_data(++_stub_count, relocInfo::none);
# endif
    align(true);
  }

  void align(bool at_header = false) {
    // %%%%% move this constant somewhere else
    // UltraSPARC cache line size is 8 instructions:
    const unsigned int icache_line_size = 32;
    const unsigned int icache_half_line_size = 16;

    if (at_header) {
      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
        __ emit_data(0, relocInfo::none);
      }
    } else {
      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
        __ nop();
      }
    }
  }

}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
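
// Editor's note on StubGenerator::align() above: padding proceeds one
// 4-byte word at a time until the pc reaches a 32-byte line boundary (at
// stub headers) or a 16-byte half-line boundary (elsewhere). The number of
// padding slots can be worked out directly; a hypothetical helper:
//
//   unsigned int padding_words(uintptr_t pc, unsigned int line_size) {
//     unsigned int rem = (unsigned int)(pc % line_size);
//     return rem == 0 ? 0 : (line_size - rem) / sizeof(int);
//   }
//
// For example, pc % 32 == 24 leaves 8 bytes to the next line, i.e. two
// 4-byte padding slots.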