1 /* 2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/macroAssembler.inline.hpp" 27 #include "interpreter/interpreter.hpp" 28 #include "nativeInst_sparc.hpp" 29 #include "oops/instanceOop.hpp" 30 #include "oops/method.hpp" 31 #include "oops/objArrayKlass.hpp" 32 #include "oops/oop.inline.hpp" 33 #include "prims/methodHandles.hpp" 34 #include "runtime/frame.inline.hpp" 35 #include "runtime/handles.inline.hpp" 36 #include "runtime/sharedRuntime.hpp" 37 #include "runtime/stubCodeGenerator.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "runtime/thread.inline.hpp" 40 #include "utilities/top.hpp" 41 #ifdef COMPILER2 42 #include "opto/runtime.hpp" 43 #endif 44 45 // Declaration and definition of StubGenerator (no .hpp file). 46 // For a more detailed description of the stub routine structure 47 // see the comment in stubRoutines.hpp. 48 49 #define __ _masm-> 50 51 #ifdef PRODUCT 52 #define BLOCK_COMMENT(str) /* nothing */ 53 #else 54 #define BLOCK_COMMENT(str) __ block_comment(str) 55 #endif 56 57 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 58 59 // Note: The register L7 is used as L7_thread_cache, and may not be used 60 // any other way within this module. 61 62 63 static const Register& Lstub_temp = L2; 64 65 // ------------------------------------------------------------------------------------------------------------------------- 66 // Stub Code definitions 67 68 static address handle_unsafe_access() { 69 JavaThread* thread = JavaThread::current(); 70 address pc = thread->saved_exception_pc(); 71 address npc = thread->saved_exception_npc(); 72 // pc is the instruction which we must emulate 73 // doing a no-op is fine: return garbage from the load 74 75 // request an async exception 76 thread->set_pending_unsafe_access_error(); 77 78 // return address of next instruction to execute 79 return npc; 80 } 81 82 class StubGenerator: public StubCodeGenerator { 83 private: 84 85 #ifdef PRODUCT 86 #define inc_counter_np(a,b,c) 87 #else 88 #define inc_counter_np(counter, t1, t2) \ 89 BLOCK_COMMENT("inc_counter " #counter); \ 90 __ inc_counter(&counter, t1, t2); 91 #endif 92 93 //---------------------------------------------------------------------------------------------------- 94 // Call stubs are used to call Java from C 95 96 address generate_call_stub(address& return_pc) { 97 StubCodeMark mark(this, "StubRoutines", "call_stub"); 98 address start = __ pc(); 99 100 // Incoming arguments: 101 // 102 // o0 : call wrapper address 103 // o1 : result (address) 104 // o2 : result type 105 // o3 : method 106 // o4 : (interpreter) entry point 107 // o5 : parameters (address) 108 // [sp + 0x5c]: parameter size (in words) 109 // [sp + 0x60]: thread 110 // 111 // +---------------+ <--- sp + 0 112 // | | 113 // . reg save area . 114 // | | 115 // +---------------+ <--- sp + 0x40 116 // | | 117 // . extra 7 slots . 118 // | | 119 // +---------------+ <--- sp + 0x5c 120 // | param. size | 121 // +---------------+ <--- sp + 0x60 122 // | thread | 123 // +---------------+ 124 // | | 125 126 // note: if the link argument position changes, adjust 127 // the code in frame::entry_frame_call_wrapper() 128 129 const Argument link = Argument(0, false); // used only for GC 130 const Argument result = Argument(1, false); 131 const Argument result_type = Argument(2, false); 132 const Argument method = Argument(3, false); 133 const Argument entry_point = Argument(4, false); 134 const Argument parameters = Argument(5, false); 135 const Argument parameter_size = Argument(6, false); 136 const Argument thread = Argument(7, false); 137 138 // setup thread register 139 __ ld_ptr(thread.as_address(), G2_thread); 140 __ reinit_heapbase(); 141 142 #ifdef ASSERT 143 // make sure we have no pending exceptions 144 { const Register t = G3_scratch; 145 Label L; 146 __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t); 147 __ br_null_short(t, Assembler::pt, L); 148 __ stop("StubRoutines::call_stub: entered with pending exception"); 149 __ bind(L); 150 } 151 #endif 152 153 // create activation frame & allocate space for parameters 154 { const Register t = G3_scratch; 155 __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words) 156 __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words) 157 __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words) 158 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes 159 __ neg(t); // negate so it can be used with save 160 __ save(SP, t, SP); // setup new frame 161 } 162 163 // +---------------+ <--- sp + 0 164 // | | 165 // . reg save area . 166 // | | 167 // +---------------+ <--- sp + 0x40 168 // | | 169 // . extra 7 slots . 170 // | | 171 // +---------------+ <--- sp + 0x5c 172 // | empty slot | (only if parameter size is even) 173 // +---------------+ 174 // | | 175 // . parameters . 176 // | | 177 // +---------------+ <--- fp + 0 178 // | | 179 // . reg save area . 180 // | | 181 // +---------------+ <--- fp + 0x40 182 // | | 183 // . extra 7 slots . 184 // | | 185 // +---------------+ <--- fp + 0x5c 186 // | param. size | 187 // +---------------+ <--- fp + 0x60 188 // | thread | 189 // +---------------+ 190 // | | 191 192 // pass parameters if any 193 BLOCK_COMMENT("pass parameters if any"); 194 { const Register src = parameters.as_in().as_register(); 195 const Register dst = Lentry_args; 196 const Register tmp = G3_scratch; 197 const Register cnt = G4_scratch; 198 199 // test if any parameters & setup of Lentry_args 200 Label exit; 201 __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter 202 __ add( FP, STACK_BIAS, dst ); 203 __ cmp_zero_and_br(Assembler::zero, cnt, exit); 204 __ delayed()->sub(dst, BytesPerWord, dst); // setup Lentry_args 205 206 // copy parameters if any 207 Label loop; 208 __ BIND(loop); 209 // Store parameter value 210 __ ld_ptr(src, 0, tmp); 211 __ add(src, BytesPerWord, src); 212 __ st_ptr(tmp, dst, 0); 213 __ deccc(cnt); 214 __ br(Assembler::greater, false, Assembler::pt, loop); 215 __ delayed()->sub(dst, Interpreter::stackElementSize, dst); 216 217 // done 218 __ BIND(exit); 219 } 220 221 // setup parameters, method & call Java function 222 #ifdef ASSERT 223 // layout_activation_impl checks it's notion of saved SP against 224 // this register, so if this changes update it as well. 225 const Register saved_SP = Lscratch; 226 __ mov(SP, saved_SP); // keep track of SP before call 227 #endif 228 229 // setup parameters 230 const Register t = G3_scratch; 231 __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words) 232 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes 233 __ sub(FP, t, Gargs); // setup parameter pointer 234 #ifdef _LP64 235 __ add( Gargs, STACK_BIAS, Gargs ); // Account for LP64 stack bias 236 #endif 237 __ mov(SP, O5_savedSP); 238 239 240 // do the call 241 // 242 // the following register must be setup: 243 // 244 // G2_thread 245 // G5_method 246 // Gargs 247 BLOCK_COMMENT("call Java function"); 248 __ jmpl(entry_point.as_in().as_register(), G0, O7); 249 __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method 250 251 BLOCK_COMMENT("call_stub_return_address:"); 252 return_pc = __ pc(); 253 254 // The callee, if it wasn't interpreted, can return with SP changed so 255 // we can no longer assert of change of SP. 256 257 // store result depending on type 258 // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE 259 // is treated as T_INT) 260 { const Register addr = result .as_in().as_register(); 261 const Register type = result_type.as_in().as_register(); 262 Label is_long, is_float, is_double, is_object, exit; 263 __ cmp(type, T_OBJECT); __ br(Assembler::equal, false, Assembler::pn, is_object); 264 __ delayed()->cmp(type, T_FLOAT); __ br(Assembler::equal, false, Assembler::pn, is_float); 265 __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double); 266 __ delayed()->cmp(type, T_LONG); __ br(Assembler::equal, false, Assembler::pn, is_long); 267 __ delayed()->nop(); 268 269 // store int result 270 __ st(O0, addr, G0); 271 272 __ BIND(exit); 273 __ ret(); 274 __ delayed()->restore(); 275 276 __ BIND(is_object); 277 __ ba(exit); 278 __ delayed()->st_ptr(O0, addr, G0); 279 280 __ BIND(is_float); 281 __ ba(exit); 282 __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0); 283 284 __ BIND(is_double); 285 __ ba(exit); 286 __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0); 287 288 __ BIND(is_long); 289 #ifdef _LP64 290 __ ba(exit); 291 __ delayed()->st_long(O0, addr, G0); // store entire long 292 #else 293 #if defined(COMPILER2) 294 // All return values are where we want them, except for Longs. C2 returns 295 // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1. 296 // Since the interpreter will return longs in G1 and O0/O1 in the 32bit 297 // build we simply always use G1. 298 // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to 299 // do this here. Unfortunately if we did a rethrow we'd see an machepilog node 300 // first which would move g1 -> O0/O1 and destroy the exception we were throwing. 301 302 __ ba(exit); 303 __ delayed()->stx(G1, addr, G0); // store entire long 304 #else 305 __ st(O1, addr, BytesPerInt); 306 __ ba(exit); 307 __ delayed()->st(O0, addr, G0); 308 #endif /* COMPILER2 */ 309 #endif /* _LP64 */ 310 } 311 return start; 312 } 313 314 315 //---------------------------------------------------------------------------------------------------- 316 // Return point for a Java call if there's an exception thrown in Java code. 317 // The exception is caught and transformed into a pending exception stored in 318 // JavaThread that can be tested from within the VM. 319 // 320 // Oexception: exception oop 321 322 address generate_catch_exception() { 323 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 324 325 address start = __ pc(); 326 // verify that thread corresponds 327 __ verify_thread(); 328 329 const Register& temp_reg = Gtemp; 330 Address pending_exception_addr (G2_thread, Thread::pending_exception_offset()); 331 Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ()); 332 Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ()); 333 334 // set pending exception 335 __ verify_oop(Oexception); 336 __ st_ptr(Oexception, pending_exception_addr); 337 __ set((intptr_t)__FILE__, temp_reg); 338 __ st_ptr(temp_reg, exception_file_offset_addr); 339 __ set((intptr_t)__LINE__, temp_reg); 340 __ st(temp_reg, exception_line_offset_addr); 341 342 // complete return to VM 343 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before"); 344 345 AddressLiteral stub_ret(StubRoutines::_call_stub_return_address); 346 __ jump_to(stub_ret, temp_reg); 347 __ delayed()->nop(); 348 349 return start; 350 } 351 352 353 //---------------------------------------------------------------------------------------------------- 354 // Continuation point for runtime calls returning with a pending exception 355 // The pending exception check happened in the runtime or native call stub 356 // The pending exception in Thread is converted into a Java-level exception 357 // 358 // Contract with Java-level exception handler: O0 = exception 359 // O1 = throwing pc 360 361 address generate_forward_exception() { 362 StubCodeMark mark(this, "StubRoutines", "forward_exception"); 363 address start = __ pc(); 364 365 // Upon entry, O7 has the return address returning into Java 366 // (interpreted or compiled) code; i.e. the return address 367 // becomes the throwing pc. 368 369 const Register& handler_reg = Gtemp; 370 371 Address exception_addr(G2_thread, Thread::pending_exception_offset()); 372 373 #ifdef ASSERT 374 // make sure that this code is only executed if there is a pending exception 375 { Label L; 376 __ ld_ptr(exception_addr, Gtemp); 377 __ br_notnull_short(Gtemp, Assembler::pt, L); 378 __ stop("StubRoutines::forward exception: no pending exception (1)"); 379 __ bind(L); 380 } 381 #endif 382 383 // compute exception handler into handler_reg 384 __ get_thread(); 385 __ ld_ptr(exception_addr, Oexception); 386 __ verify_oop(Oexception); 387 __ save_frame(0); // compensates for compiler weakness 388 __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC 389 BLOCK_COMMENT("call exception_handler_for_return_address"); 390 __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch); 391 __ mov(O0, handler_reg); 392 __ restore(); // compensates for compiler weakness 393 394 __ ld_ptr(exception_addr, Oexception); 395 __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC 396 397 #ifdef ASSERT 398 // make sure exception is set 399 { Label L; 400 __ br_notnull_short(Oexception, Assembler::pt, L); 401 __ stop("StubRoutines::forward exception: no pending exception (2)"); 402 __ bind(L); 403 } 404 #endif 405 // jump to exception handler 406 __ jmp(handler_reg, 0); 407 // clear pending exception 408 __ delayed()->st_ptr(G0, exception_addr); 409 410 return start; 411 } 412 413 // Safefetch stubs. 414 void generate_safefetch(const char* name, int size, address* entry, 415 address* fault_pc, address* continuation_pc) { 416 // safefetch signatures: 417 // int SafeFetch32(int* adr, int errValue); 418 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 419 // 420 // arguments: 421 // o0 = adr 422 // o1 = errValue 423 // 424 // result: 425 // o0 = *adr or errValue 426 427 StubCodeMark mark(this, "StubRoutines", name); 428 429 // Entry point, pc or function descriptor. 430 __ align(CodeEntryAlignment); 431 *entry = __ pc(); 432 433 __ mov(O0, G1); // g1 = o0 434 __ mov(O1, O0); // o0 = o1 435 // Load *adr into c_rarg1, may fault. 436 *fault_pc = __ pc(); 437 switch (size) { 438 case 4: 439 // int32_t 440 __ ldsw(G1, 0, O0); // o0 = [g1] 441 break; 442 case 8: 443 // int64_t 444 __ ldx(G1, 0, O0); // o0 = [g1] 445 break; 446 default: 447 ShouldNotReachHere(); 448 } 449 450 // return errValue or *adr 451 *continuation_pc = __ pc(); 452 // By convention with the trap handler we ensure there is a non-CTI 453 // instruction in the trap shadow. 454 __ nop(); 455 __ retl(); 456 __ delayed()->nop(); 457 } 458 459 //------------------------------------------------------------------------------------------------------------------------ 460 // Continuation point for throwing of implicit exceptions that are not handled in 461 // the current activation. Fabricates an exception oop and initiates normal 462 // exception dispatching in this frame. Only callee-saved registers are preserved 463 // (through the normal register window / RegisterMap handling). 464 // If the compiler needs all registers to be preserved between the fault 465 // point and the exception handler then it must assume responsibility for that in 466 // AbstractCompiler::continuation_for_implicit_null_exception or 467 // continuation_for_implicit_division_by_zero_exception. All other implicit 468 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are 469 // either at call sites or otherwise assume that stack unwinding will be initiated, 470 // so caller saved registers were assumed volatile in the compiler. 471 472 // Note that we generate only this stub into a RuntimeStub, because it needs to be 473 // properly traversed and ignored during GC, so we change the meaning of the "__" 474 // macro within this method. 475 #undef __ 476 #define __ masm-> 477 478 address generate_throw_exception(const char* name, address runtime_entry, 479 Register arg1 = noreg, Register arg2 = noreg) { 480 #ifdef ASSERT 481 int insts_size = VerifyThread ? 1 * K : 600; 482 #else 483 int insts_size = VerifyThread ? 1 * K : 256; 484 #endif /* ASSERT */ 485 int locs_size = 32; 486 487 CodeBuffer code(name, insts_size, locs_size); 488 MacroAssembler* masm = new MacroAssembler(&code); 489 490 __ verify_thread(); 491 492 // This is an inlined and slightly modified version of call_VM 493 // which has the ability to fetch the return PC out of thread-local storage 494 __ assert_not_delayed(); 495 496 // Note that we always push a frame because on the SPARC 497 // architecture, for all of our implicit exception kinds at call 498 // sites, the implicit exception is taken before the callee frame 499 // is pushed. 500 __ save_frame(0); 501 502 int frame_complete = __ offset(); 503 504 // Note that we always have a runtime stub frame on the top of stack by this point 505 Register last_java_sp = SP; 506 // 64-bit last_java_sp is biased! 507 __ set_last_Java_frame(last_java_sp, G0); 508 if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early 509 __ save_thread(noreg); 510 if (arg1 != noreg) { 511 assert(arg2 != O1, "clobbered"); 512 __ mov(arg1, O1); 513 } 514 if (arg2 != noreg) { 515 __ mov(arg2, O2); 516 } 517 // do the call 518 BLOCK_COMMENT("call runtime_entry"); 519 __ call(runtime_entry, relocInfo::runtime_call_type); 520 if (!VerifyThread) 521 __ delayed()->mov(G2_thread, O0); // pass thread as first argument 522 else 523 __ delayed()->nop(); // (thread already passed) 524 __ restore_thread(noreg); 525 __ reset_last_Java_frame(); 526 527 // check for pending exceptions. use Gtemp as scratch register. 528 #ifdef ASSERT 529 Label L; 530 531 Address exception_addr(G2_thread, Thread::pending_exception_offset()); 532 Register scratch_reg = Gtemp; 533 __ ld_ptr(exception_addr, scratch_reg); 534 __ br_notnull_short(scratch_reg, Assembler::pt, L); 535 __ should_not_reach_here(); 536 __ bind(L); 537 #endif // ASSERT 538 BLOCK_COMMENT("call forward_exception_entry"); 539 __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type); 540 // we use O7 linkage so that forward_exception_entry has the issuing PC 541 __ delayed()->restore(); 542 543 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false); 544 return stub->entry_point(); 545 } 546 547 #undef __ 548 #define __ _masm-> 549 550 551 // Generate a routine that sets all the registers so we 552 // can tell if the stop routine prints them correctly. 553 address generate_test_stop() { 554 StubCodeMark mark(this, "StubRoutines", "test_stop"); 555 address start = __ pc(); 556 557 int i; 558 559 __ save_frame(0); 560 561 static jfloat zero = 0.0, one = 1.0; 562 563 // put addr in L0, then load through L0 to F0 564 __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0); 565 __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1 566 567 // use add to put 2..18 in F2..F18 568 for ( i = 2; i <= 18; ++i ) { 569 __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i)); 570 } 571 572 // Now put double 2 in F16, double 18 in F18 573 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 ); 574 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 ); 575 576 // use add to put 20..32 in F20..F32 577 for (i = 20; i < 32; i += 2) { 578 __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i)); 579 } 580 581 // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's 582 for ( i = 0; i < 8; ++i ) { 583 if (i < 6) { 584 __ set( i, as_iRegister(i)); 585 __ set(16 + i, as_oRegister(i)); 586 __ set(24 + i, as_gRegister(i)); 587 } 588 __ set( 8 + i, as_lRegister(i)); 589 } 590 591 __ stop("testing stop"); 592 593 594 __ ret(); 595 __ delayed()->restore(); 596 597 return start; 598 } 599 600 601 address generate_stop_subroutine() { 602 StubCodeMark mark(this, "StubRoutines", "stop_subroutine"); 603 address start = __ pc(); 604 605 __ stop_subroutine(); 606 607 return start; 608 } 609 610 address generate_flush_callers_register_windows() { 611 StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows"); 612 address start = __ pc(); 613 614 __ flushw(); 615 __ retl(false); 616 __ delayed()->add( FP, STACK_BIAS, O0 ); 617 // The returned value must be a stack pointer whose register save area 618 // is flushed, and will stay flushed while the caller executes. 619 620 return start; 621 } 622 623 // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest). 624 // 625 // Arguments: 626 // 627 // exchange_value: O0 628 // dest: O1 629 // 630 // Results: 631 // 632 // O0: the value previously stored in dest 633 // 634 address generate_atomic_xchg() { 635 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 636 address start = __ pc(); 637 638 if (UseCASForSwap) { 639 // Use CAS instead of swap, just in case the MP hardware 640 // prefers to work with just one kind of synch. instruction. 641 Label retry; 642 __ BIND(retry); 643 __ mov(O0, O3); // scratch copy of exchange value 644 __ ld(O1, 0, O2); // observe the previous value 645 // try to replace O2 with O3 646 __ cas(O1, O2, O3); 647 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry); 648 649 __ retl(false); 650 __ delayed()->mov(O2, O0); // report previous value to caller 651 } else { 652 __ retl(false); 653 __ delayed()->swap(O1, 0, O0); 654 } 655 656 return start; 657 } 658 659 660 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value) 661 // 662 // Arguments: 663 // 664 // exchange_value: O0 665 // dest: O1 666 // compare_value: O2 667 // 668 // Results: 669 // 670 // O0: the value previously stored in dest 671 // 672 address generate_atomic_cmpxchg() { 673 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg"); 674 address start = __ pc(); 675 676 // cmpxchg(dest, compare_value, exchange_value) 677 __ cas(O1, O2, O0); 678 __ retl(false); 679 __ delayed()->nop(); 680 681 return start; 682 } 683 684 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value) 685 // 686 // Arguments: 687 // 688 // exchange_value: O1:O0 689 // dest: O2 690 // compare_value: O4:O3 691 // 692 // Results: 693 // 694 // O1:O0: the value previously stored in dest 695 // 696 // Overwrites: G1,G2,G3 697 // 698 address generate_atomic_cmpxchg_long() { 699 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 700 address start = __ pc(); 701 702 __ sllx(O0, 32, O0); 703 __ srl(O1, 0, O1); 704 __ or3(O0,O1,O0); // O0 holds 64-bit value from compare_value 705 __ sllx(O3, 32, O3); 706 __ srl(O4, 0, O4); 707 __ or3(O3,O4,O3); // O3 holds 64-bit value from exchange_value 708 __ casx(O2, O3, O0); 709 __ srl(O0, 0, O1); // unpacked return value in O1:O0 710 __ retl(false); 711 __ delayed()->srlx(O0, 32, O0); 712 713 return start; 714 } 715 716 717 // Support for jint Atomic::add(jint add_value, volatile jint* dest). 718 // 719 // Arguments: 720 // 721 // add_value: O0 (e.g., +1 or -1) 722 // dest: O1 723 // 724 // Results: 725 // 726 // O0: the new value stored in dest 727 // 728 // Overwrites: O3 729 // 730 address generate_atomic_add() { 731 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 732 address start = __ pc(); 733 __ BIND(_atomic_add_stub); 734 735 Label(retry); 736 __ BIND(retry); 737 738 __ lduw(O1, 0, O2); 739 __ add(O0, O2, O3); 740 __ cas(O1, O2, O3); 741 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry); 742 __ retl(false); 743 __ delayed()->add(O0, O2, O0); // note that cas made O2==O3 744 745 return start; 746 } 747 Label _atomic_add_stub; // called from other stubs 748 749 750 //------------------------------------------------------------------------------------------------------------------------ 751 // The following routine generates a subroutine to throw an asynchronous 752 // UnknownError when an unsafe access gets a fault that could not be 753 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.) 754 // 755 // Arguments : 756 // 757 // trapping PC: O7 758 // 759 // Results: 760 // posts an asynchronous exception, skips the trapping instruction 761 // 762 763 address generate_handler_for_unsafe_access() { 764 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access"); 765 address start = __ pc(); 766 767 const int preserve_register_words = (64 * 2); 768 Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS); 769 770 Register Lthread = L7_thread_cache; 771 int i; 772 773 __ save_frame(0); 774 __ mov(G1, L1); 775 __ mov(G2, L2); 776 __ mov(G3, L3); 777 __ mov(G4, L4); 778 __ mov(G5, L5); 779 for (i = 0; i < 64; i += 2) { 780 __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize); 781 } 782 783 address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access); 784 BLOCK_COMMENT("call handle_unsafe_access"); 785 __ call(entry_point, relocInfo::runtime_call_type); 786 __ delayed()->nop(); 787 788 __ mov(L1, G1); 789 __ mov(L2, G2); 790 __ mov(L3, G3); 791 __ mov(L4, G4); 792 __ mov(L5, G5); 793 for (i = 0; i < 64; i += 2) { 794 __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize); 795 } 796 797 __ verify_thread(); 798 799 __ jmp(O0, 0); 800 __ delayed()->restore(); 801 802 return start; 803 } 804 805 806 // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super ); 807 // Arguments : 808 // 809 // ret : O0, returned 810 // icc/xcc: set as O0 (depending on wordSize) 811 // sub : O1, argument, not changed 812 // super: O2, argument, not changed 813 // raddr: O7, blown by call 814 address generate_partial_subtype_check() { 815 __ align(CodeEntryAlignment); 816 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check"); 817 address start = __ pc(); 818 Label miss; 819 820 #if defined(COMPILER2) && !defined(_LP64) 821 // Do not use a 'save' because it blows the 64-bit O registers. 822 __ add(SP,-4*wordSize,SP); // Make space for 4 temps (stack must be 2 words aligned) 823 __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize); 824 __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize); 825 __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize); 826 __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize); 827 Register Rret = O0; 828 Register Rsub = O1; 829 Register Rsuper = O2; 830 #else 831 __ save_frame(0); 832 Register Rret = I0; 833 Register Rsub = I1; 834 Register Rsuper = I2; 835 #endif 836 837 Register L0_ary_len = L0; 838 Register L1_ary_ptr = L1; 839 Register L2_super = L2; 840 Register L3_index = L3; 841 842 __ check_klass_subtype_slow_path(Rsub, Rsuper, 843 L0, L1, L2, L3, 844 NULL, &miss); 845 846 // Match falls through here. 847 __ addcc(G0,0,Rret); // set Z flags, Z result 848 849 #if defined(COMPILER2) && !defined(_LP64) 850 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0); 851 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1); 852 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2); 853 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3); 854 __ retl(); // Result in Rret is zero; flags set to Z 855 __ delayed()->add(SP,4*wordSize,SP); 856 #else 857 __ ret(); // Result in Rret is zero; flags set to Z 858 __ delayed()->restore(); 859 #endif 860 861 __ BIND(miss); 862 __ addcc(G0,1,Rret); // set NZ flags, NZ result 863 864 #if defined(COMPILER2) && !defined(_LP64) 865 __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0); 866 __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1); 867 __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2); 868 __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3); 869 __ retl(); // Result in Rret is != 0; flags set to NZ 870 __ delayed()->add(SP,4*wordSize,SP); 871 #else 872 __ ret(); // Result in Rret is != 0; flags set to NZ 873 __ delayed()->restore(); 874 #endif 875 876 return start; 877 } 878 879 880 // Called from MacroAssembler::verify_oop 881 // 882 address generate_verify_oop_subroutine() { 883 StubCodeMark mark(this, "StubRoutines", "verify_oop_stub"); 884 885 address start = __ pc(); 886 887 __ verify_oop_subroutine(); 888 889 return start; 890 } 891 892 893 // 894 // Verify that a register contains clean 32-bits positive value 895 // (high 32-bits are 0) so it could be used in 64-bits shifts (sllx, srax). 896 // 897 // Input: 898 // Rint - 32-bits value 899 // Rtmp - scratch 900 // 901 void assert_clean_int(Register Rint, Register Rtmp) { 902 #if defined(ASSERT) && defined(_LP64) 903 __ signx(Rint, Rtmp); 904 __ cmp(Rint, Rtmp); 905 __ breakpoint_trap(Assembler::notEqual, Assembler::xcc); 906 #endif 907 } 908 909 // 910 // Generate overlap test for array copy stubs 911 // 912 // Input: 913 // O0 - array1 914 // O1 - array2 915 // O2 - element count 916 // 917 // Kills temps: O3, O4 918 // 919 void array_overlap_test(address no_overlap_target, int log2_elem_size) { 920 assert(no_overlap_target != NULL, "must be generated"); 921 array_overlap_test(no_overlap_target, NULL, log2_elem_size); 922 } 923 void array_overlap_test(Label& L_no_overlap, int log2_elem_size) { 924 array_overlap_test(NULL, &L_no_overlap, log2_elem_size); 925 } 926 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) { 927 const Register from = O0; 928 const Register to = O1; 929 const Register count = O2; 930 const Register to_from = O3; // to - from 931 const Register byte_count = O4; // count << log2_elem_size 932 933 __ subcc(to, from, to_from); 934 __ sll_ptr(count, log2_elem_size, byte_count); 935 if (NOLp == NULL) 936 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target); 937 else 938 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp)); 939 __ delayed()->cmp(to_from, byte_count); 940 if (NOLp == NULL) 941 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target); 942 else 943 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp)); 944 __ delayed()->nop(); 945 } 946 947 // 948 // Generate pre-write barrier for array. 949 // 950 // Input: 951 // addr - register containing starting address 952 // count - register containing element count 953 // tmp - scratch register 954 // 955 // The input registers are overwritten. 956 // 957 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { 958 BarrierSet* bs = Universe::heap()->barrier_set(); 959 switch (bs->kind()) { 960 case BarrierSet::G1SATBCTLogging: 961 // With G1, don't generate the call if we statically know that the target in uninitialized 962 if (!dest_uninitialized) { 963 __ save_frame(0); 964 // Save the necessary global regs... will be used after. 965 if (addr->is_global()) { 966 __ mov(addr, L0); 967 } 968 if (count->is_global()) { 969 __ mov(count, L1); 970 } 971 __ mov(addr->after_save(), O0); 972 // Get the count into O1 973 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)); 974 __ delayed()->mov(count->after_save(), O1); 975 if (addr->is_global()) { 976 __ mov(L0, addr); 977 } 978 if (count->is_global()) { 979 __ mov(L1, count); 980 } 981 __ restore(); 982 } 983 break; 984 case BarrierSet::CardTableModRef: 985 case BarrierSet::CardTableExtension: 986 case BarrierSet::ModRef: 987 break; 988 default: 989 ShouldNotReachHere(); 990 } 991 } 992 // 993 // Generate post-write barrier for array. 994 // 995 // Input: 996 // addr - register containing starting address 997 // count - register containing element count 998 // tmp - scratch register 999 // 1000 // The input registers are overwritten. 1001 // 1002 void gen_write_ref_array_post_barrier(Register addr, Register count, 1003 Register tmp) { 1004 BarrierSet* bs = Universe::heap()->barrier_set(); 1005 1006 switch (bs->kind()) { 1007 case BarrierSet::G1SATBCTLogging: 1008 { 1009 // Get some new fresh output registers. 1010 __ save_frame(0); 1011 __ mov(addr->after_save(), O0); 1012 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)); 1013 __ delayed()->mov(count->after_save(), O1); 1014 __ restore(); 1015 } 1016 break; 1017 case BarrierSet::CardTableModRef: 1018 case BarrierSet::CardTableExtension: 1019 { 1020 CardTableModRefBS* ct = (CardTableModRefBS*)bs; 1021 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 1022 assert_different_registers(addr, count, tmp); 1023 1024 Label L_loop; 1025 1026 __ sll_ptr(count, LogBytesPerHeapOop, count); 1027 __ sub(count, BytesPerHeapOop, count); 1028 __ add(count, addr, count); 1029 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.) 1030 __ srl_ptr(addr, CardTableModRefBS::card_shift, addr); 1031 __ srl_ptr(count, CardTableModRefBS::card_shift, count); 1032 __ sub(count, addr, count); 1033 AddressLiteral rs(ct->byte_map_base); 1034 __ set(rs, tmp); 1035 __ BIND(L_loop); 1036 __ stb(G0, tmp, addr); 1037 __ subcc(count, 1, count); 1038 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1039 __ delayed()->add(addr, 1, addr); 1040 } 1041 break; 1042 case BarrierSet::ModRef: 1043 break; 1044 default: 1045 ShouldNotReachHere(); 1046 } 1047 } 1048 1049 // 1050 // Generate main code for disjoint arraycopy 1051 // 1052 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec, 1053 Label& L_loop, bool use_prefetch, bool use_bis); 1054 1055 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size, 1056 int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) { 1057 Label L_copy; 1058 1059 assert(log2_elem_size <= 3, "the following code should be changed"); 1060 int count_dec = 16>>log2_elem_size; 1061 1062 int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance); 1063 assert(prefetch_dist < 4096, "invalid value"); 1064 prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size 1065 int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count 1066 1067 if (UseBlockCopy) { 1068 Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy; 1069 1070 // 64 bytes tail + bytes copied in one loop iteration 1071 int tail_size = 64 + iter_size; 1072 int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size; 1073 // Use BIS copy only for big arrays since it requires membar. 1074 __ set(block_copy_count, O4); 1075 __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy); 1076 // This code is for disjoint source and destination: 1077 // to <= from || to >= from+count 1078 // but BIS will stomp over 'from' if (to > from-tail_size && to <= from) 1079 __ sub(from, to, O4); 1080 __ srax(O4, 4, O4); // divide by 16 since following short branch have only 5 bits for imm. 1081 __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy); 1082 1083 __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY); 1084 // BIS should not be used to copy tail (64 bytes+iter_size) 1085 // to avoid zeroing of following values. 1086 __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0 1087 1088 if (prefetch_count > 0) { // rounded up to one iteration count 1089 // Do prefetching only if copy size is bigger 1090 // than prefetch distance. 1091 __ set(prefetch_count, O4); 1092 __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy); 1093 __ sub(count, prefetch_count, count); 1094 1095 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true); 1096 __ add(count, prefetch_count, count); // restore count 1097 1098 } // prefetch_count > 0 1099 1100 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true); 1101 __ add(count, (tail_size>>log2_elem_size), count); // restore count 1102 1103 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT); 1104 // BIS needs membar. 1105 __ membar(Assembler::StoreLoad); 1106 // Copy tail 1107 __ ba_short(L_copy); 1108 1109 __ BIND(L_skip_block_copy); 1110 } // UseBlockCopy 1111 1112 if (prefetch_count > 0) { // rounded up to one iteration count 1113 // Do prefetching only if copy size is bigger 1114 // than prefetch distance. 1115 __ set(prefetch_count, O4); 1116 __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy); 1117 __ sub(count, prefetch_count, count); 1118 1119 Label L_copy_prefetch; 1120 (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false); 1121 __ add(count, prefetch_count, count); // restore count 1122 1123 } // prefetch_count > 0 1124 1125 (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false); 1126 } 1127 1128 1129 1130 // 1131 // Helper methods for copy_16_bytes_forward_with_shift() 1132 // 1133 void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec, 1134 Label& L_loop, bool use_prefetch, bool use_bis) { 1135 1136 const Register left_shift = G1; // left shift bit counter 1137 const Register right_shift = G5; // right shift bit counter 1138 1139 __ align(OptoLoopAlignment); 1140 __ BIND(L_loop); 1141 if (use_prefetch) { 1142 if (ArraycopySrcPrefetchDistance > 0) { 1143 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1144 } 1145 if (ArraycopyDstPrefetchDistance > 0) { 1146 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1147 } 1148 } 1149 __ ldx(from, 0, O4); 1150 __ ldx(from, 8, G4); 1151 __ inc(to, 16); 1152 __ inc(from, 16); 1153 __ deccc(count, count_dec); // Can we do next iteration after this one? 1154 __ srlx(O4, right_shift, G3); 1155 __ bset(G3, O3); 1156 __ sllx(O4, left_shift, O4); 1157 __ srlx(G4, right_shift, G3); 1158 __ bset(G3, O4); 1159 if (use_bis) { 1160 __ stxa(O3, to, -16); 1161 __ stxa(O4, to, -8); 1162 } else { 1163 __ stx(O3, to, -16); 1164 __ stx(O4, to, -8); 1165 } 1166 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1167 __ delayed()->sllx(G4, left_shift, O3); 1168 } 1169 1170 // Copy big chunks forward with shift 1171 // 1172 // Inputs: 1173 // from - source arrays 1174 // to - destination array aligned to 8-bytes 1175 // count - elements count to copy >= the count equivalent to 16 bytes 1176 // count_dec - elements count's decrement equivalent to 16 bytes 1177 // L_copy_bytes - copy exit label 1178 // 1179 void copy_16_bytes_forward_with_shift(Register from, Register to, 1180 Register count, int log2_elem_size, Label& L_copy_bytes) { 1181 Label L_aligned_copy, L_copy_last_bytes; 1182 assert(log2_elem_size <= 3, "the following code should be changed"); 1183 int count_dec = 16>>log2_elem_size; 1184 1185 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 1186 __ andcc(from, 7, G1); // misaligned bytes 1187 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1188 __ delayed()->nop(); 1189 1190 const Register left_shift = G1; // left shift bit counter 1191 const Register right_shift = G5; // right shift bit counter 1192 1193 __ sll(G1, LogBitsPerByte, left_shift); 1194 __ mov(64, right_shift); 1195 __ sub(right_shift, left_shift, right_shift); 1196 1197 // 1198 // Load 2 aligned 8-bytes chunks and use one from previous iteration 1199 // to form 2 aligned 8-bytes chunks to store. 1200 // 1201 __ dec(count, count_dec); // Pre-decrement 'count' 1202 __ andn(from, 7, from); // Align address 1203 __ ldx(from, 0, O3); 1204 __ inc(from, 8); 1205 __ sllx(O3, left_shift, O3); 1206 1207 disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop); 1208 1209 __ inccc(count, count_dec>>1 ); // + 8 bytes 1210 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1211 __ delayed()->inc(count, count_dec>>1); // restore 'count' 1212 1213 // copy 8 bytes, part of them already loaded in O3 1214 __ ldx(from, 0, O4); 1215 __ inc(to, 8); 1216 __ inc(from, 8); 1217 __ srlx(O4, right_shift, G3); 1218 __ bset(O3, G3); 1219 __ stx(G3, to, -8); 1220 1221 __ BIND(L_copy_last_bytes); 1222 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes 1223 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1224 __ delayed()->sub(from, right_shift, from); // restore address 1225 1226 __ BIND(L_aligned_copy); 1227 } 1228 1229 // Copy big chunks backward with shift 1230 // 1231 // Inputs: 1232 // end_from - source arrays end address 1233 // end_to - destination array end address aligned to 8-bytes 1234 // count - elements count to copy >= the count equivalent to 16 bytes 1235 // count_dec - elements count's decrement equivalent to 16 bytes 1236 // L_aligned_copy - aligned copy exit label 1237 // L_copy_bytes - copy exit label 1238 // 1239 void copy_16_bytes_backward_with_shift(Register end_from, Register end_to, 1240 Register count, int count_dec, 1241 Label& L_aligned_copy, Label& L_copy_bytes) { 1242 Label L_loop, L_copy_last_bytes; 1243 1244 // if both arrays have the same alignment mod 8, do 8 bytes aligned copy 1245 __ andcc(end_from, 7, G1); // misaligned bytes 1246 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1247 __ delayed()->deccc(count, count_dec); // Pre-decrement 'count' 1248 1249 const Register left_shift = G1; // left shift bit counter 1250 const Register right_shift = G5; // right shift bit counter 1251 1252 __ sll(G1, LogBitsPerByte, left_shift); 1253 __ mov(64, right_shift); 1254 __ sub(right_shift, left_shift, right_shift); 1255 1256 // 1257 // Load 2 aligned 8-bytes chunks and use one from previous iteration 1258 // to form 2 aligned 8-bytes chunks to store. 1259 // 1260 __ andn(end_from, 7, end_from); // Align address 1261 __ ldx(end_from, 0, O3); 1262 __ align(OptoLoopAlignment); 1263 __ BIND(L_loop); 1264 __ ldx(end_from, -8, O4); 1265 __ deccc(count, count_dec); // Can we do next iteration after this one? 1266 __ ldx(end_from, -16, G4); 1267 __ dec(end_to, 16); 1268 __ dec(end_from, 16); 1269 __ srlx(O3, right_shift, O3); 1270 __ sllx(O4, left_shift, G3); 1271 __ bset(G3, O3); 1272 __ stx(O3, end_to, 8); 1273 __ srlx(O4, right_shift, O4); 1274 __ sllx(G4, left_shift, G3); 1275 __ bset(G3, O4); 1276 __ stx(O4, end_to, 0); 1277 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1278 __ delayed()->mov(G4, O3); 1279 1280 __ inccc(count, count_dec>>1 ); // + 8 bytes 1281 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes); 1282 __ delayed()->inc(count, count_dec>>1); // restore 'count' 1283 1284 // copy 8 bytes, part of them already loaded in O3 1285 __ ldx(end_from, -8, O4); 1286 __ dec(end_to, 8); 1287 __ dec(end_from, 8); 1288 __ srlx(O3, right_shift, O3); 1289 __ sllx(O4, left_shift, G3); 1290 __ bset(O3, G3); 1291 __ stx(G3, end_to, 0); 1292 1293 __ BIND(L_copy_last_bytes); 1294 __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes 1295 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes); 1296 __ delayed()->add(end_from, left_shift, end_from); // restore address 1297 } 1298 1299 // 1300 // Generate stub for disjoint byte copy. If "aligned" is true, the 1301 // "from" and "to" addresses are assumed to be heapword aligned. 1302 // 1303 // Arguments for generated stub: 1304 // from: O0 1305 // to: O1 1306 // count: O2 treated as signed 1307 // 1308 address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) { 1309 __ align(CodeEntryAlignment); 1310 StubCodeMark mark(this, "StubRoutines", name); 1311 address start = __ pc(); 1312 1313 Label L_skip_alignment, L_align; 1314 Label L_copy_byte, L_copy_byte_loop, L_exit; 1315 1316 const Register from = O0; // source array address 1317 const Register to = O1; // destination array address 1318 const Register count = O2; // elements count 1319 const Register offset = O5; // offset from start of arrays 1320 // O3, O4, G3, G4 are used as temp registers 1321 1322 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1323 1324 if (entry != NULL) { 1325 *entry = __ pc(); 1326 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1327 BLOCK_COMMENT("Entry:"); 1328 } 1329 1330 // for short arrays, just do single element copy 1331 __ cmp(count, 23); // 16 + 7 1332 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte); 1333 __ delayed()->mov(G0, offset); 1334 1335 if (aligned) { 1336 // 'aligned' == true when it is known statically during compilation 1337 // of this arraycopy call site that both 'from' and 'to' addresses 1338 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1339 // 1340 // Aligned arrays have 4 bytes alignment in 32-bits VM 1341 // and 8 bytes - in 64-bits VM. So we do it only for 32-bits VM 1342 // 1343 #ifndef _LP64 1344 // copy a 4-bytes word if necessary to align 'to' to 8 bytes 1345 __ andcc(to, 7, G0); 1346 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment); 1347 __ delayed()->ld(from, 0, O3); 1348 __ inc(from, 4); 1349 __ inc(to, 4); 1350 __ dec(count, 4); 1351 __ st(O3, to, -4); 1352 __ BIND(L_skip_alignment); 1353 #endif 1354 } else { 1355 // copy bytes to align 'to' on 8 byte boundary 1356 __ andcc(to, 7, G1); // misaligned bytes 1357 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1358 __ delayed()->neg(G1); 1359 __ inc(G1, 8); // bytes need to copy to next 8-bytes alignment 1360 __ sub(count, G1, count); 1361 __ BIND(L_align); 1362 __ ldub(from, 0, O3); 1363 __ deccc(G1); 1364 __ inc(from); 1365 __ stb(O3, to, 0); 1366 __ br(Assembler::notZero, false, Assembler::pt, L_align); 1367 __ delayed()->inc(to); 1368 __ BIND(L_skip_alignment); 1369 } 1370 #ifdef _LP64 1371 if (!aligned) 1372 #endif 1373 { 1374 // Copy with shift 16 bytes per iteration if arrays do not have 1375 // the same alignment mod 8, otherwise fall through to the next 1376 // code for aligned copy. 1377 // The compare above (count >= 23) guarantes 'count' >= 16 bytes. 1378 // Also jump over aligned copy after the copy with shift completed. 1379 1380 copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte); 1381 } 1382 1383 // Both array are 8 bytes aligned, copy 16 bytes at a time 1384 __ and3(count, 7, G4); // Save count 1385 __ srl(count, 3, count); 1386 generate_disjoint_long_copy_core(aligned); 1387 __ mov(G4, count); // Restore count 1388 1389 // copy tailing bytes 1390 __ BIND(L_copy_byte); 1391 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1392 __ align(OptoLoopAlignment); 1393 __ BIND(L_copy_byte_loop); 1394 __ ldub(from, offset, O3); 1395 __ deccc(count); 1396 __ stb(O3, to, offset); 1397 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop); 1398 __ delayed()->inc(offset); 1399 1400 __ BIND(L_exit); 1401 // O3, O4 are used as temp registers 1402 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4); 1403 __ retl(); 1404 __ delayed()->mov(G0, O0); // return 0 1405 return start; 1406 } 1407 1408 // 1409 // Generate stub for conjoint byte copy. If "aligned" is true, the 1410 // "from" and "to" addresses are assumed to be heapword aligned. 1411 // 1412 // Arguments for generated stub: 1413 // from: O0 1414 // to: O1 1415 // count: O2 treated as signed 1416 // 1417 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1418 address *entry, const char *name) { 1419 // Do reverse copy. 1420 1421 __ align(CodeEntryAlignment); 1422 StubCodeMark mark(this, "StubRoutines", name); 1423 address start = __ pc(); 1424 1425 Label L_skip_alignment, L_align, L_aligned_copy; 1426 Label L_copy_byte, L_copy_byte_loop, L_exit; 1427 1428 const Register from = O0; // source array address 1429 const Register to = O1; // destination array address 1430 const Register count = O2; // elements count 1431 const Register end_from = from; // source array end address 1432 const Register end_to = to; // destination array end address 1433 1434 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1435 1436 if (entry != NULL) { 1437 *entry = __ pc(); 1438 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1439 BLOCK_COMMENT("Entry:"); 1440 } 1441 1442 array_overlap_test(nooverlap_target, 0); 1443 1444 __ add(to, count, end_to); // offset after last copied element 1445 1446 // for short arrays, just do single element copy 1447 __ cmp(count, 23); // 16 + 7 1448 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte); 1449 __ delayed()->add(from, count, end_from); 1450 1451 { 1452 // Align end of arrays since they could be not aligned even 1453 // when arrays itself are aligned. 1454 1455 // copy bytes to align 'end_to' on 8 byte boundary 1456 __ andcc(end_to, 7, G1); // misaligned bytes 1457 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1458 __ delayed()->nop(); 1459 __ sub(count, G1, count); 1460 __ BIND(L_align); 1461 __ dec(end_from); 1462 __ dec(end_to); 1463 __ ldub(end_from, 0, O3); 1464 __ deccc(G1); 1465 __ brx(Assembler::notZero, false, Assembler::pt, L_align); 1466 __ delayed()->stb(O3, end_to, 0); 1467 __ BIND(L_skip_alignment); 1468 } 1469 #ifdef _LP64 1470 if (aligned) { 1471 // Both arrays are aligned to 8-bytes in 64-bits VM. 1472 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 1473 // in unaligned case. 1474 __ dec(count, 16); 1475 } else 1476 #endif 1477 { 1478 // Copy with shift 16 bytes per iteration if arrays do not have 1479 // the same alignment mod 8, otherwise jump to the next 1480 // code for aligned copy (and substracting 16 from 'count' before jump). 1481 // The compare above (count >= 11) guarantes 'count' >= 16 bytes. 1482 // Also jump over aligned copy after the copy with shift completed. 1483 1484 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16, 1485 L_aligned_copy, L_copy_byte); 1486 } 1487 // copy 4 elements (16 bytes) at a time 1488 __ align(OptoLoopAlignment); 1489 __ BIND(L_aligned_copy); 1490 __ dec(end_from, 16); 1491 __ ldx(end_from, 8, O3); 1492 __ ldx(end_from, 0, O4); 1493 __ dec(end_to, 16); 1494 __ deccc(count, 16); 1495 __ stx(O3, end_to, 8); 1496 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1497 __ delayed()->stx(O4, end_to, 0); 1498 __ inc(count, 16); 1499 1500 // copy 1 element (2 bytes) at a time 1501 __ BIND(L_copy_byte); 1502 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1503 __ align(OptoLoopAlignment); 1504 __ BIND(L_copy_byte_loop); 1505 __ dec(end_from); 1506 __ dec(end_to); 1507 __ ldub(end_from, 0, O4); 1508 __ deccc(count); 1509 __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop); 1510 __ delayed()->stb(O4, end_to, 0); 1511 1512 __ BIND(L_exit); 1513 // O3, O4 are used as temp registers 1514 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4); 1515 __ retl(); 1516 __ delayed()->mov(G0, O0); // return 0 1517 return start; 1518 } 1519 1520 // 1521 // Generate stub for disjoint short copy. If "aligned" is true, the 1522 // "from" and "to" addresses are assumed to be heapword aligned. 1523 // 1524 // Arguments for generated stub: 1525 // from: O0 1526 // to: O1 1527 // count: O2 treated as signed 1528 // 1529 address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) { 1530 __ align(CodeEntryAlignment); 1531 StubCodeMark mark(this, "StubRoutines", name); 1532 address start = __ pc(); 1533 1534 Label L_skip_alignment, L_skip_alignment2; 1535 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1536 1537 const Register from = O0; // source array address 1538 const Register to = O1; // destination array address 1539 const Register count = O2; // elements count 1540 const Register offset = O5; // offset from start of arrays 1541 // O3, O4, G3, G4 are used as temp registers 1542 1543 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1544 1545 if (entry != NULL) { 1546 *entry = __ pc(); 1547 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1548 BLOCK_COMMENT("Entry:"); 1549 } 1550 1551 // for short arrays, just do single element copy 1552 __ cmp(count, 11); // 8 + 3 (22 bytes) 1553 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1554 __ delayed()->mov(G0, offset); 1555 1556 if (aligned) { 1557 // 'aligned' == true when it is known statically during compilation 1558 // of this arraycopy call site that both 'from' and 'to' addresses 1559 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1560 // 1561 // Aligned arrays have 4 bytes alignment in 32-bits VM 1562 // and 8 bytes - in 64-bits VM. 1563 // 1564 #ifndef _LP64 1565 // copy a 2-elements word if necessary to align 'to' to 8 bytes 1566 __ andcc(to, 7, G0); 1567 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1568 __ delayed()->ld(from, 0, O3); 1569 __ inc(from, 4); 1570 __ inc(to, 4); 1571 __ dec(count, 2); 1572 __ st(O3, to, -4); 1573 __ BIND(L_skip_alignment); 1574 #endif 1575 } else { 1576 // copy 1 element if necessary to align 'to' on an 4 bytes 1577 __ andcc(to, 3, G0); 1578 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1579 __ delayed()->lduh(from, 0, O3); 1580 __ inc(from, 2); 1581 __ inc(to, 2); 1582 __ dec(count); 1583 __ sth(O3, to, -2); 1584 __ BIND(L_skip_alignment); 1585 1586 // copy 2 elements to align 'to' on an 8 byte boundary 1587 __ andcc(to, 7, G0); 1588 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1589 __ delayed()->lduh(from, 0, O3); 1590 __ dec(count, 2); 1591 __ lduh(from, 2, O4); 1592 __ inc(from, 4); 1593 __ inc(to, 4); 1594 __ sth(O3, to, -4); 1595 __ sth(O4, to, -2); 1596 __ BIND(L_skip_alignment2); 1597 } 1598 #ifdef _LP64 1599 if (!aligned) 1600 #endif 1601 { 1602 // Copy with shift 16 bytes per iteration if arrays do not have 1603 // the same alignment mod 8, otherwise fall through to the next 1604 // code for aligned copy. 1605 // The compare above (count >= 11) guarantes 'count' >= 16 bytes. 1606 // Also jump over aligned copy after the copy with shift completed. 1607 1608 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 1609 } 1610 1611 // Both array are 8 bytes aligned, copy 16 bytes at a time 1612 __ and3(count, 3, G4); // Save 1613 __ srl(count, 2, count); 1614 generate_disjoint_long_copy_core(aligned); 1615 __ mov(G4, count); // restore 1616 1617 // copy 1 element at a time 1618 __ BIND(L_copy_2_bytes); 1619 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1620 __ align(OptoLoopAlignment); 1621 __ BIND(L_copy_2_bytes_loop); 1622 __ lduh(from, offset, O3); 1623 __ deccc(count); 1624 __ sth(O3, to, offset); 1625 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); 1626 __ delayed()->inc(offset, 2); 1627 1628 __ BIND(L_exit); 1629 // O3, O4 are used as temp registers 1630 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1631 __ retl(); 1632 __ delayed()->mov(G0, O0); // return 0 1633 return start; 1634 } 1635 1636 // 1637 // Generate stub for disjoint short fill. If "aligned" is true, the 1638 // "to" address is assumed to be heapword aligned. 1639 // 1640 // Arguments for generated stub: 1641 // to: O0 1642 // value: O1 1643 // count: O2 treated as signed 1644 // 1645 address generate_fill(BasicType t, bool aligned, const char* name) { 1646 __ align(CodeEntryAlignment); 1647 StubCodeMark mark(this, "StubRoutines", name); 1648 address start = __ pc(); 1649 1650 const Register to = O0; // source array address 1651 const Register value = O1; // fill value 1652 const Register count = O2; // elements count 1653 // O3 is used as a temp register 1654 1655 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1656 1657 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; 1658 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes; 1659 1660 int shift = -1; 1661 switch (t) { 1662 case T_BYTE: 1663 shift = 2; 1664 break; 1665 case T_SHORT: 1666 shift = 1; 1667 break; 1668 case T_INT: 1669 shift = 0; 1670 break; 1671 default: ShouldNotReachHere(); 1672 } 1673 1674 BLOCK_COMMENT("Entry:"); 1675 1676 if (t == T_BYTE) { 1677 // Zero extend value 1678 __ and3(value, 0xff, value); 1679 __ sllx(value, 8, O3); 1680 __ or3(value, O3, value); 1681 } 1682 if (t == T_SHORT) { 1683 // Zero extend value 1684 __ sllx(value, 48, value); 1685 __ srlx(value, 48, value); 1686 } 1687 if (t == T_BYTE || t == T_SHORT) { 1688 __ sllx(value, 16, O3); 1689 __ or3(value, O3, value); 1690 } 1691 1692 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 1693 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp 1694 __ delayed()->andcc(count, 1, G0); 1695 1696 if (!aligned && (t == T_BYTE || t == T_SHORT)) { 1697 // align source address at 4 bytes address boundary 1698 if (t == T_BYTE) { 1699 // One byte misalignment happens only for byte arrays 1700 __ andcc(to, 1, G0); 1701 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1); 1702 __ delayed()->nop(); 1703 __ stb(value, to, 0); 1704 __ inc(to, 1); 1705 __ dec(count, 1); 1706 __ BIND(L_skip_align1); 1707 } 1708 // Two bytes misalignment happens only for byte and short (char) arrays 1709 __ andcc(to, 2, G0); 1710 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); 1711 __ delayed()->nop(); 1712 __ sth(value, to, 0); 1713 __ inc(to, 2); 1714 __ dec(count, 1 << (shift - 1)); 1715 __ BIND(L_skip_align2); 1716 } 1717 #ifdef _LP64 1718 if (!aligned) { 1719 #endif 1720 // align to 8 bytes, we know we are 4 byte aligned to start 1721 __ andcc(to, 7, G0); 1722 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes); 1723 __ delayed()->nop(); 1724 __ stw(value, to, 0); 1725 __ inc(to, 4); 1726 __ dec(count, 1 << shift); 1727 __ BIND(L_fill_32_bytes); 1728 #ifdef _LP64 1729 } 1730 #endif 1731 1732 if (t == T_INT) { 1733 // Zero extend value 1734 __ srl(value, 0, value); 1735 } 1736 if (t == T_BYTE || t == T_SHORT || t == T_INT) { 1737 __ sllx(value, 32, O3); 1738 __ or3(value, O3, value); 1739 } 1740 1741 Label L_check_fill_8_bytes; 1742 // Fill 32-byte chunks 1743 __ subcc(count, 8 << shift, count); 1744 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); 1745 __ delayed()->nop(); 1746 1747 Label L_fill_32_bytes_loop, L_fill_4_bytes; 1748 __ align(16); 1749 __ BIND(L_fill_32_bytes_loop); 1750 1751 __ stx(value, to, 0); 1752 __ stx(value, to, 8); 1753 __ stx(value, to, 16); 1754 __ stx(value, to, 24); 1755 1756 __ subcc(count, 8 << shift, count); 1757 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); 1758 __ delayed()->add(to, 32, to); 1759 1760 __ BIND(L_check_fill_8_bytes); 1761 __ addcc(count, 8 << shift, count); 1762 __ brx(Assembler::zero, false, Assembler::pn, L_exit); 1763 __ delayed()->subcc(count, 1 << (shift + 1), count); 1764 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); 1765 __ delayed()->andcc(count, 1<<shift, G0); 1766 1767 // 1768 // length is too short, just fill 8 bytes at a time 1769 // 1770 Label L_fill_8_bytes_loop; 1771 __ BIND(L_fill_8_bytes_loop); 1772 __ stx(value, to, 0); 1773 __ subcc(count, 1 << (shift + 1), count); 1774 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop); 1775 __ delayed()->add(to, 8, to); 1776 1777 // fill trailing 4 bytes 1778 __ andcc(count, 1<<shift, G0); // in delay slot of branches 1779 if (t == T_INT) { 1780 __ BIND(L_fill_elements); 1781 } 1782 __ BIND(L_fill_4_bytes); 1783 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes); 1784 if (t == T_BYTE || t == T_SHORT) { 1785 __ delayed()->andcc(count, 1<<(shift-1), G0); 1786 } else { 1787 __ delayed()->nop(); 1788 } 1789 __ stw(value, to, 0); 1790 if (t == T_BYTE || t == T_SHORT) { 1791 __ inc(to, 4); 1792 // fill trailing 2 bytes 1793 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches 1794 __ BIND(L_fill_2_bytes); 1795 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); 1796 __ delayed()->andcc(count, 1, count); 1797 __ sth(value, to, 0); 1798 if (t == T_BYTE) { 1799 __ inc(to, 2); 1800 // fill trailing byte 1801 __ andcc(count, 1, count); // in delay slot of branches 1802 __ BIND(L_fill_byte); 1803 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1804 __ delayed()->nop(); 1805 __ stb(value, to, 0); 1806 } else { 1807 __ BIND(L_fill_byte); 1808 } 1809 } else { 1810 __ BIND(L_fill_2_bytes); 1811 } 1812 __ BIND(L_exit); 1813 __ retl(); 1814 __ delayed()->nop(); 1815 1816 // Handle copies less than 8 bytes. Int is handled elsewhere. 1817 if (t == T_BYTE) { 1818 __ BIND(L_fill_elements); 1819 Label L_fill_2, L_fill_4; 1820 // in delay slot __ andcc(count, 1, G0); 1821 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1822 __ delayed()->andcc(count, 2, G0); 1823 __ stb(value, to, 0); 1824 __ inc(to, 1); 1825 __ BIND(L_fill_2); 1826 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4); 1827 __ delayed()->andcc(count, 4, G0); 1828 __ stb(value, to, 0); 1829 __ stb(value, to, 1); 1830 __ inc(to, 2); 1831 __ BIND(L_fill_4); 1832 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1833 __ delayed()->nop(); 1834 __ stb(value, to, 0); 1835 __ stb(value, to, 1); 1836 __ stb(value, to, 2); 1837 __ retl(); 1838 __ delayed()->stb(value, to, 3); 1839 } 1840 1841 if (t == T_SHORT) { 1842 Label L_fill_2; 1843 __ BIND(L_fill_elements); 1844 // in delay slot __ andcc(count, 1, G0); 1845 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1846 __ delayed()->andcc(count, 2, G0); 1847 __ sth(value, to, 0); 1848 __ inc(to, 2); 1849 __ BIND(L_fill_2); 1850 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1851 __ delayed()->nop(); 1852 __ sth(value, to, 0); 1853 __ retl(); 1854 __ delayed()->sth(value, to, 2); 1855 } 1856 return start; 1857 } 1858 1859 // 1860 // Generate stub for conjoint short copy. If "aligned" is true, the 1861 // "from" and "to" addresses are assumed to be heapword aligned. 1862 // 1863 // Arguments for generated stub: 1864 // from: O0 1865 // to: O1 1866 // count: O2 treated as signed 1867 // 1868 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1869 address *entry, const char *name) { 1870 // Do reverse copy. 1871 1872 __ align(CodeEntryAlignment); 1873 StubCodeMark mark(this, "StubRoutines", name); 1874 address start = __ pc(); 1875 1876 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1877 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1878 1879 const Register from = O0; // source array address 1880 const Register to = O1; // destination array address 1881 const Register count = O2; // elements count 1882 const Register end_from = from; // source array end address 1883 const Register end_to = to; // destination array end address 1884 1885 const Register byte_count = O3; // bytes count to copy 1886 1887 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1888 1889 if (entry != NULL) { 1890 *entry = __ pc(); 1891 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1892 BLOCK_COMMENT("Entry:"); 1893 } 1894 1895 array_overlap_test(nooverlap_target, 1); 1896 1897 __ sllx(count, LogBytesPerShort, byte_count); 1898 __ add(to, byte_count, end_to); // offset after last copied element 1899 1900 // for short arrays, just do single element copy 1901 __ cmp(count, 11); // 8 + 3 (22 bytes) 1902 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1903 __ delayed()->add(from, byte_count, end_from); 1904 1905 { 1906 // Align end of arrays since they could be not aligned even 1907 // when arrays itself are aligned. 1908 1909 // copy 1 element if necessary to align 'end_to' on an 4 bytes 1910 __ andcc(end_to, 3, G0); 1911 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1912 __ delayed()->lduh(end_from, -2, O3); 1913 __ dec(end_from, 2); 1914 __ dec(end_to, 2); 1915 __ dec(count); 1916 __ sth(O3, end_to, 0); 1917 __ BIND(L_skip_alignment); 1918 1919 // copy 2 elements to align 'end_to' on an 8 byte boundary 1920 __ andcc(end_to, 7, G0); 1921 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1922 __ delayed()->lduh(end_from, -2, O3); 1923 __ dec(count, 2); 1924 __ lduh(end_from, -4, O4); 1925 __ dec(end_from, 4); 1926 __ dec(end_to, 4); 1927 __ sth(O3, end_to, 2); 1928 __ sth(O4, end_to, 0); 1929 __ BIND(L_skip_alignment2); 1930 } 1931 #ifdef _LP64 1932 if (aligned) { 1933 // Both arrays are aligned to 8-bytes in 64-bits VM. 1934 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 1935 // in unaligned case. 1936 __ dec(count, 8); 1937 } else 1938 #endif 1939 { 1940 // Copy with shift 16 bytes per iteration if arrays do not have 1941 // the same alignment mod 8, otherwise jump to the next 1942 // code for aligned copy (and substracting 8 from 'count' before jump). 1943 // The compare above (count >= 11) guarantes 'count' >= 16 bytes. 1944 // Also jump over aligned copy after the copy with shift completed. 1945 1946 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1947 L_aligned_copy, L_copy_2_bytes); 1948 } 1949 // copy 4 elements (16 bytes) at a time 1950 __ align(OptoLoopAlignment); 1951 __ BIND(L_aligned_copy); 1952 __ dec(end_from, 16); 1953 __ ldx(end_from, 8, O3); 1954 __ ldx(end_from, 0, O4); 1955 __ dec(end_to, 16); 1956 __ deccc(count, 8); 1957 __ stx(O3, end_to, 8); 1958 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1959 __ delayed()->stx(O4, end_to, 0); 1960 __ inc(count, 8); 1961 1962 // copy 1 element (2 bytes) at a time 1963 __ BIND(L_copy_2_bytes); 1964 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1965 __ BIND(L_copy_2_bytes_loop); 1966 __ dec(end_from, 2); 1967 __ dec(end_to, 2); 1968 __ lduh(end_from, 0, O4); 1969 __ deccc(count); 1970 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 1971 __ delayed()->sth(O4, end_to, 0); 1972 1973 __ BIND(L_exit); 1974 // O3, O4 are used as temp registers 1975 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1976 __ retl(); 1977 __ delayed()->mov(G0, O0); // return 0 1978 return start; 1979 } 1980 1981 // 1982 // Helper methods for generate_disjoint_int_copy_core() 1983 // 1984 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 1985 Label& L_loop, bool use_prefetch, bool use_bis) { 1986 1987 __ align(OptoLoopAlignment); 1988 __ BIND(L_loop); 1989 if (use_prefetch) { 1990 if (ArraycopySrcPrefetchDistance > 0) { 1991 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1992 } 1993 if (ArraycopyDstPrefetchDistance > 0) { 1994 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1995 } 1996 } 1997 __ ldx(from, 4, O4); 1998 __ ldx(from, 12, G4); 1999 __ inc(to, 16); 2000 __ inc(from, 16); 2001 __ deccc(count, 4); // Can we do next iteration after this one? 2002 2003 __ srlx(O4, 32, G3); 2004 __ bset(G3, O3); 2005 __ sllx(O4, 32, O4); 2006 __ srlx(G4, 32, G3); 2007 __ bset(G3, O4); 2008 if (use_bis) { 2009 __ stxa(O3, to, -16); 2010 __ stxa(O4, to, -8); 2011 } else { 2012 __ stx(O3, to, -16); 2013 __ stx(O4, to, -8); 2014 } 2015 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2016 __ delayed()->sllx(G4, 32, O3); 2017 2018 } 2019 2020 // 2021 // Generate core code for disjoint int copy (and oop copy on 32-bit). 2022 // If "aligned" is true, the "from" and "to" addresses are assumed 2023 // to be heapword aligned. 2024 // 2025 // Arguments: 2026 // from: O0 2027 // to: O1 2028 // count: O2 treated as signed 2029 // 2030 void generate_disjoint_int_copy_core(bool aligned) { 2031 2032 Label L_skip_alignment, L_aligned_copy; 2033 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2034 2035 const Register from = O0; // source array address 2036 const Register to = O1; // destination array address 2037 const Register count = O2; // elements count 2038 const Register offset = O5; // offset from start of arrays 2039 // O3, O4, G3, G4 are used as temp registers 2040 2041 // 'aligned' == true when it is known statically during compilation 2042 // of this arraycopy call site that both 'from' and 'to' addresses 2043 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 2044 // 2045 // Aligned arrays have 4 bytes alignment in 32-bits VM 2046 // and 8 bytes - in 64-bits VM. 2047 // 2048 #ifdef _LP64 2049 if (!aligned) 2050 #endif 2051 { 2052 // The next check could be put under 'ifndef' since the code in 2053 // generate_disjoint_long_copy_core() has own checks and set 'offset'. 2054 2055 // for short arrays, just do single element copy 2056 __ cmp(count, 5); // 4 + 1 (20 bytes) 2057 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2058 __ delayed()->mov(G0, offset); 2059 2060 // copy 1 element to align 'to' on an 8 byte boundary 2061 __ andcc(to, 7, G0); 2062 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2063 __ delayed()->ld(from, 0, O3); 2064 __ inc(from, 4); 2065 __ inc(to, 4); 2066 __ dec(count); 2067 __ st(O3, to, -4); 2068 __ BIND(L_skip_alignment); 2069 2070 // if arrays have same alignment mod 8, do 4 elements copy 2071 __ andcc(from, 7, G0); 2072 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2073 __ delayed()->ld(from, 0, O3); 2074 2075 // 2076 // Load 2 aligned 8-bytes chunks and use one from previous iteration 2077 // to form 2 aligned 8-bytes chunks to store. 2078 // 2079 // copy_16_bytes_forward_with_shift() is not used here since this 2080 // code is more optimal. 2081 2082 // copy with shift 4 elements (16 bytes) at a time 2083 __ dec(count, 4); // The cmp at the beginning guaranty count >= 4 2084 __ sllx(O3, 32, O3); 2085 2086 disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop); 2087 2088 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2089 __ delayed()->inc(count, 4); // restore 'count' 2090 2091 __ BIND(L_aligned_copy); 2092 } // !aligned 2093 2094 // copy 4 elements (16 bytes) at a time 2095 __ and3(count, 1, G4); // Save 2096 __ srl(count, 1, count); 2097 generate_disjoint_long_copy_core(aligned); 2098 __ mov(G4, count); // Restore 2099 2100 // copy 1 element at a time 2101 __ BIND(L_copy_4_bytes); 2102 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2103 __ BIND(L_copy_4_bytes_loop); 2104 __ ld(from, offset, O3); 2105 __ deccc(count); 2106 __ st(O3, to, offset); 2107 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 2108 __ delayed()->inc(offset, 4); 2109 __ BIND(L_exit); 2110 } 2111 2112 // 2113 // Generate stub for disjoint int copy. If "aligned" is true, the 2114 // "from" and "to" addresses are assumed to be heapword aligned. 2115 // 2116 // Arguments for generated stub: 2117 // from: O0 2118 // to: O1 2119 // count: O2 treated as signed 2120 // 2121 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 2122 __ align(CodeEntryAlignment); 2123 StubCodeMark mark(this, "StubRoutines", name); 2124 address start = __ pc(); 2125 2126 const Register count = O2; 2127 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2128 2129 if (entry != NULL) { 2130 *entry = __ pc(); 2131 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2132 BLOCK_COMMENT("Entry:"); 2133 } 2134 2135 generate_disjoint_int_copy_core(aligned); 2136 2137 // O3, O4 are used as temp registers 2138 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2139 __ retl(); 2140 __ delayed()->mov(G0, O0); // return 0 2141 return start; 2142 } 2143 2144 // 2145 // Generate core code for conjoint int copy (and oop copy on 32-bit). 2146 // If "aligned" is true, the "from" and "to" addresses are assumed 2147 // to be heapword aligned. 2148 // 2149 // Arguments: 2150 // from: O0 2151 // to: O1 2152 // count: O2 treated as signed 2153 // 2154 void generate_conjoint_int_copy_core(bool aligned) { 2155 // Do reverse copy. 2156 2157 Label L_skip_alignment, L_aligned_copy; 2158 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2159 2160 const Register from = O0; // source array address 2161 const Register to = O1; // destination array address 2162 const Register count = O2; // elements count 2163 const Register end_from = from; // source array end address 2164 const Register end_to = to; // destination array end address 2165 // O3, O4, O5, G3 are used as temp registers 2166 2167 const Register byte_count = O3; // bytes count to copy 2168 2169 __ sllx(count, LogBytesPerInt, byte_count); 2170 __ add(to, byte_count, end_to); // offset after last copied element 2171 2172 __ cmp(count, 5); // for short arrays, just do single element copy 2173 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2174 __ delayed()->add(from, byte_count, end_from); 2175 2176 // copy 1 element to align 'to' on an 8 byte boundary 2177 __ andcc(end_to, 7, G0); 2178 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2179 __ delayed()->nop(); 2180 __ dec(count); 2181 __ dec(end_from, 4); 2182 __ dec(end_to, 4); 2183 __ ld(end_from, 0, O4); 2184 __ st(O4, end_to, 0); 2185 __ BIND(L_skip_alignment); 2186 2187 // Check if 'end_from' and 'end_to' has the same alignment. 2188 __ andcc(end_from, 7, G0); 2189 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2190 __ delayed()->dec(count, 4); // The cmp at the start guaranty cnt >= 4 2191 2192 // copy with shift 4 elements (16 bytes) at a time 2193 // 2194 // Load 2 aligned 8-bytes chunks and use one from previous iteration 2195 // to form 2 aligned 8-bytes chunks to store. 2196 // 2197 __ ldx(end_from, -4, O3); 2198 __ align(OptoLoopAlignment); 2199 __ BIND(L_copy_16_bytes); 2200 __ ldx(end_from, -12, O4); 2201 __ deccc(count, 4); 2202 __ ldx(end_from, -20, O5); 2203 __ dec(end_to, 16); 2204 __ dec(end_from, 16); 2205 __ srlx(O3, 32, O3); 2206 __ sllx(O4, 32, G3); 2207 __ bset(G3, O3); 2208 __ stx(O3, end_to, 8); 2209 __ srlx(O4, 32, O4); 2210 __ sllx(O5, 32, G3); 2211 __ bset(O4, G3); 2212 __ stx(G3, end_to, 0); 2213 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2214 __ delayed()->mov(O5, O3); 2215 2216 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2217 __ delayed()->inc(count, 4); 2218 2219 // copy 4 elements (16 bytes) at a time 2220 __ align(OptoLoopAlignment); 2221 __ BIND(L_aligned_copy); 2222 __ dec(end_from, 16); 2223 __ ldx(end_from, 8, O3); 2224 __ ldx(end_from, 0, O4); 2225 __ dec(end_to, 16); 2226 __ deccc(count, 4); 2227 __ stx(O3, end_to, 8); 2228 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2229 __ delayed()->stx(O4, end_to, 0); 2230 __ inc(count, 4); 2231 2232 // copy 1 element (4 bytes) at a time 2233 __ BIND(L_copy_4_bytes); 2234 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2235 __ BIND(L_copy_4_bytes_loop); 2236 __ dec(end_from, 4); 2237 __ dec(end_to, 4); 2238 __ ld(end_from, 0, O4); 2239 __ deccc(count); 2240 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 2241 __ delayed()->st(O4, end_to, 0); 2242 __ BIND(L_exit); 2243 } 2244 2245 // 2246 // Generate stub for conjoint int copy. If "aligned" is true, the 2247 // "from" and "to" addresses are assumed to be heapword aligned. 2248 // 2249 // Arguments for generated stub: 2250 // from: O0 2251 // to: O1 2252 // count: O2 treated as signed 2253 // 2254 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 2255 address *entry, const char *name) { 2256 __ align(CodeEntryAlignment); 2257 StubCodeMark mark(this, "StubRoutines", name); 2258 address start = __ pc(); 2259 2260 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2261 2262 if (entry != NULL) { 2263 *entry = __ pc(); 2264 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2265 BLOCK_COMMENT("Entry:"); 2266 } 2267 2268 array_overlap_test(nooverlap_target, 2); 2269 2270 generate_conjoint_int_copy_core(aligned); 2271 2272 // O3, O4 are used as temp registers 2273 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2274 __ retl(); 2275 __ delayed()->mov(G0, O0); // return 0 2276 return start; 2277 } 2278 2279 // 2280 // Helper methods for generate_disjoint_long_copy_core() 2281 // 2282 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2283 Label& L_loop, bool use_prefetch, bool use_bis) { 2284 __ align(OptoLoopAlignment); 2285 __ BIND(L_loop); 2286 for (int off = 0; off < 64; off += 16) { 2287 if (use_prefetch && (off & 31) == 0) { 2288 if (ArraycopySrcPrefetchDistance > 0) { 2289 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 2290 } 2291 if (ArraycopyDstPrefetchDistance > 0) { 2292 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 2293 } 2294 } 2295 __ ldx(from, off+0, O4); 2296 __ ldx(from, off+8, O5); 2297 if (use_bis) { 2298 __ stxa(O4, to, off+0); 2299 __ stxa(O5, to, off+8); 2300 } else { 2301 __ stx(O4, to, off+0); 2302 __ stx(O5, to, off+8); 2303 } 2304 } 2305 __ deccc(count, 8); 2306 __ inc(from, 64); 2307 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2308 __ delayed()->inc(to, 64); 2309 } 2310 2311 // 2312 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2313 // "aligned" is ignored, because we must make the stronger 2314 // assumption that both addresses are always 64-bit aligned. 2315 // 2316 // Arguments: 2317 // from: O0 2318 // to: O1 2319 // count: O2 treated as signed 2320 // 2321 // count -= 2; 2322 // if ( count >= 0 ) { // >= 2 elements 2323 // if ( count > 6) { // >= 8 elements 2324 // count -= 6; // original count - 8 2325 // do { 2326 // copy_8_elements; 2327 // count -= 8; 2328 // } while ( count >= 0 ); 2329 // count += 6; 2330 // } 2331 // if ( count >= 0 ) { // >= 2 elements 2332 // do { 2333 // copy_2_elements; 2334 // } while ( (count=count-2) >= 0 ); 2335 // } 2336 // } 2337 // count += 2; 2338 // if ( count != 0 ) { // 1 element left 2339 // copy_1_element; 2340 // } 2341 // 2342 void generate_disjoint_long_copy_core(bool aligned) { 2343 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2344 const Register from = O0; // source array address 2345 const Register to = O1; // destination array address 2346 const Register count = O2; // elements count 2347 const Register offset0 = O4; // element offset 2348 const Register offset8 = O5; // next element offset 2349 2350 __ deccc(count, 2); 2351 __ mov(G0, offset0); // offset from start of arrays (0) 2352 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2353 __ delayed()->add(offset0, 8, offset8); 2354 2355 // Copy by 64 bytes chunks 2356 2357 const Register from64 = O3; // source address 2358 const Register to64 = G3; // destination address 2359 __ subcc(count, 6, O3); 2360 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2361 __ delayed()->mov(to, to64); 2362 // Now we can use O4(offset0), O5(offset8) as temps 2363 __ mov(O3, count); 2364 // count >= 0 (original count - 8) 2365 __ mov(from, from64); 2366 2367 disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop); 2368 2369 // Restore O4(offset0), O5(offset8) 2370 __ sub(from64, from, offset0); 2371 __ inccc(count, 6); // restore count 2372 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2373 __ delayed()->add(offset0, 8, offset8); 2374 2375 // Copy by 16 bytes chunks 2376 __ align(OptoLoopAlignment); 2377 __ BIND(L_copy_16_bytes); 2378 __ ldx(from, offset0, O3); 2379 __ ldx(from, offset8, G3); 2380 __ deccc(count, 2); 2381 __ stx(O3, to, offset0); 2382 __ inc(offset0, 16); 2383 __ stx(G3, to, offset8); 2384 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2385 __ delayed()->inc(offset8, 16); 2386 2387 // Copy last 8 bytes 2388 __ BIND(L_copy_8_bytes); 2389 __ inccc(count, 2); 2390 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2391 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 2392 __ ldx(from, offset0, O3); 2393 __ stx(O3, to, offset0); 2394 __ BIND(L_exit); 2395 } 2396 2397 // 2398 // Generate stub for disjoint long copy. 2399 // "aligned" is ignored, because we must make the stronger 2400 // assumption that both addresses are always 64-bit aligned. 2401 // 2402 // Arguments for generated stub: 2403 // from: O0 2404 // to: O1 2405 // count: O2 treated as signed 2406 // 2407 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2408 __ align(CodeEntryAlignment); 2409 StubCodeMark mark(this, "StubRoutines", name); 2410 address start = __ pc(); 2411 2412 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2413 2414 if (entry != NULL) { 2415 *entry = __ pc(); 2416 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2417 BLOCK_COMMENT("Entry:"); 2418 } 2419 2420 generate_disjoint_long_copy_core(aligned); 2421 2422 // O3, O4 are used as temp registers 2423 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2424 __ retl(); 2425 __ delayed()->mov(G0, O0); // return 0 2426 return start; 2427 } 2428 2429 // 2430 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2431 // "aligned" is ignored, because we must make the stronger 2432 // assumption that both addresses are always 64-bit aligned. 2433 // 2434 // Arguments: 2435 // from: O0 2436 // to: O1 2437 // count: O2 treated as signed 2438 // 2439 void generate_conjoint_long_copy_core(bool aligned) { 2440 // Do reverse copy. 2441 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2442 const Register from = O0; // source array address 2443 const Register to = O1; // destination array address 2444 const Register count = O2; // elements count 2445 const Register offset8 = O4; // element offset 2446 const Register offset0 = O5; // previous element offset 2447 2448 __ subcc(count, 1, count); 2449 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2450 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2451 __ sub(offset8, 8, offset0); 2452 __ align(OptoLoopAlignment); 2453 __ BIND(L_copy_16_bytes); 2454 __ ldx(from, offset8, O2); 2455 __ ldx(from, offset0, O3); 2456 __ stx(O2, to, offset8); 2457 __ deccc(offset8, 16); // use offset8 as counter 2458 __ stx(O3, to, offset0); 2459 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2460 __ delayed()->dec(offset0, 16); 2461 2462 __ BIND(L_copy_8_bytes); 2463 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2464 __ delayed()->nop(); 2465 __ ldx(from, 0, O3); 2466 __ stx(O3, to, 0); 2467 __ BIND(L_exit); 2468 } 2469 2470 // Generate stub for conjoint long copy. 2471 // "aligned" is ignored, because we must make the stronger 2472 // assumption that both addresses are always 64-bit aligned. 2473 // 2474 // Arguments for generated stub: 2475 // from: O0 2476 // to: O1 2477 // count: O2 treated as signed 2478 // 2479 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2480 address *entry, const char *name) { 2481 __ align(CodeEntryAlignment); 2482 StubCodeMark mark(this, "StubRoutines", name); 2483 address start = __ pc(); 2484 2485 assert(aligned, "Should always be aligned"); 2486 2487 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2488 2489 if (entry != NULL) { 2490 *entry = __ pc(); 2491 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2492 BLOCK_COMMENT("Entry:"); 2493 } 2494 2495 array_overlap_test(nooverlap_target, 3); 2496 2497 generate_conjoint_long_copy_core(aligned); 2498 2499 // O3, O4 are used as temp registers 2500 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2501 __ retl(); 2502 __ delayed()->mov(G0, O0); // return 0 2503 return start; 2504 } 2505 2506 // Generate stub for disjoint oop copy. If "aligned" is true, the 2507 // "from" and "to" addresses are assumed to be heapword aligned. 2508 // 2509 // Arguments for generated stub: 2510 // from: O0 2511 // to: O1 2512 // count: O2 treated as signed 2513 // 2514 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2515 bool dest_uninitialized = false) { 2516 2517 const Register from = O0; // source array address 2518 const Register to = O1; // destination array address 2519 const Register count = O2; // elements count 2520 2521 __ align(CodeEntryAlignment); 2522 StubCodeMark mark(this, "StubRoutines", name); 2523 address start = __ pc(); 2524 2525 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2526 2527 if (entry != NULL) { 2528 *entry = __ pc(); 2529 // caller can pass a 64-bit byte count here 2530 BLOCK_COMMENT("Entry:"); 2531 } 2532 2533 // save arguments for barrier generation 2534 __ mov(to, G1); 2535 __ mov(count, G5); 2536 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2537 #ifdef _LP64 2538 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2539 if (UseCompressedOops) { 2540 generate_disjoint_int_copy_core(aligned); 2541 } else { 2542 generate_disjoint_long_copy_core(aligned); 2543 } 2544 #else 2545 generate_disjoint_int_copy_core(aligned); 2546 #endif 2547 // O0 is used as temp register 2548 gen_write_ref_array_post_barrier(G1, G5, O0); 2549 2550 // O3, O4 are used as temp registers 2551 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2552 __ retl(); 2553 __ delayed()->mov(G0, O0); // return 0 2554 return start; 2555 } 2556 2557 // Generate stub for conjoint oop copy. If "aligned" is true, the 2558 // "from" and "to" addresses are assumed to be heapword aligned. 2559 // 2560 // Arguments for generated stub: 2561 // from: O0 2562 // to: O1 2563 // count: O2 treated as signed 2564 // 2565 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2566 address *entry, const char *name, 2567 bool dest_uninitialized = false) { 2568 2569 const Register from = O0; // source array address 2570 const Register to = O1; // destination array address 2571 const Register count = O2; // elements count 2572 2573 __ align(CodeEntryAlignment); 2574 StubCodeMark mark(this, "StubRoutines", name); 2575 address start = __ pc(); 2576 2577 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2578 2579 if (entry != NULL) { 2580 *entry = __ pc(); 2581 // caller can pass a 64-bit byte count here 2582 BLOCK_COMMENT("Entry:"); 2583 } 2584 2585 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2586 2587 // save arguments for barrier generation 2588 __ mov(to, G1); 2589 __ mov(count, G5); 2590 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2591 2592 #ifdef _LP64 2593 if (UseCompressedOops) { 2594 generate_conjoint_int_copy_core(aligned); 2595 } else { 2596 generate_conjoint_long_copy_core(aligned); 2597 } 2598 #else 2599 generate_conjoint_int_copy_core(aligned); 2600 #endif 2601 2602 // O0 is used as temp register 2603 gen_write_ref_array_post_barrier(G1, G5, O0); 2604 2605 // O3, O4 are used as temp registers 2606 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2607 __ retl(); 2608 __ delayed()->mov(G0, O0); // return 0 2609 return start; 2610 } 2611 2612 2613 // Helper for generating a dynamic type check. 2614 // Smashes only the given temp registers. 2615 void generate_type_check(Register sub_klass, 2616 Register super_check_offset, 2617 Register super_klass, 2618 Register temp, 2619 Label& L_success) { 2620 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2621 2622 BLOCK_COMMENT("type_check:"); 2623 2624 Label L_miss, L_pop_to_miss; 2625 2626 assert_clean_int(super_check_offset, temp); 2627 2628 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2629 &L_success, &L_miss, NULL, 2630 super_check_offset); 2631 2632 BLOCK_COMMENT("type_check_slow_path:"); 2633 __ save_frame(0); 2634 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2635 super_klass->after_save(), 2636 L0, L1, L2, L4, 2637 NULL, &L_pop_to_miss); 2638 __ ba(L_success); 2639 __ delayed()->restore(); 2640 2641 __ bind(L_pop_to_miss); 2642 __ restore(); 2643 2644 // Fall through on failure! 2645 __ BIND(L_miss); 2646 } 2647 2648 2649 // Generate stub for checked oop copy. 2650 // 2651 // Arguments for generated stub: 2652 // from: O0 2653 // to: O1 2654 // count: O2 treated as signed 2655 // ckoff: O3 (super_check_offset) 2656 // ckval: O4 (super_klass) 2657 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2658 // 2659 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2660 2661 const Register O0_from = O0; // source array address 2662 const Register O1_to = O1; // destination array address 2663 const Register O2_count = O2; // elements count 2664 const Register O3_ckoff = O3; // super_check_offset 2665 const Register O4_ckval = O4; // super_klass 2666 2667 const Register O5_offset = O5; // loop var, with stride wordSize 2668 const Register G1_remain = G1; // loop var, with stride -1 2669 const Register G3_oop = G3; // actual oop copied 2670 const Register G4_klass = G4; // oop._klass 2671 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2672 2673 __ align(CodeEntryAlignment); 2674 StubCodeMark mark(this, "StubRoutines", name); 2675 address start = __ pc(); 2676 2677 #ifdef ASSERT 2678 // We sometimes save a frame (see generate_type_check below). 2679 // If this will cause trouble, let's fail now instead of later. 2680 __ save_frame(0); 2681 __ restore(); 2682 #endif 2683 2684 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 2685 2686 #ifdef ASSERT 2687 // caller guarantees that the arrays really are different 2688 // otherwise, we would have to make conjoint checks 2689 { Label L; 2690 __ mov(O3, G1); // spill: overlap test smashes O3 2691 __ mov(O4, G4); // spill: overlap test smashes O4 2692 array_overlap_test(L, LogBytesPerHeapOop); 2693 __ stop("checkcast_copy within a single array"); 2694 __ bind(L); 2695 __ mov(G1, O3); 2696 __ mov(G4, O4); 2697 } 2698 #endif //ASSERT 2699 2700 if (entry != NULL) { 2701 *entry = __ pc(); 2702 // caller can pass a 64-bit byte count here (from generic stub) 2703 BLOCK_COMMENT("Entry:"); 2704 } 2705 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 2706 2707 Label load_element, store_element, do_card_marks, fail, done; 2708 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2709 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2710 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2711 2712 // Empty array: Nothing to do. 2713 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2714 __ retl(); 2715 __ delayed()->set(0, O0); // return 0 on (trivial) success 2716 2717 // ======== begin loop ======== 2718 // (Loop is rotated; its entry is load_element.) 2719 // Loop variables: 2720 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2721 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2722 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2723 __ align(OptoLoopAlignment); 2724 2725 __ BIND(store_element); 2726 __ deccc(G1_remain); // decrement the count 2727 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2728 __ inc(O5_offset, heapOopSize); // step to next offset 2729 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 2730 __ delayed()->set(0, O0); // return -1 on success 2731 2732 // ======== loop entry is here ======== 2733 __ BIND(load_element); 2734 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2735 __ br_null_short(G3_oop, Assembler::pt, store_element); 2736 2737 __ load_klass(G3_oop, G4_klass); // query the object klass 2738 2739 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2740 // branch to this on success: 2741 store_element); 2742 // ======== end loop ======== 2743 2744 // It was a real error; we must depend on the caller to finish the job. 2745 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2746 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2747 // and report their number to the caller. 2748 __ BIND(fail); 2749 __ subcc(O2_count, G1_remain, O2_count); 2750 __ brx(Assembler::zero, false, Assembler::pt, done); 2751 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2752 2753 __ BIND(do_card_marks); 2754 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 2755 2756 __ BIND(done); 2757 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2758 __ retl(); 2759 __ delayed()->nop(); // return value in 00 2760 2761 return start; 2762 } 2763 2764 2765 // Generate 'unsafe' array copy stub 2766 // Though just as safe as the other stubs, it takes an unscaled 2767 // size_t argument instead of an element count. 2768 // 2769 // Arguments for generated stub: 2770 // from: O0 2771 // to: O1 2772 // count: O2 byte count, treated as ssize_t, can be zero 2773 // 2774 // Examines the alignment of the operands and dispatches 2775 // to a long, int, short, or byte copy loop. 2776 // 2777 address generate_unsafe_copy(const char* name, 2778 address byte_copy_entry, 2779 address short_copy_entry, 2780 address int_copy_entry, 2781 address long_copy_entry) { 2782 2783 const Register O0_from = O0; // source array address 2784 const Register O1_to = O1; // destination array address 2785 const Register O2_count = O2; // elements count 2786 2787 const Register G1_bits = G1; // test copy of low bits 2788 2789 __ align(CodeEntryAlignment); 2790 StubCodeMark mark(this, "StubRoutines", name); 2791 address start = __ pc(); 2792 2793 // bump this on entry, not on exit: 2794 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2795 2796 __ or3(O0_from, O1_to, G1_bits); 2797 __ or3(O2_count, G1_bits, G1_bits); 2798 2799 __ btst(BytesPerLong-1, G1_bits); 2800 __ br(Assembler::zero, true, Assembler::pt, 2801 long_copy_entry, relocInfo::runtime_call_type); 2802 // scale the count on the way out: 2803 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2804 2805 __ btst(BytesPerInt-1, G1_bits); 2806 __ br(Assembler::zero, true, Assembler::pt, 2807 int_copy_entry, relocInfo::runtime_call_type); 2808 // scale the count on the way out: 2809 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2810 2811 __ btst(BytesPerShort-1, G1_bits); 2812 __ br(Assembler::zero, true, Assembler::pt, 2813 short_copy_entry, relocInfo::runtime_call_type); 2814 // scale the count on the way out: 2815 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2816 2817 __ br(Assembler::always, false, Assembler::pt, 2818 byte_copy_entry, relocInfo::runtime_call_type); 2819 __ delayed()->nop(); 2820 2821 return start; 2822 } 2823 2824 2825 // Perform range checks on the proposed arraycopy. 2826 // Kills the two temps, but nothing else. 2827 // Also, clean the sign bits of src_pos and dst_pos. 2828 void arraycopy_range_checks(Register src, // source array oop (O0) 2829 Register src_pos, // source position (O1) 2830 Register dst, // destination array oo (O2) 2831 Register dst_pos, // destination position (O3) 2832 Register length, // length of copy (O4) 2833 Register temp1, Register temp2, 2834 Label& L_failed) { 2835 BLOCK_COMMENT("arraycopy_range_checks:"); 2836 2837 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2838 2839 const Register array_length = temp1; // scratch 2840 const Register end_pos = temp2; // scratch 2841 2842 // Note: This next instruction may be in the delay slot of a branch: 2843 __ add(length, src_pos, end_pos); // src_pos + length 2844 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2845 __ cmp(end_pos, array_length); 2846 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2847 2848 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2849 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2850 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2851 __ cmp(end_pos, array_length); 2852 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2853 2854 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2855 // Move with sign extension can be used since they are positive. 2856 __ delayed()->signx(src_pos, src_pos); 2857 __ signx(dst_pos, dst_pos); 2858 2859 BLOCK_COMMENT("arraycopy_range_checks done"); 2860 } 2861 2862 2863 // 2864 // Generate generic array copy stubs 2865 // 2866 // Input: 2867 // O0 - src oop 2868 // O1 - src_pos 2869 // O2 - dst oop 2870 // O3 - dst_pos 2871 // O4 - element count 2872 // 2873 // Output: 2874 // O0 == 0 - success 2875 // O0 == -1 - need to call System.arraycopy 2876 // 2877 address generate_generic_copy(const char *name, 2878 address entry_jbyte_arraycopy, 2879 address entry_jshort_arraycopy, 2880 address entry_jint_arraycopy, 2881 address entry_oop_arraycopy, 2882 address entry_jlong_arraycopy, 2883 address entry_checkcast_arraycopy) { 2884 Label L_failed, L_objArray; 2885 2886 // Input registers 2887 const Register src = O0; // source array oop 2888 const Register src_pos = O1; // source position 2889 const Register dst = O2; // destination array oop 2890 const Register dst_pos = O3; // destination position 2891 const Register length = O4; // elements count 2892 2893 // registers used as temp 2894 const Register G3_src_klass = G3; // source array klass 2895 const Register G4_dst_klass = G4; // destination array klass 2896 const Register G5_lh = G5; // layout handler 2897 const Register O5_temp = O5; 2898 2899 __ align(CodeEntryAlignment); 2900 StubCodeMark mark(this, "StubRoutines", name); 2901 address start = __ pc(); 2902 2903 // bump this on entry, not on exit: 2904 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2905 2906 // In principle, the int arguments could be dirty. 2907 //assert_clean_int(src_pos, G1); 2908 //assert_clean_int(dst_pos, G1); 2909 //assert_clean_int(length, G1); 2910 2911 //----------------------------------------------------------------------- 2912 // Assembler stubs will be used for this call to arraycopy 2913 // if the following conditions are met: 2914 // 2915 // (1) src and dst must not be null. 2916 // (2) src_pos must not be negative. 2917 // (3) dst_pos must not be negative. 2918 // (4) length must not be negative. 2919 // (5) src klass and dst klass should be the same and not NULL. 2920 // (6) src and dst should be arrays. 2921 // (7) src_pos + length must not exceed length of src. 2922 // (8) dst_pos + length must not exceed length of dst. 2923 BLOCK_COMMENT("arraycopy initial argument checks"); 2924 2925 // if (src == NULL) return -1; 2926 __ br_null(src, false, Assembler::pn, L_failed); 2927 2928 // if (src_pos < 0) return -1; 2929 __ delayed()->tst(src_pos); 2930 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2931 __ delayed()->nop(); 2932 2933 // if (dst == NULL) return -1; 2934 __ br_null(dst, false, Assembler::pn, L_failed); 2935 2936 // if (dst_pos < 0) return -1; 2937 __ delayed()->tst(dst_pos); 2938 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2939 2940 // if (length < 0) return -1; 2941 __ delayed()->tst(length); 2942 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2943 2944 BLOCK_COMMENT("arraycopy argument klass checks"); 2945 // get src->klass() 2946 if (UseCompressedClassPointers) { 2947 __ delayed()->nop(); // ??? not good 2948 __ load_klass(src, G3_src_klass); 2949 } else { 2950 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 2951 } 2952 2953 #ifdef ASSERT 2954 // assert(src->klass() != NULL); 2955 BLOCK_COMMENT("assert klasses not null"); 2956 { Label L_a, L_b; 2957 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 2958 __ bind(L_a); 2959 __ stop("broken null klass"); 2960 __ bind(L_b); 2961 __ load_klass(dst, G4_dst_klass); 2962 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 2963 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 2964 BLOCK_COMMENT("assert done"); 2965 } 2966 #endif 2967 2968 // Load layout helper 2969 // 2970 // |array_tag| | header_size | element_type | |log2_element_size| 2971 // 32 30 24 16 8 2 0 2972 // 2973 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2974 // 2975 2976 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2977 2978 // Load 32-bits signed value. Use br() instruction with it to check icc. 2979 __ lduw(G3_src_klass, lh_offset, G5_lh); 2980 2981 if (UseCompressedClassPointers) { 2982 __ load_klass(dst, G4_dst_klass); 2983 } 2984 // Handle objArrays completely differently... 2985 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2986 __ set(objArray_lh, O5_temp); 2987 __ cmp(G5_lh, O5_temp); 2988 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 2989 if (UseCompressedClassPointers) { 2990 __ delayed()->nop(); 2991 } else { 2992 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 2993 } 2994 2995 // if (src->klass() != dst->klass()) return -1; 2996 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 2997 2998 // if (!src->is_Array()) return -1; 2999 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 3000 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 3001 3002 // At this point, it is known to be a typeArray (array_tag 0x3). 3003 #ifdef ASSERT 3004 __ delayed()->nop(); 3005 { Label L; 3006 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 3007 __ set(lh_prim_tag_in_place, O5_temp); 3008 __ cmp(G5_lh, O5_temp); 3009 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 3010 __ delayed()->nop(); 3011 __ stop("must be a primitive array"); 3012 __ bind(L); 3013 } 3014 #else 3015 __ delayed(); // match next insn to prev branch 3016 #endif 3017 3018 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3019 O5_temp, G4_dst_klass, L_failed); 3020 3021 // TypeArrayKlass 3022 // 3023 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3024 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3025 // 3026 3027 const Register G4_offset = G4_dst_klass; // array offset 3028 const Register G3_elsize = G3_src_klass; // log2 element size 3029 3030 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 3031 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 3032 __ add(src, G4_offset, src); // src array offset 3033 __ add(dst, G4_offset, dst); // dst array offset 3034 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 3035 3036 // next registers should be set before the jump to corresponding stub 3037 const Register from = O0; // source array address 3038 const Register to = O1; // destination array address 3039 const Register count = O2; // elements count 3040 3041 // 'from', 'to', 'count' registers should be set in this order 3042 // since they are the same as 'src', 'src_pos', 'dst'. 3043 3044 BLOCK_COMMENT("scale indexes to element size"); 3045 __ sll_ptr(src_pos, G3_elsize, src_pos); 3046 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 3047 __ add(src, src_pos, from); // src_addr 3048 __ add(dst, dst_pos, to); // dst_addr 3049 3050 BLOCK_COMMENT("choose copy loop based on element size"); 3051 __ cmp(G3_elsize, 0); 3052 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 3053 __ delayed()->signx(length, count); // length 3054 3055 __ cmp(G3_elsize, LogBytesPerShort); 3056 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 3057 __ delayed()->signx(length, count); // length 3058 3059 __ cmp(G3_elsize, LogBytesPerInt); 3060 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 3061 __ delayed()->signx(length, count); // length 3062 #ifdef ASSERT 3063 { Label L; 3064 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 3065 __ stop("must be long copy, but elsize is wrong"); 3066 __ bind(L); 3067 } 3068 #endif 3069 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 3070 __ delayed()->signx(length, count); // length 3071 3072 // ObjArrayKlass 3073 __ BIND(L_objArray); 3074 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 3075 3076 Label L_plain_copy, L_checkcast_copy; 3077 // test array classes for subtyping 3078 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 3079 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 3080 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 3081 3082 // Identically typed arrays can be copied without element-wise checks. 3083 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3084 O5_temp, G5_lh, L_failed); 3085 3086 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3087 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3088 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3089 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3090 __ add(src, src_pos, from); // src_addr 3091 __ add(dst, dst_pos, to); // dst_addr 3092 __ BIND(L_plain_copy); 3093 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 3094 __ delayed()->signx(length, count); // length 3095 3096 __ BIND(L_checkcast_copy); 3097 // live at this point: G3_src_klass, G4_dst_klass 3098 { 3099 // Before looking at dst.length, make sure dst is also an objArray. 3100 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 3101 __ cmp(G5_lh, O5_temp); 3102 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 3103 3104 // It is safe to examine both src.length and dst.length. 3105 __ delayed(); // match next insn to prev branch 3106 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3107 O5_temp, G5_lh, L_failed); 3108 3109 // Marshal the base address arguments now, freeing registers. 3110 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3111 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3112 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3113 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3114 __ add(src, src_pos, from); // src_addr 3115 __ add(dst, dst_pos, to); // dst_addr 3116 __ signx(length, count); // length (reloaded) 3117 3118 Register sco_temp = O3; // this register is free now 3119 assert_different_registers(from, to, count, sco_temp, 3120 G4_dst_klass, G3_src_klass); 3121 3122 // Generate the type check. 3123 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3124 __ lduw(G4_dst_klass, sco_offset, sco_temp); 3125 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 3126 O5_temp, L_plain_copy); 3127 3128 // Fetch destination element klass from the ObjArrayKlass header. 3129 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3130 3131 // the checkcast_copy loop needs two extra arguments: 3132 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 3133 // lduw(O4, sco_offset, O3); // sco of elem klass 3134 3135 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 3136 __ delayed()->lduw(O4, sco_offset, O3); 3137 } 3138 3139 __ BIND(L_failed); 3140 __ retl(); 3141 __ delayed()->sub(G0, 1, O0); // return -1 3142 return start; 3143 } 3144 3145 // 3146 // Generate stub for heap zeroing. 3147 // "to" address is aligned to jlong (8 bytes). 3148 // 3149 // Arguments for generated stub: 3150 // to: O0 3151 // count: O1 treated as signed (count of HeapWord) 3152 // count could be 0 3153 // 3154 address generate_zero_aligned_words(const char* name) { 3155 __ align(CodeEntryAlignment); 3156 StubCodeMark mark(this, "StubRoutines", name); 3157 address start = __ pc(); 3158 3159 const Register to = O0; // source array address 3160 const Register count = O1; // HeapWords count 3161 const Register temp = O2; // scratch 3162 3163 Label Ldone; 3164 __ sllx(count, LogHeapWordSize, count); // to bytes count 3165 // Use BIS for zeroing 3166 __ bis_zeroing(to, count, temp, Ldone); 3167 __ bind(Ldone); 3168 __ retl(); 3169 __ delayed()->nop(); 3170 return start; 3171 } 3172 3173 void generate_arraycopy_stubs() { 3174 address entry; 3175 address entry_jbyte_arraycopy; 3176 address entry_jshort_arraycopy; 3177 address entry_jint_arraycopy; 3178 address entry_oop_arraycopy; 3179 address entry_jlong_arraycopy; 3180 address entry_checkcast_arraycopy; 3181 3182 //*** jbyte 3183 // Always need aligned and unaligned versions 3184 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3185 "jbyte_disjoint_arraycopy"); 3186 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 3187 &entry_jbyte_arraycopy, 3188 "jbyte_arraycopy"); 3189 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 3190 "arrayof_jbyte_disjoint_arraycopy"); 3191 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 3192 "arrayof_jbyte_arraycopy"); 3193 3194 //*** jshort 3195 // Always need aligned and unaligned versions 3196 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3197 "jshort_disjoint_arraycopy"); 3198 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 3199 &entry_jshort_arraycopy, 3200 "jshort_arraycopy"); 3201 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 3202 "arrayof_jshort_disjoint_arraycopy"); 3203 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 3204 "arrayof_jshort_arraycopy"); 3205 3206 //*** jint 3207 // Aligned versions 3208 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 3209 "arrayof_jint_disjoint_arraycopy"); 3210 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 3211 "arrayof_jint_arraycopy"); 3212 #ifdef _LP64 3213 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 3214 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it). 3215 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 3216 "jint_disjoint_arraycopy"); 3217 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 3218 &entry_jint_arraycopy, 3219 "jint_arraycopy"); 3220 #else 3221 // In 32 bit jints are always HeapWordSize aligned, so always use the aligned version 3222 // (in fact in 32bit we always have a pre-loop part even in the aligned version, 3223 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored). 3224 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; 3225 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; 3226 #endif 3227 3228 3229 //*** jlong 3230 // It is always aligned 3231 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 3232 "arrayof_jlong_disjoint_arraycopy"); 3233 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 3234 "arrayof_jlong_arraycopy"); 3235 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 3236 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 3237 3238 3239 //*** oops 3240 // Aligned versions 3241 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 3242 "arrayof_oop_disjoint_arraycopy"); 3243 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 3244 "arrayof_oop_arraycopy"); 3245 // Aligned versions without pre-barriers 3246 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 3247 "arrayof_oop_disjoint_arraycopy_uninit", 3248 /*dest_uninitialized*/true); 3249 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 3250 "arrayof_oop_arraycopy_uninit", 3251 /*dest_uninitialized*/true); 3252 #ifdef _LP64 3253 if (UseCompressedOops) { 3254 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 3255 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 3256 "oop_disjoint_arraycopy"); 3257 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 3258 "oop_arraycopy"); 3259 // Unaligned versions without pre-barriers 3260 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 3261 "oop_disjoint_arraycopy_uninit", 3262 /*dest_uninitialized*/true); 3263 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3264 "oop_arraycopy_uninit", 3265 /*dest_uninitialized*/true); 3266 } else 3267 #endif 3268 { 3269 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3270 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3271 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3272 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3273 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3274 } 3275 3276 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3277 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3278 /*dest_uninitialized*/true); 3279 3280 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3281 entry_jbyte_arraycopy, 3282 entry_jshort_arraycopy, 3283 entry_jint_arraycopy, 3284 entry_jlong_arraycopy); 3285 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3286 entry_jbyte_arraycopy, 3287 entry_jshort_arraycopy, 3288 entry_jint_arraycopy, 3289 entry_oop_arraycopy, 3290 entry_jlong_arraycopy, 3291 entry_checkcast_arraycopy); 3292 3293 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3294 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3295 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3296 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3297 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3298 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3299 3300 if (UseBlockZeroing) { 3301 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 3302 } 3303 } 3304 3305 address generate_aescrypt_encryptBlock() { 3306 // required since we read expanded key 'int' array starting first element without alignment considerations 3307 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3308 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3309 __ align(CodeEntryAlignment); 3310 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3311 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 3312 address start = __ pc(); 3313 Register from = O0; // source byte array 3314 Register to = O1; // destination byte array 3315 Register key = O2; // expanded key array 3316 const Register keylen = O4; //reg for storing expanded key array length 3317 3318 // read expanded key length 3319 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3320 3321 // Method to address arbitrary alignment for load instructions: 3322 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 3323 // If zero/aligned then continue with double FP load instructions 3324 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 3325 // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 3326 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 3327 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 3328 3329 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3330 __ andcc(from, 7, G0); 3331 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3332 __ delayed()->alignaddr(from, G0, from); 3333 3334 // aligned case: load input into F54-F56 3335 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3336 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3337 __ ba_short(L_load_expanded_key); 3338 3339 __ BIND(L_load_misaligned_input); 3340 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3341 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3342 __ ldf(FloatRegisterImpl::D, from, 16, F58); 3343 __ faligndata(F54, F56, F54); 3344 __ faligndata(F56, F58, F56); 3345 3346 __ BIND(L_load_expanded_key); 3347 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 3348 for ( int i = 0; i <= 38; i += 2 ) { 3349 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 3350 } 3351 3352 // perform cipher transformation 3353 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3354 __ fxor(FloatRegisterImpl::D, F2, F56, F56); 3355 // rounds 1 through 8 3356 for ( int i = 4; i <= 28; i += 8 ) { 3357 __ aes_eround01(as_FloatRegister(i), F54, F56, F58); 3358 __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); 3359 __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); 3360 __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56); 3361 } 3362 __ aes_eround01(F36, F54, F56, F58); //round 9 3363 __ aes_eround23(F38, F54, F56, F60); 3364 3365 // 128-bit original key size 3366 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); 3367 3368 for ( int i = 40; i <= 50; i += 2 ) { 3369 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); 3370 } 3371 __ aes_eround01(F40, F58, F60, F54); //round 10 3372 __ aes_eround23(F42, F58, F60, F56); 3373 __ aes_eround01(F44, F54, F56, F58); //round 11 3374 __ aes_eround23(F46, F54, F56, F60); 3375 3376 // 192-bit original key size 3377 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); 3378 3379 __ ldf(FloatRegisterImpl::D, key, 208, F52); 3380 __ aes_eround01(F48, F58, F60, F54); //round 12 3381 __ aes_eround23(F50, F58, F60, F56); 3382 __ ldf(FloatRegisterImpl::D, key, 216, F46); 3383 __ ldf(FloatRegisterImpl::D, key, 224, F48); 3384 __ ldf(FloatRegisterImpl::D, key, 232, F50); 3385 __ aes_eround01(F52, F54, F56, F58); //round 13 3386 __ aes_eround23(F46, F54, F56, F60); 3387 __ ba_short(L_storeOutput); 3388 3389 __ BIND(L_doLast128bit); 3390 __ ldf(FloatRegisterImpl::D, key, 160, F48); 3391 __ ldf(FloatRegisterImpl::D, key, 168, F50); 3392 3393 __ BIND(L_storeOutput); 3394 // perform last round of encryption common for all key sizes 3395 __ aes_eround01_l(F48, F58, F60, F54); //last round 3396 __ aes_eround23_l(F50, F58, F60, F56); 3397 3398 // Method to address arbitrary alignment for store instructions: 3399 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 3400 // If zero/aligned then continue with double FP store instructions 3401 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 3402 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 3403 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 3404 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 3405 // Set GSR.align to (8-n) using alignaddr 3406 // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf 3407 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 3408 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 3409 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address 3410 // We need to execute this process for both the 8-byte result values 3411 3412 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3413 __ andcc(to, 7, O5); 3414 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3415 __ delayed()->edge8n(to, G0, O3); 3416 3417 // aligned case: store output into the destination array 3418 __ stf(FloatRegisterImpl::D, F54, to, 0); 3419 __ retl(); 3420 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 3421 3422 __ BIND(L_store_misaligned_output); 3423 __ add(to, 8, O4); 3424 __ mov(8, O2); 3425 __ sub(O2, O5, O2); 3426 __ alignaddr(O2, G0, O2); 3427 __ faligndata(F54, F54, F54); 3428 __ faligndata(F56, F56, F56); 3429 __ and3(to, -8, to); 3430 __ and3(O4, -8, O4); 3431 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3432 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3433 __ add(to, 8, to); 3434 __ add(O4, 8, O4); 3435 __ orn(G0, O3, O3); 3436 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3437 __ retl(); 3438 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3439 3440 return start; 3441 } 3442 3443 address generate_aescrypt_decryptBlock() { 3444 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3445 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3446 // required since we read original key 'byte' array as well in the decryption stubs 3447 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3448 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3449 __ align(CodeEntryAlignment); 3450 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3451 address start = __ pc(); 3452 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 3453 Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 3454 Register from = O0; // source byte array 3455 Register to = O1; // destination byte array 3456 Register key = O2; // expanded key array 3457 Register original_key = O3; // original key array only required during decryption 3458 const Register keylen = O4; // reg for storing expanded key array length 3459 3460 // read expanded key array length 3461 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3462 3463 // save 'from' since we may need to recheck alignment in case of 256-bit decryption 3464 __ mov(from, G1); 3465 3466 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3467 __ andcc(from, 7, G0); 3468 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3469 __ delayed()->alignaddr(from, G0, from); 3470 3471 // aligned case: load input into F52-F54 3472 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3473 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3474 __ ba_short(L_load_original_key); 3475 3476 __ BIND(L_load_misaligned_input); 3477 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3478 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3479 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3480 __ faligndata(F52, F54, F52); 3481 __ faligndata(F54, F56, F54); 3482 3483 __ BIND(L_load_original_key); 3484 // load original key from SunJCE expanded decryption key 3485 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3486 for ( int i = 0; i <= 3; i++ ) { 3487 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3488 } 3489 3490 // 256-bit original key size 3491 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 3492 3493 // 192-bit original key size 3494 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 3495 3496 // 128-bit original key size 3497 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3498 for ( int i = 0; i <= 36; i += 4 ) { 3499 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 3500 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 3501 } 3502 3503 // perform 128-bit key specific inverse cipher transformation 3504 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 3505 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 3506 __ ba_short(L_common_transform); 3507 3508 __ BIND(L_expand192bit); 3509 3510 // start loading rest of the 192-bit key 3511 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3512 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 3513 3514 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3515 for ( int i = 0; i <= 36; i += 6 ) { 3516 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 3517 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 3518 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3519 } 3520 __ aes_kexpand1(F42, F46, 7, F48); 3521 __ aes_kexpand2(F44, F48, F50); 3522 3523 // perform 192-bit key specific inverse cipher transformation 3524 __ fxor(FloatRegisterImpl::D, F50, F54, F54); 3525 __ fxor(FloatRegisterImpl::D, F48, F52, F52); 3526 __ aes_dround23(F46, F52, F54, F58); 3527 __ aes_dround01(F44, F52, F54, F56); 3528 __ aes_dround23(F42, F56, F58, F54); 3529 __ aes_dround01(F40, F56, F58, F52); 3530 __ ba_short(L_common_transform); 3531 3532 __ BIND(L_expand256bit); 3533 3534 // load rest of the 256-bit key 3535 for ( int i = 4; i <= 7; i++ ) { 3536 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3537 } 3538 3539 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3540 for ( int i = 0; i <= 40; i += 8 ) { 3541 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 3542 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3543 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 3544 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 3545 } 3546 __ aes_kexpand1(F48, F54, 6, F56); 3547 __ aes_kexpand2(F50, F56, F58); 3548 3549 for ( int i = 0; i <= 6; i += 2 ) { 3550 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 3551 } 3552 3553 // reload original 'from' address 3554 __ mov(G1, from); 3555 3556 // re-check 8-byte alignment 3557 __ andcc(from, 7, G0); 3558 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); 3559 __ delayed()->alignaddr(from, G0, from); 3560 3561 // aligned case: load input into F52-F54 3562 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3563 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3564 __ ba_short(L_256bit_transform); 3565 3566 __ BIND(L_reload_misaligned_input); 3567 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3568 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3569 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3570 __ faligndata(F52, F54, F52); 3571 __ faligndata(F54, F56, F54); 3572 3573 // perform 256-bit key specific inverse cipher transformation 3574 __ BIND(L_256bit_transform); 3575 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3576 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 3577 __ aes_dround23(F4, F52, F54, F58); 3578 __ aes_dround01(F6, F52, F54, F56); 3579 __ aes_dround23(F50, F56, F58, F54); 3580 __ aes_dround01(F48, F56, F58, F52); 3581 __ aes_dround23(F46, F52, F54, F58); 3582 __ aes_dround01(F44, F52, F54, F56); 3583 __ aes_dround23(F42, F56, F58, F54); 3584 __ aes_dround01(F40, F56, F58, F52); 3585 3586 for ( int i = 0; i <= 7; i++ ) { 3587 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3588 } 3589 3590 // perform inverse cipher transformations common for all key sizes 3591 __ BIND(L_common_transform); 3592 for ( int i = 38; i >= 6; i -= 8 ) { 3593 __ aes_dround23(as_FloatRegister(i), F52, F54, F58); 3594 __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); 3595 if ( i != 6) { 3596 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); 3597 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52); 3598 } else { 3599 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); 3600 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); 3601 } 3602 } 3603 3604 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3605 __ andcc(to, 7, O5); 3606 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3607 __ delayed()->edge8n(to, G0, O3); 3608 3609 // aligned case: store output into the destination array 3610 __ stf(FloatRegisterImpl::D, F52, to, 0); 3611 __ retl(); 3612 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); 3613 3614 __ BIND(L_store_misaligned_output); 3615 __ add(to, 8, O4); 3616 __ mov(8, O2); 3617 __ sub(O2, O5, O2); 3618 __ alignaddr(O2, G0, O2); 3619 __ faligndata(F52, F52, F52); 3620 __ faligndata(F54, F54, F54); 3621 __ and3(to, -8, to); 3622 __ and3(O4, -8, O4); 3623 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3624 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3625 __ add(to, 8, to); 3626 __ add(O4, 8, O4); 3627 __ orn(G0, O3, O3); 3628 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3629 __ retl(); 3630 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3631 3632 return start; 3633 } 3634 3635 address generate_cipherBlockChaining_encryptAESCrypt() { 3636 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3637 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3638 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3639 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3640 __ align(CodeEntryAlignment); 3641 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3642 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; 3643 Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; 3644 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; 3645 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; 3646 address start = __ pc(); 3647 Register from = I0; // source byte array 3648 Register to = I1; // destination byte array 3649 Register key = I2; // expanded key array 3650 Register rvec = I3; // init vector 3651 const Register len_reg = I4; // cipher length 3652 const Register keylen = I5; // reg for storing expanded key array length 3653 3654 __ save_frame(0); 3655 // save cipher len to return in the end 3656 __ mov(len_reg, L0); 3657 3658 // read expanded key length 3659 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3660 3661 // load initial vector, 8-byte alignment is guranteed 3662 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 3663 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 3664 // load key, 8-byte alignment is guranteed 3665 __ ldx(key,0,G1); 3666 __ ldx(key,8,G5); 3667 3668 // start loading expanded key, 8-byte alignment is guranteed 3669 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 3670 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3671 } 3672 3673 // 128-bit original key size 3674 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); 3675 3676 for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { 3677 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3678 } 3679 3680 // 192-bit original key size 3681 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); 3682 3683 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { 3684 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3685 } 3686 3687 // 256-bit original key size 3688 __ ba_short(L_cbcenc256); 3689 3690 __ align(OptoLoopAlignment); 3691 __ BIND(L_cbcenc128); 3692 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3693 __ andcc(from, 7, G0); 3694 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 3695 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3696 3697 // aligned case: load input into G3 and G4 3698 __ ldx(from,0,G3); 3699 __ ldx(from,8,G4); 3700 __ ba_short(L_128bit_transform); 3701 3702 __ BIND(L_load_misaligned_input_128bit); 3703 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3704 __ alignaddr(from, G0, from); 3705 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3706 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3707 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3708 __ faligndata(F48, F50, F48); 3709 __ faligndata(F50, F52, F50); 3710 __ movdtox(F48, G3); 3711 __ movdtox(F50, G4); 3712 __ mov(L1, from); 3713 3714 __ BIND(L_128bit_transform); 3715 __ xor3(G1,G3,G3); 3716 __ xor3(G5,G4,G4); 3717 __ movxtod(G3,F56); 3718 __ movxtod(G4,F58); 3719 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3720 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3721 3722 // TEN_EROUNDS 3723 for ( int i = 0; i <= 32; i += 8 ) { 3724 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3725 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3726 if (i != 32 ) { 3727 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3728 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3729 } else { 3730 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3731 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3732 } 3733 } 3734 3735 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3736 __ andcc(to, 7, L1); 3737 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 3738 __ delayed()->edge8n(to, G0, L2); 3739 3740 // aligned case: store output into the destination array 3741 __ stf(FloatRegisterImpl::D, F60, to, 0); 3742 __ stf(FloatRegisterImpl::D, F62, to, 8); 3743 __ ba_short(L_check_loop_end_128bit); 3744 3745 __ BIND(L_store_misaligned_output_128bit); 3746 __ add(to, 8, L3); 3747 __ mov(8, L4); 3748 __ sub(L4, L1, L4); 3749 __ alignaddr(L4, G0, L4); 3750 // save cipher text before circular right shift 3751 // as it needs to be stored as iv for next block (see code before next retl) 3752 __ movdtox(F60, L6); 3753 __ movdtox(F62, L7); 3754 __ faligndata(F60, F60, F60); 3755 __ faligndata(F62, F62, F62); 3756 __ mov(to, L5); 3757 __ and3(to, -8, to); 3758 __ and3(L3, -8, L3); 3759 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3760 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3761 __ add(to, 8, to); 3762 __ add(L3, 8, L3); 3763 __ orn(G0, L2, L2); 3764 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3765 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3766 __ mov(L5, to); 3767 __ movxtod(L6, F60); 3768 __ movxtod(L7, F62); 3769 3770 __ BIND(L_check_loop_end_128bit); 3771 __ add(from, 16, from); 3772 __ add(to, 16, to); 3773 __ subcc(len_reg, 16, len_reg); 3774 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 3775 __ delayed()->nop(); 3776 // re-init intial vector for next block, 8-byte alignment is guaranteed 3777 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3778 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3779 __ mov(L0, I0); 3780 __ ret(); 3781 __ delayed()->restore(); 3782 3783 __ align(OptoLoopAlignment); 3784 __ BIND(L_cbcenc192); 3785 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3786 __ andcc(from, 7, G0); 3787 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); 3788 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3789 3790 // aligned case: load input into G3 and G4 3791 __ ldx(from,0,G3); 3792 __ ldx(from,8,G4); 3793 __ ba_short(L_192bit_transform); 3794 3795 __ BIND(L_load_misaligned_input_192bit); 3796 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3797 __ alignaddr(from, G0, from); 3798 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3799 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3800 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3801 __ faligndata(F48, F50, F48); 3802 __ faligndata(F50, F52, F50); 3803 __ movdtox(F48, G3); 3804 __ movdtox(F50, G4); 3805 __ mov(L1, from); 3806 3807 __ BIND(L_192bit_transform); 3808 __ xor3(G1,G3,G3); 3809 __ xor3(G5,G4,G4); 3810 __ movxtod(G3,F56); 3811 __ movxtod(G4,F58); 3812 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3813 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3814 3815 // TWELEVE_EROUNDS 3816 for ( int i = 0; i <= 40; i += 8 ) { 3817 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3818 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3819 if (i != 40 ) { 3820 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3821 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3822 } else { 3823 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3824 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3825 } 3826 } 3827 3828 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3829 __ andcc(to, 7, L1); 3830 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); 3831 __ delayed()->edge8n(to, G0, L2); 3832 3833 // aligned case: store output into the destination array 3834 __ stf(FloatRegisterImpl::D, F60, to, 0); 3835 __ stf(FloatRegisterImpl::D, F62, to, 8); 3836 __ ba_short(L_check_loop_end_192bit); 3837 3838 __ BIND(L_store_misaligned_output_192bit); 3839 __ add(to, 8, L3); 3840 __ mov(8, L4); 3841 __ sub(L4, L1, L4); 3842 __ alignaddr(L4, G0, L4); 3843 __ movdtox(F60, L6); 3844 __ movdtox(F62, L7); 3845 __ faligndata(F60, F60, F60); 3846 __ faligndata(F62, F62, F62); 3847 __ mov(to, L5); 3848 __ and3(to, -8, to); 3849 __ and3(L3, -8, L3); 3850 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3851 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3852 __ add(to, 8, to); 3853 __ add(L3, 8, L3); 3854 __ orn(G0, L2, L2); 3855 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3856 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3857 __ mov(L5, to); 3858 __ movxtod(L6, F60); 3859 __ movxtod(L7, F62); 3860 3861 __ BIND(L_check_loop_end_192bit); 3862 __ add(from, 16, from); 3863 __ subcc(len_reg, 16, len_reg); 3864 __ add(to, 16, to); 3865 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 3866 __ delayed()->nop(); 3867 // re-init intial vector for next block, 8-byte alignment is guaranteed 3868 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3869 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3870 __ mov(L0, I0); 3871 __ ret(); 3872 __ delayed()->restore(); 3873 3874 __ align(OptoLoopAlignment); 3875 __ BIND(L_cbcenc256); 3876 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3877 __ andcc(from, 7, G0); 3878 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); 3879 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3880 3881 // aligned case: load input into G3 and G4 3882 __ ldx(from,0,G3); 3883 __ ldx(from,8,G4); 3884 __ ba_short(L_256bit_transform); 3885 3886 __ BIND(L_load_misaligned_input_256bit); 3887 // cannot clobber F48, F50 and F52. F56, F58 can be used though 3888 __ alignaddr(from, G0, from); 3889 __ movdtox(F60, L2); // save F60 before overwriting 3890 __ ldf(FloatRegisterImpl::D, from, 0, F56); 3891 __ ldf(FloatRegisterImpl::D, from, 8, F58); 3892 __ ldf(FloatRegisterImpl::D, from, 16, F60); 3893 __ faligndata(F56, F58, F56); 3894 __ faligndata(F58, F60, F58); 3895 __ movdtox(F56, G3); 3896 __ movdtox(F58, G4); 3897 __ mov(L1, from); 3898 __ movxtod(L2, F60); 3899 3900 __ BIND(L_256bit_transform); 3901 __ xor3(G1,G3,G3); 3902 __ xor3(G5,G4,G4); 3903 __ movxtod(G3,F56); 3904 __ movxtod(G4,F58); 3905 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3906 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3907 3908 // FOURTEEN_EROUNDS 3909 for ( int i = 0; i <= 48; i += 8 ) { 3910 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3911 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3912 if (i != 48 ) { 3913 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3914 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3915 } else { 3916 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3917 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3918 } 3919 } 3920 3921 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3922 __ andcc(to, 7, L1); 3923 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); 3924 __ delayed()->edge8n(to, G0, L2); 3925 3926 // aligned case: store output into the destination array 3927 __ stf(FloatRegisterImpl::D, F60, to, 0); 3928 __ stf(FloatRegisterImpl::D, F62, to, 8); 3929 __ ba_short(L_check_loop_end_256bit); 3930 3931 __ BIND(L_store_misaligned_output_256bit); 3932 __ add(to, 8, L3); 3933 __ mov(8, L4); 3934 __ sub(L4, L1, L4); 3935 __ alignaddr(L4, G0, L4); 3936 __ movdtox(F60, L6); 3937 __ movdtox(F62, L7); 3938 __ faligndata(F60, F60, F60); 3939 __ faligndata(F62, F62, F62); 3940 __ mov(to, L5); 3941 __ and3(to, -8, to); 3942 __ and3(L3, -8, L3); 3943 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3944 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3945 __ add(to, 8, to); 3946 __ add(L3, 8, L3); 3947 __ orn(G0, L2, L2); 3948 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3949 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3950 __ mov(L5, to); 3951 __ movxtod(L6, F60); 3952 __ movxtod(L7, F62); 3953 3954 __ BIND(L_check_loop_end_256bit); 3955 __ add(from, 16, from); 3956 __ subcc(len_reg, 16, len_reg); 3957 __ add(to, 16, to); 3958 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 3959 __ delayed()->nop(); 3960 // re-init intial vector for next block, 8-byte alignment is guaranteed 3961 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3962 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3963 __ mov(L0, I0); 3964 __ ret(); 3965 __ delayed()->restore(); 3966 3967 return start; 3968 } 3969 3970 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3971 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3972 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3973 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3974 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3975 __ align(CodeEntryAlignment); 3976 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3977 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 3978 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 3979 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 3980 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 3981 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 3982 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 3983 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 3984 address start = __ pc(); 3985 Register from = I0; // source byte array 3986 Register to = I1; // destination byte array 3987 Register key = I2; // expanded key array 3988 Register rvec = I3; // init vector 3989 const Register len_reg = I4; // cipher length 3990 const Register original_key = I5; // original key array only required during decryption 3991 const Register keylen = L6; // reg for storing expanded key array length 3992 3993 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 3994 // save cipher len to return in the end 3995 __ mov(len_reg, L7); 3996 3997 // load original key from SunJCE expanded decryption key 3998 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3999 for ( int i = 0; i <= 3; i++ ) { 4000 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 4001 } 4002 4003 // load initial vector, 8-byte alignment is guaranteed 4004 __ ldx(rvec,0,L0); 4005 __ ldx(rvec,8,L1); 4006 4007 // read expanded key array length 4008 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 4009 4010 // 256-bit original key size 4011 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 4012 4013 // 192-bit original key size 4014 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 4015 4016 // 128-bit original key size 4017 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 4018 for ( int i = 0; i <= 36; i += 4 ) { 4019 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 4020 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 4021 } 4022 4023 // load expanded key[last-1] and key[last] elements 4024 __ movdtox(F40,L2); 4025 __ movdtox(F42,L3); 4026 4027 __ and3(len_reg, 16, L4); 4028 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 4029 __ nop(); 4030 4031 __ ba_short(L_dec_first_block_start); 4032 4033 __ BIND(L_expand192bit); 4034 // load rest of the 192-bit key 4035 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 4036 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 4037 4038 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 4039 for ( int i = 0; i <= 36; i += 6 ) { 4040 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 4041 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 4042 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 4043 } 4044 __ aes_kexpand1(F42, F46, 7, F48); 4045 __ aes_kexpand2(F44, F48, F50); 4046 4047 // load expanded key[last-1] and key[last] elements 4048 __ movdtox(F48,L2); 4049 __ movdtox(F50,L3); 4050 4051 __ and3(len_reg, 16, L4); 4052 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 4053 __ nop(); 4054 4055 __ ba_short(L_dec_first_block_start); 4056 4057 __ BIND(L_expand256bit); 4058 // load rest of the 256-bit key 4059 for ( int i = 4; i <= 7; i++ ) { 4060 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 4061 } 4062 4063 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 4064 for ( int i = 0; i <= 40; i += 8 ) { 4065 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 4066 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 4067 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 4068 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 4069 } 4070 __ aes_kexpand1(F48, F54, 6, F56); 4071 __ aes_kexpand2(F50, F56, F58); 4072 4073 // load expanded key[last-1] and key[last] elements 4074 __ movdtox(F56,L2); 4075 __ movdtox(F58,L3); 4076 4077 __ and3(len_reg, 16, L4); 4078 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 4079 4080 __ BIND(L_dec_first_block_start); 4081 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4082 __ andcc(from, 7, G0); 4083 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 4084 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4085 4086 // aligned case: load input into L4 and L5 4087 __ ldx(from,0,L4); 4088 __ ldx(from,8,L5); 4089 __ ba_short(L_transform_first_block); 4090 4091 __ BIND(L_load_misaligned_input_first_block); 4092 __ alignaddr(from, G0, from); 4093 // F58, F60, F62 can be clobbered 4094 __ ldf(FloatRegisterImpl::D, from, 0, F58); 4095 __ ldf(FloatRegisterImpl::D, from, 8, F60); 4096 __ ldf(FloatRegisterImpl::D, from, 16, F62); 4097 __ faligndata(F58, F60, F58); 4098 __ faligndata(F60, F62, F60); 4099 __ movdtox(F58, L4); 4100 __ movdtox(F60, L5); 4101 __ mov(G1, from); 4102 4103 __ BIND(L_transform_first_block); 4104 __ xor3(L2,L4,G1); 4105 __ movxtod(G1,F60); 4106 __ xor3(L3,L5,G1); 4107 __ movxtod(G1,F62); 4108 4109 // 128-bit original key size 4110 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); 4111 4112 // 192-bit original key size 4113 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); 4114 4115 __ aes_dround23(F54, F60, F62, F58); 4116 __ aes_dround01(F52, F60, F62, F56); 4117 __ aes_dround23(F50, F56, F58, F62); 4118 __ aes_dround01(F48, F56, F58, F60); 4119 4120 __ BIND(L_dec_first_block192); 4121 __ aes_dround23(F46, F60, F62, F58); 4122 __ aes_dround01(F44, F60, F62, F56); 4123 __ aes_dround23(F42, F56, F58, F62); 4124 __ aes_dround01(F40, F56, F58, F60); 4125 4126 __ BIND(L_dec_first_block128); 4127 for ( int i = 38; i >= 6; i -= 8 ) { 4128 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4129 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4130 if ( i != 6) { 4131 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4132 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4133 } else { 4134 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4135 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4136 } 4137 } 4138 4139 __ movxtod(L0,F56); 4140 __ movxtod(L1,F58); 4141 __ mov(L4,L0); 4142 __ mov(L5,L1); 4143 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4144 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4145 4146 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4147 __ andcc(to, 7, G1); 4148 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 4149 __ delayed()->edge8n(to, G0, G2); 4150 4151 // aligned case: store output into the destination array 4152 __ stf(FloatRegisterImpl::D, F60, to, 0); 4153 __ stf(FloatRegisterImpl::D, F62, to, 8); 4154 __ ba_short(L_check_decrypt_end); 4155 4156 __ BIND(L_store_misaligned_output_first_block); 4157 __ add(to, 8, G3); 4158 __ mov(8, G4); 4159 __ sub(G4, G1, G4); 4160 __ alignaddr(G4, G0, G4); 4161 __ faligndata(F60, F60, F60); 4162 __ faligndata(F62, F62, F62); 4163 __ mov(to, G1); 4164 __ and3(to, -8, to); 4165 __ and3(G3, -8, G3); 4166 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 4167 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 4168 __ add(to, 8, to); 4169 __ add(G3, 8, G3); 4170 __ orn(G0, G2, G2); 4171 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 4172 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 4173 __ mov(G1, to); 4174 4175 __ BIND(L_check_decrypt_end); 4176 __ add(from, 16, from); 4177 __ add(to, 16, to); 4178 __ subcc(len_reg, 16, len_reg); 4179 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); 4180 __ delayed()->nop(); 4181 4182 // 256-bit original key size 4183 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); 4184 4185 // 192-bit original key size 4186 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); 4187 4188 __ align(OptoLoopAlignment); 4189 __ BIND(L_dec_next2_blocks128); 4190 __ nop(); 4191 4192 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4193 __ andcc(from, 7, G0); 4194 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); 4195 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4196 4197 // aligned case: load input into G4, G5, L4 and L5 4198 __ ldx(from,0,G4); 4199 __ ldx(from,8,G5); 4200 __ ldx(from,16,L4); 4201 __ ldx(from,24,L5); 4202 __ ba_short(L_transform_next2_blocks128); 4203 4204 __ BIND(L_load_misaligned_next2_blocks128); 4205 __ alignaddr(from, G0, from); 4206 // F40, F42, F58, F60, F62 can be clobbered 4207 __ ldf(FloatRegisterImpl::D, from, 0, F40); 4208 __ ldf(FloatRegisterImpl::D, from, 8, F42); 4209 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4210 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4211 __ ldf(FloatRegisterImpl::D, from, 32, F58); 4212 __ faligndata(F40, F42, F40); 4213 __ faligndata(F42, F60, F42); 4214 __ faligndata(F60, F62, F60); 4215 __ faligndata(F62, F58, F62); 4216 __ movdtox(F40, G4); 4217 __ movdtox(F42, G5); 4218 __ movdtox(F60, L4); 4219 __ movdtox(F62, L5); 4220 __ mov(G1, from); 4221 4222 __ BIND(L_transform_next2_blocks128); 4223 // F40:F42 used for first 16-bytes 4224 __ xor3(L2,G4,G1); 4225 __ movxtod(G1,F40); 4226 __ xor3(L3,G5,G1); 4227 __ movxtod(G1,F42); 4228 4229 // F60:F62 used for next 16-bytes 4230 __ xor3(L2,L4,G1); 4231 __ movxtod(G1,F60); 4232 __ xor3(L3,L5,G1); 4233 __ movxtod(G1,F62); 4234 4235 for ( int i = 38; i >= 6; i -= 8 ) { 4236 __ aes_dround23(as_FloatRegister(i), F40, F42, F44); 4237 __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); 4238 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4239 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4240 if (i != 6 ) { 4241 __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); 4242 __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); 4243 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4244 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4245 } else { 4246 __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); 4247 __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); 4248 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4249 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4250 } 4251 } 4252 4253 __ movxtod(L0,F46); 4254 __ movxtod(L1,F44); 4255 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 4256 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 4257 4258 __ movxtod(G4,F56); 4259 __ movxtod(G5,F58); 4260 __ mov(L4,L0); 4261 __ mov(L5,L1); 4262 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4263 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4264 4265 // For mis-aligned store of 32 bytes of result we can do: 4266 // Circular right-shift all 4 FP registers so that 'head' and 'tail' 4267 // parts that need to be stored starting at mis-aligned address are in a FP reg 4268 // the other 3 FP regs can thus be stored using regular store 4269 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 4270 4271 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4272 __ andcc(to, 7, G1); 4273 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 4274 __ delayed()->edge8n(to, G0, G2); 4275 4276 // aligned case: store output into the destination array 4277 __ stf(FloatRegisterImpl::D, F40, to, 0); 4278 __ stf(FloatRegisterImpl::D, F42, to, 8); 4279 __ stf(FloatRegisterImpl::D, F60, to, 16); 4280 __ stf(FloatRegisterImpl::D, F62, to, 24); 4281 __ ba_short(L_check_decrypt_loop_end128); 4282 4283 __ BIND(L_store_misaligned_output_next2_blocks128); 4284 __ mov(8, G4); 4285 __ sub(G4, G1, G4); 4286 __ alignaddr(G4, G0, G4); 4287 __ faligndata(F40, F42, F56); // F56 can be clobbered 4288 __ faligndata(F42, F60, F42); 4289 __ faligndata(F60, F62, F60); 4290 __ faligndata(F62, F40, F40); 4291 __ mov(to, G1); 4292 __ and3(to, -8, to); 4293 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4294 __ stf(FloatRegisterImpl::D, F56, to, 8); 4295 __ stf(FloatRegisterImpl::D, F42, to, 16); 4296 __ stf(FloatRegisterImpl::D, F60, to, 24); 4297 __ add(to, 32, to); 4298 __ orn(G0, G2, G2); 4299 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4300 __ mov(G1, to); 4301 4302 __ BIND(L_check_decrypt_loop_end128); 4303 __ add(from, 32, from); 4304 __ add(to, 32, to); 4305 __ subcc(len_reg, 32, len_reg); 4306 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 4307 __ delayed()->nop(); 4308 __ ba_short(L_cbcdec_end); 4309 4310 __ align(OptoLoopAlignment); 4311 __ BIND(L_dec_next2_blocks192); 4312 __ nop(); 4313 4314 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4315 __ andcc(from, 7, G0); 4316 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 4317 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4318 4319 // aligned case: load input into G4, G5, L4 and L5 4320 __ ldx(from,0,G4); 4321 __ ldx(from,8,G5); 4322 __ ldx(from,16,L4); 4323 __ ldx(from,24,L5); 4324 __ ba_short(L_transform_next2_blocks192); 4325 4326 __ BIND(L_load_misaligned_next2_blocks192); 4327 __ alignaddr(from, G0, from); 4328 // F48, F50, F52, F60, F62 can be clobbered 4329 __ ldf(FloatRegisterImpl::D, from, 0, F48); 4330 __ ldf(FloatRegisterImpl::D, from, 8, F50); 4331 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4332 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4333 __ ldf(FloatRegisterImpl::D, from, 32, F52); 4334 __ faligndata(F48, F50, F48); 4335 __ faligndata(F50, F60, F50); 4336 __ faligndata(F60, F62, F60); 4337 __ faligndata(F62, F52, F62); 4338 __ movdtox(F48, G4); 4339 __ movdtox(F50, G5); 4340 __ movdtox(F60, L4); 4341 __ movdtox(F62, L5); 4342 __ mov(G1, from); 4343 4344 __ BIND(L_transform_next2_blocks192); 4345 // F48:F50 used for first 16-bytes 4346 __ xor3(L2,G4,G1); 4347 __ movxtod(G1,F48); 4348 __ xor3(L3,G5,G1); 4349 __ movxtod(G1,F50); 4350 4351 // F60:F62 used for next 16-bytes 4352 __ xor3(L2,L4,G1); 4353 __ movxtod(G1,F60); 4354 __ xor3(L3,L5,G1); 4355 __ movxtod(G1,F62); 4356 4357 for ( int i = 46; i >= 6; i -= 8 ) { 4358 __ aes_dround23(as_FloatRegister(i), F48, F50, F52); 4359 __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); 4360 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4361 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4362 if (i != 6 ) { 4363 __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); 4364 __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); 4365 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4366 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4367 } else { 4368 __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); 4369 __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); 4370 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4371 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4372 } 4373 } 4374 4375 __ movxtod(L0,F54); 4376 __ movxtod(L1,F52); 4377 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 4378 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 4379 4380 __ movxtod(G4,F56); 4381 __ movxtod(G5,F58); 4382 __ mov(L4,L0); 4383 __ mov(L5,L1); 4384 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4385 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4386 4387 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4388 __ andcc(to, 7, G1); 4389 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); 4390 __ delayed()->edge8n(to, G0, G2); 4391 4392 // aligned case: store output into the destination array 4393 __ stf(FloatRegisterImpl::D, F48, to, 0); 4394 __ stf(FloatRegisterImpl::D, F50, to, 8); 4395 __ stf(FloatRegisterImpl::D, F60, to, 16); 4396 __ stf(FloatRegisterImpl::D, F62, to, 24); 4397 __ ba_short(L_check_decrypt_loop_end192); 4398 4399 __ BIND(L_store_misaligned_output_next2_blocks192); 4400 __ mov(8, G4); 4401 __ sub(G4, G1, G4); 4402 __ alignaddr(G4, G0, G4); 4403 __ faligndata(F48, F50, F56); // F56 can be clobbered 4404 __ faligndata(F50, F60, F50); 4405 __ faligndata(F60, F62, F60); 4406 __ faligndata(F62, F48, F48); 4407 __ mov(to, G1); 4408 __ and3(to, -8, to); 4409 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4410 __ stf(FloatRegisterImpl::D, F56, to, 8); 4411 __ stf(FloatRegisterImpl::D, F50, to, 16); 4412 __ stf(FloatRegisterImpl::D, F60, to, 24); 4413 __ add(to, 32, to); 4414 __ orn(G0, G2, G2); 4415 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4416 __ mov(G1, to); 4417 4418 __ BIND(L_check_decrypt_loop_end192); 4419 __ add(from, 32, from); 4420 __ add(to, 32, to); 4421 __ subcc(len_reg, 32, len_reg); 4422 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 4423 __ delayed()->nop(); 4424 __ ba_short(L_cbcdec_end); 4425 4426 __ align(OptoLoopAlignment); 4427 __ BIND(L_dec_next2_blocks256); 4428 __ nop(); 4429 4430 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4431 __ andcc(from, 7, G0); 4432 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); 4433 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4434 4435 // aligned case: load input into G4, G5, L4 and L5 4436 __ ldx(from,0,G4); 4437 __ ldx(from,8,G5); 4438 __ ldx(from,16,L4); 4439 __ ldx(from,24,L5); 4440 __ ba_short(L_transform_next2_blocks256); 4441 4442 __ BIND(L_load_misaligned_next2_blocks256); 4443 __ alignaddr(from, G0, from); 4444 // F0, F2, F4, F60, F62 can be clobbered 4445 __ ldf(FloatRegisterImpl::D, from, 0, F0); 4446 __ ldf(FloatRegisterImpl::D, from, 8, F2); 4447 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4448 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4449 __ ldf(FloatRegisterImpl::D, from, 32, F4); 4450 __ faligndata(F0, F2, F0); 4451 __ faligndata(F2, F60, F2); 4452 __ faligndata(F60, F62, F60); 4453 __ faligndata(F62, F4, F62); 4454 __ movdtox(F0, G4); 4455 __ movdtox(F2, G5); 4456 __ movdtox(F60, L4); 4457 __ movdtox(F62, L5); 4458 __ mov(G1, from); 4459 4460 __ BIND(L_transform_next2_blocks256); 4461 // F0:F2 used for first 16-bytes 4462 __ xor3(L2,G4,G1); 4463 __ movxtod(G1,F0); 4464 __ xor3(L3,G5,G1); 4465 __ movxtod(G1,F2); 4466 4467 // F60:F62 used for next 16-bytes 4468 __ xor3(L2,L4,G1); 4469 __ movxtod(G1,F60); 4470 __ xor3(L3,L5,G1); 4471 __ movxtod(G1,F62); 4472 4473 __ aes_dround23(F54, F0, F2, F4); 4474 __ aes_dround01(F52, F0, F2, F6); 4475 __ aes_dround23(F54, F60, F62, F58); 4476 __ aes_dround01(F52, F60, F62, F56); 4477 __ aes_dround23(F50, F6, F4, F2); 4478 __ aes_dround01(F48, F6, F4, F0); 4479 __ aes_dround23(F50, F56, F58, F62); 4480 __ aes_dround01(F48, F56, F58, F60); 4481 // save F48:F54 in temp registers 4482 __ movdtox(F54,G2); 4483 __ movdtox(F52,G3); 4484 __ movdtox(F50,G6); 4485 __ movdtox(F48,G1); 4486 for ( int i = 46; i >= 14; i -= 8 ) { 4487 __ aes_dround23(as_FloatRegister(i), F0, F2, F4); 4488 __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6); 4489 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4490 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4491 __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2); 4492 __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0); 4493 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4494 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4495 } 4496 // init F48:F54 with F0:F6 values (original key) 4497 __ ldf(FloatRegisterImpl::D, original_key, 0, F48); 4498 __ ldf(FloatRegisterImpl::D, original_key, 8, F50); 4499 __ ldf(FloatRegisterImpl::D, original_key, 16, F52); 4500 __ ldf(FloatRegisterImpl::D, original_key, 24, F54); 4501 __ aes_dround23(F54, F0, F2, F4); 4502 __ aes_dround01(F52, F0, F2, F6); 4503 __ aes_dround23(F54, F60, F62, F58); 4504 __ aes_dround01(F52, F60, F62, F56); 4505 __ aes_dround23_l(F50, F6, F4, F2); 4506 __ aes_dround01_l(F48, F6, F4, F0); 4507 __ aes_dround23_l(F50, F56, F58, F62); 4508 __ aes_dround01_l(F48, F56, F58, F60); 4509 // re-init F48:F54 with their original values 4510 __ movxtod(G2,F54); 4511 __ movxtod(G3,F52); 4512 __ movxtod(G6,F50); 4513 __ movxtod(G1,F48); 4514 4515 __ movxtod(L0,F6); 4516 __ movxtod(L1,F4); 4517 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 4518 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 4519 4520 __ movxtod(G4,F56); 4521 __ movxtod(G5,F58); 4522 __ mov(L4,L0); 4523 __ mov(L5,L1); 4524 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4525 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4526 4527 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4528 __ andcc(to, 7, G1); 4529 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); 4530 __ delayed()->edge8n(to, G0, G2); 4531 4532 // aligned case: store output into the destination array 4533 __ stf(FloatRegisterImpl::D, F0, to, 0); 4534 __ stf(FloatRegisterImpl::D, F2, to, 8); 4535 __ stf(FloatRegisterImpl::D, F60, to, 16); 4536 __ stf(FloatRegisterImpl::D, F62, to, 24); 4537 __ ba_short(L_check_decrypt_loop_end256); 4538 4539 __ BIND(L_store_misaligned_output_next2_blocks256); 4540 __ mov(8, G4); 4541 __ sub(G4, G1, G4); 4542 __ alignaddr(G4, G0, G4); 4543 __ faligndata(F0, F2, F56); // F56 can be clobbered 4544 __ faligndata(F2, F60, F2); 4545 __ faligndata(F60, F62, F60); 4546 __ faligndata(F62, F0, F0); 4547 __ mov(to, G1); 4548 __ and3(to, -8, to); 4549 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4550 __ stf(FloatRegisterImpl::D, F56, to, 8); 4551 __ stf(FloatRegisterImpl::D, F2, to, 16); 4552 __ stf(FloatRegisterImpl::D, F60, to, 24); 4553 __ add(to, 32, to); 4554 __ orn(G0, G2, G2); 4555 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4556 __ mov(G1, to); 4557 4558 __ BIND(L_check_decrypt_loop_end256); 4559 __ add(from, 32, from); 4560 __ add(to, 32, to); 4561 __ subcc(len_reg, 32, len_reg); 4562 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); 4563 __ delayed()->nop(); 4564 4565 __ BIND(L_cbcdec_end); 4566 // re-init intial vector for next block, 8-byte alignment is guaranteed 4567 __ stx(L0, rvec, 0); 4568 __ stx(L1, rvec, 8); 4569 __ mov(L7, I0); 4570 __ ret(); 4571 __ delayed()->restore(); 4572 4573 return start; 4574 } 4575 4576 address generate_sha1_implCompress(bool multi_block, const char *name) { 4577 __ align(CodeEntryAlignment); 4578 StubCodeMark mark(this, "StubRoutines", name); 4579 address start = __ pc(); 4580 4581 Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop; 4582 int i; 4583 4584 Register buf = O0; // byte[] source+offset 4585 Register state = O1; // int[] SHA.state 4586 Register ofs = O2; // int offset 4587 Register limit = O3; // int limit 4588 4589 // load state into F0-F4 4590 for (i = 0; i < 5; i++) { 4591 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4592 } 4593 4594 __ andcc(buf, 7, G0); 4595 __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input); 4596 __ delayed()->nop(); 4597 4598 __ BIND(L_sha1_loop); 4599 // load buf into F8-F22 4600 for (i = 0; i < 8; i++) { 4601 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4602 } 4603 __ sha1(); 4604 if (multi_block) { 4605 __ add(ofs, 64, ofs); 4606 __ add(buf, 64, buf); 4607 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop); 4608 __ mov(ofs, O0); // to be returned 4609 } 4610 4611 // store F0-F4 into state and return 4612 for (i = 0; i < 4; i++) { 4613 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4614 } 4615 __ retl(); 4616 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4617 4618 __ BIND(L_sha1_unaligned_input); 4619 __ alignaddr(buf, G0, buf); 4620 4621 __ BIND(L_sha1_unaligned_input_loop); 4622 // load buf into F8-F22 4623 for (i = 0; i < 9; i++) { 4624 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4625 } 4626 for (i = 0; i < 8; i++) { 4627 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4628 } 4629 __ sha1(); 4630 if (multi_block) { 4631 __ add(ofs, 64, ofs); 4632 __ add(buf, 64, buf); 4633 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop); 4634 __ mov(ofs, O0); // to be returned 4635 } 4636 4637 // store F0-F4 into state and return 4638 for (i = 0; i < 4; i++) { 4639 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4640 } 4641 __ retl(); 4642 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4643 4644 return start; 4645 } 4646 4647 address generate_sha256_implCompress(bool multi_block, const char *name) { 4648 __ align(CodeEntryAlignment); 4649 StubCodeMark mark(this, "StubRoutines", name); 4650 address start = __ pc(); 4651 4652 Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop; 4653 int i; 4654 4655 Register buf = O0; // byte[] source+offset 4656 Register state = O1; // int[] SHA2.state 4657 Register ofs = O2; // int offset 4658 Register limit = O3; // int limit 4659 4660 // load state into F0-F7 4661 for (i = 0; i < 8; i++) { 4662 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4663 } 4664 4665 __ andcc(buf, 7, G0); 4666 __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input); 4667 __ delayed()->nop(); 4668 4669 __ BIND(L_sha256_loop); 4670 // load buf into F8-F22 4671 for (i = 0; i < 8; i++) { 4672 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4673 } 4674 __ sha256(); 4675 if (multi_block) { 4676 __ add(ofs, 64, ofs); 4677 __ add(buf, 64, buf); 4678 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop); 4679 __ mov(ofs, O0); // to be returned 4680 } 4681 4682 // store F0-F7 into state and return 4683 for (i = 0; i < 7; i++) { 4684 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4685 } 4686 __ retl(); 4687 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4688 4689 __ BIND(L_sha256_unaligned_input); 4690 __ alignaddr(buf, G0, buf); 4691 4692 __ BIND(L_sha256_unaligned_input_loop); 4693 // load buf into F8-F22 4694 for (i = 0; i < 9; i++) { 4695 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4696 } 4697 for (i = 0; i < 8; i++) { 4698 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4699 } 4700 __ sha256(); 4701 if (multi_block) { 4702 __ add(ofs, 64, ofs); 4703 __ add(buf, 64, buf); 4704 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop); 4705 __ mov(ofs, O0); // to be returned 4706 } 4707 4708 // store F0-F7 into state and return 4709 for (i = 0; i < 7; i++) { 4710 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4711 } 4712 __ retl(); 4713 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4714 4715 return start; 4716 } 4717 4718 address generate_sha512_implCompress(bool multi_block, const char *name) { 4719 __ align(CodeEntryAlignment); 4720 StubCodeMark mark(this, "StubRoutines", name); 4721 address start = __ pc(); 4722 4723 Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop; 4724 int i; 4725 4726 Register buf = O0; // byte[] source+offset 4727 Register state = O1; // long[] SHA5.state 4728 Register ofs = O2; // int offset 4729 Register limit = O3; // int limit 4730 4731 // load state into F0-F14 4732 for (i = 0; i < 8; i++) { 4733 __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2)); 4734 } 4735 4736 __ andcc(buf, 7, G0); 4737 __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input); 4738 __ delayed()->nop(); 4739 4740 __ BIND(L_sha512_loop); 4741 // load buf into F16-F46 4742 for (i = 0; i < 16; i++) { 4743 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4744 } 4745 __ sha512(); 4746 if (multi_block) { 4747 __ add(ofs, 128, ofs); 4748 __ add(buf, 128, buf); 4749 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop); 4750 __ mov(ofs, O0); // to be returned 4751 } 4752 4753 // store F0-F14 into state and return 4754 for (i = 0; i < 7; i++) { 4755 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4756 } 4757 __ retl(); 4758 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4759 4760 __ BIND(L_sha512_unaligned_input); 4761 __ alignaddr(buf, G0, buf); 4762 4763 __ BIND(L_sha512_unaligned_input_loop); 4764 // load buf into F16-F46 4765 for (i = 0; i < 17; i++) { 4766 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4767 } 4768 for (i = 0; i < 16; i++) { 4769 __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16)); 4770 } 4771 __ sha512(); 4772 if (multi_block) { 4773 __ add(ofs, 128, ofs); 4774 __ add(buf, 128, buf); 4775 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop); 4776 __ mov(ofs, O0); // to be returned 4777 } 4778 4779 // store F0-F14 into state and return 4780 for (i = 0; i < 7; i++) { 4781 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4782 } 4783 __ retl(); 4784 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4785 4786 return start; 4787 } 4788 4789 void generate_initial() { 4790 // Generates all stubs and initializes the entry points 4791 4792 //------------------------------------------------------------------------------------------------------------------------ 4793 // entry points that exist in all platforms 4794 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 4795 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 4796 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4797 4798 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address); 4799 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4800 4801 //------------------------------------------------------------------------------------------------------------------------ 4802 // entry points that are platform specific 4803 StubRoutines::Sparc::_test_stop_entry = generate_test_stop(); 4804 4805 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine(); 4806 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows(); 4807 4808 #if !defined(COMPILER2) && !defined(_LP64) 4809 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 4810 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 4811 StubRoutines::_atomic_add_entry = generate_atomic_add(); 4812 StubRoutines::_atomic_xchg_ptr_entry = StubRoutines::_atomic_xchg_entry; 4813 StubRoutines::_atomic_cmpxchg_ptr_entry = StubRoutines::_atomic_cmpxchg_entry; 4814 StubRoutines::_atomic_cmpxchg_byte_entry = ShouldNotCallThisStub(); 4815 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 4816 StubRoutines::_atomic_add_ptr_entry = StubRoutines::_atomic_add_entry; 4817 #endif // COMPILER2 !=> _LP64 4818 4819 // Build this early so it's available for the interpreter. 4820 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); 4821 } 4822 4823 4824 void generate_all() { 4825 // Generates all stubs and initializes the entry points 4826 4827 // Generate partial_subtype_check first here since its code depends on 4828 // UseZeroBaseCompressedOops which is defined after heap initialization. 4829 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check(); 4830 // These entry points require SharedInfo::stack0 to be set up in non-core builds 4831 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError)); 4832 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError)); 4833 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call)); 4834 4835 StubRoutines::_handler_for_unsafe_access_entry = 4836 generate_handler_for_unsafe_access(); 4837 4838 // support for verify_oop (must happen after universe_init) 4839 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine(); 4840 4841 // arraycopy stubs used by compilers 4842 generate_arraycopy_stubs(); 4843 4844 // Don't initialize the platform math functions since sparc 4845 // doesn't have intrinsics for these operations. 4846 4847 // Safefetch stubs. 4848 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4849 &StubRoutines::_safefetch32_fault_pc, 4850 &StubRoutines::_safefetch32_continuation_pc); 4851 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4852 &StubRoutines::_safefetchN_fault_pc, 4853 &StubRoutines::_safefetchN_continuation_pc); 4854 4855 // generate AES intrinsics code 4856 if (UseAESIntrinsics) { 4857 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4858 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4859 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4860 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 4861 } 4862 4863 // generate SHA1/SHA256/SHA512 intrinsics code 4864 if (UseSHA1Intrinsics) { 4865 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4866 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4867 } 4868 if (UseSHA256Intrinsics) { 4869 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4870 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4871 } 4872 if (UseSHA512Intrinsics) { 4873 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 4874 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 4875 } 4876 } 4877 4878 4879 public: 4880 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4881 // replace the standard masm with a special one: 4882 _masm = new MacroAssembler(code); 4883 4884 _stub_count = !all ? 0x100 : 0x200; 4885 if (all) { 4886 generate_all(); 4887 } else { 4888 generate_initial(); 4889 } 4890 4891 // make sure this stub is available for all local calls 4892 if (_atomic_add_stub.is_unbound()) { 4893 // generate a second time, if necessary 4894 (void) generate_atomic_add(); 4895 } 4896 } 4897 4898 4899 private: 4900 int _stub_count; 4901 void stub_prolog(StubCodeDesc* cdesc) { 4902 # ifdef ASSERT 4903 // put extra information in the stub code, to make it more readable 4904 #ifdef _LP64 4905 // Write the high part of the address 4906 // [RGV] Check if there is a dependency on the size of this prolog 4907 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none); 4908 #endif 4909 __ emit_data((intptr_t)cdesc, relocInfo::none); 4910 __ emit_data(++_stub_count, relocInfo::none); 4911 # endif 4912 align(true); 4913 } 4914 4915 void align(bool at_header = false) { 4916 // %%%%% move this constant somewhere else 4917 // UltraSPARC cache line size is 8 instructions: 4918 const unsigned int icache_line_size = 32; 4919 const unsigned int icache_half_line_size = 16; 4920 4921 if (at_header) { 4922 while ((intptr_t)(__ pc()) % icache_line_size != 0) { 4923 __ emit_data(0, relocInfo::none); 4924 } 4925 } else { 4926 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) { 4927 __ nop(); 4928 } 4929 } 4930 } 4931 4932 }; // end class declaration 4933 4934 void StubGenerator_generate(CodeBuffer* code, bool all) { 4935 StubGenerator g(code, all); 4936 }