/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSetCodeGen.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note:  The register L7 is used as L7_thread_cache, and may not be used
//        any other way within this module.


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    //     +---------------+ <--- sp + 0
    //     |               |
    //     . reg save area .
    //     |               |
    //     +---------------+ <--- sp + 0x40
    //     |               |
    //     . extra 7 slots .
    //     |               |
    //     +---------------+ <--- sp + 0x5c
    //     |  param. size  |
    //     +---------------+ <--- sp + 0x60
    //     |    thread     |
    //     +---------------+
    //     |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()
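    // For reference, a C-level sketch of how this stub is invoked (the real
    // typedef lives in stubRoutines.hpp; the exact parameter list shown here
    // is illustrative, not authoritative):
    //
    //   typedef void (*CallStub)(address   link,
    //                            intptr_t* result,
    //                            BasicType result_type,
    //                            Method*   method,
    //                            address   entry_point,
    //                            intptr_t* parameters,
    //                            int       size_of_parameters,
    //                            TRAPS);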
    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    //     +---------------+ <--- sp + 0
    //     |               |
    //     . reg save area .
    //     |               |
    //     +---------------+ <--- sp + 0x40
    //     |               |
    //     . extra 7 slots .
    //     |               |
    //     +---------------+ <--- sp + 0x5c
    //     |  empty slot   |  (only if parameter size is even)
    //     +---------------+
    //     |               |
    //     .  parameters   .
    //     |               |
    //     +---------------+ <--- fp + 0
    //     |               |
    //     . reg save area .
    //     |               |
    //     +---------------+ <--- fp + 0x40
    //     |               |
    //     . extra 7 slots .
    //     |               |
    //     +---------------+ <--- fp + 0x5c
    //     |  param. size  |
    //     +---------------+ <--- fp + 0x60
    //     |    thread     |
    //     +---------------+
    //     |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);           // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }
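    // A note on the SPARC branch-delay-slot idiom used throughout this file
    // (an illustrative sketch, not a new convention):
    //
    //   __ br(cond, annul, prediction, target);  // the branch itself
    //   __ delayed()->insn();                    // 'insn' sits in the delay
    //                                            // slot and executes before
    //                                            // the branch takes effect
    //                                            // (unless annulled)
    //
    // For example, the sub() in the loop above executes on every iteration,
    // including the final one where the branch falls through.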
    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
    __ sub(FP, t, Gargs);                              // setup parameter pointer
    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert that SP is unchanged.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);   // store entire long
    }
    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset  ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset  ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   o0 = adr
    //   o1 = errValue
    //
    // result:
    //   o0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    __ align(CodeEntryAlignment);
    *entry = __ pc();

    __ mov(O0, G1);  // g1 = o0
    __ mov(O1, O0);  // o0 = o1
    // Load *adr into O0; this is the instruction that may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldsw(G1, 0, O0);  // o0 = [g1]
        break;
      case 8:
        // int64_t
        __ ldx(G1, 0, O0);   // o0 = [g1]
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    // By convention with the trap handler we ensure there is a non-CTI
    // instruction in the trap shadow.
    __ nop();
    __ retl();
    __ delayed()->nop();
  }
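  // Caller-side view of the stubs generated above (an illustrative sketch;
  // the actual wrappers live in stubRoutines.hpp): a faulting load does not
  // crash the VM, because the signal handler recognizes *fault_pc and resumes
  // execution at *continuation_pc with the error value already in O0.
  //
  //   int v = SafeFetch32(possibly_bad_ptr, -1);  // yields -1 if the load faults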
  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size  = 32;

    CodeBuffer      code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();               // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }

#undef __
#define __ _masm->


  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20;  i < 32;  i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0;  i < 8;  ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flushw();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }
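  // C-level equivalent of the CAS-based exchange loop in the stub below
  // (an illustrative sketch only; the generated entry point is installed
  // as the VM's atomic_xchg stub):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint observed;
  //     do {
  //       observed = *dest;                            // ld
  //     } while (cas(dest, observed, exchange_value)   // cas returns old value
  //              != observed);                         // retry if it changed
  //     return observed;
  //   }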
  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //      O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller
    } else {
      __ retl(false);
      __ delayed()->swap(O1, 0, O0);
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //      O0: the value previously stored in dest
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas(O1, O2, O0);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //      O1:O0: the value previously stored in dest
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0, O1, O0);   // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);   // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);  // casx compares [O2] with O3 and swaps in O0
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }
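  // The packing/unpacking above is equivalent to (an illustrative sketch;
  // 'hi' and 'lo' stand for the two 32-bit register halves of a jlong
  // argument):
  //
  //   uint64_t pack(uint32_t hi, uint32_t lo) {
  //     return ((uint64_t)hi << 32) | lo;
  //   }
  //   // and on return:  O1 = (uint32_t)result;  O0 = result >> 32;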
  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //      O0: the new value stored in dest
  //
  // Overwrites: O3
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    Label retry;
    __ BIND(retry);

    __ lduw(O1, 0, O2);
    __ add(O0, O2, O3);
    __ cas(O1, O2, O3);
    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
    __ retl(false);
    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs
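  // C-level equivalent of the add loop above (an illustrative sketch):
  //
  //   jint atomic_add(jint add_value, volatile jint* dest) {
  //     jint old, sum;
  //     do {
  //       old = *dest;                         // lduw
  //       sum = old + add_value;               // add
  //     } while (cas(dest, old, sum) != old);  // cas + retry on contention
  //     return old + add_value;                // the new value
  //   }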
  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
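  // The two branches above take the no-overlap exit when (illustrative
  // sketch of the condition):
  //
  //   to <= from                                    // forward copy is safe
  //   || (to - from) >= (count << log2_elem_size)   // dst beyond end of src
  //
  // otherwise the stub falls through to the backward-copy variant.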
  //
  //  Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes + iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, prefetch_count, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ add(count, prefetch_count, count); // restore count

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, prefetch_count, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ add(count, prefetch_count, count); // restore count

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift, O3);
  }
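  // What one iteration of the loop above computes (an illustrative sketch;
  // O3 carries the partially-assembled doubleword across iterations):
  //
  //   store( O3 | (O4 >> right_shift) );                  // first aligned 8 bytes
  //   store( (O4 << left_shift) | (G4 >> right_shift) );  // second 8 bytes
  //   O3 = G4 << left_shift;                              // carry into next iteration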
  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source array address
  //   to        - destination array address, aligned to 8 bytes
  //   count     - element count to copy, at least the equivalent of 16 bytes
  //   count_dec - element-count decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                                        Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift, O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source array end address
  //   end_to    - destination array end address, aligned to 8 bytes
  //   count     - element count to copy, at least the equivalent of 16 bytes
  //   count_dec - element-count decrement equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                                         Register count, int count_dec,
                                         Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
    //
    __ andn(end_from, 7, end_from); // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }
  //
  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in the 32-bit VM and 8-byte
      // alignment in the 64-bit VM, so this alignment step is needed only
      // in the 32-bit VM.
      //
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // number of bytes to copy to reach the next 8-byte alignment
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
      __ BIND(L_skip_alignment);
    }
    if (!aligned) {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
    }

    // Both arrays are 8-byte aligned, copy 16 bytes at a time
    __ and3(count, 7, G4); // Save count
    __ srl(count, 3, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ ldub(from, offset, O3);
    __ deccc(count);
    __ stb(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->inc(offset);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
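  // A note on the threshold above (illustrative arithmetic): count >= 23
  // guarantees that after peeling off at most 7 bytes to 8-byte-align 'to',
  // at least 16 bytes remain, which is the minimum the 16-bytes-per-iteration
  // copy paths require.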
  //
  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align the ends of the arrays, since they may not be aligned even
      // when the arrays themselves are aligned.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
      __ BIND(L_skip_alignment);
    }
    if (aligned) {
      // Both arrays are aligned to 8 bytes in the 64-bit VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // only in the unaligned case.
      __ dec(count, 16);
    } else {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtract 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 16 elements (16 bytes) at a time
    __ align(OptoLoopAlignment);
    __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
  //
  //  Generate stub for disjoint short copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in the 32-bit VM
      // and 8-byte alignment in the 64-bit VM.
      //
    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
      __ andcc(to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(from, 0, O3);
      __ inc(from, 2);
      __ inc(to, 2);
      __ dec(count);
      __ sth(O3, to, -2);
      __ BIND(L_skip_alignment);

      // copy 2 elements to align 'to' on an 8 byte boundary
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(from, 0, O3);
      __ dec(count, 2);
      __ lduh(from, 2, O4);
      __ inc(from, 4);
      __ inc(to, 4);
      __ sth(O3, to, -4);
      __ sth(O4, to, -2);
      __ BIND(L_skip_alignment2);
    }
    if (!aligned) {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
    }

    // Both arrays are 8-byte aligned, copy 16 bytes at a time
    __ and3(count, 3, G4); // Save
    __ srl(count, 2, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // restore

    // copy 1 element at a time
    __ BIND(L_copy_2_bytes);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_2_bytes_loop);
    __ lduh(from, offset, O3);
    __ deccc(count);
    __ sth(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
    __ delayed()->inc(offset, 2);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
  //
  //  Generate stub for array fill (used for byte, short and int fills).
  //  If "aligned" is true, the "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      to:    O0
  //      value: O1
  //      count: O2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char* name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register to        = O0;   // destination array address
    const Register value     = O1;   // fill value
    const Register count     = O2;   // elements count
    // O3 is used as a temp register

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 2;
        break;
      case T_SHORT:
        shift = 1;
        break;
      case T_INT:
        shift = 0;
        break;
      default: ShouldNotReachHere();
    }

    BLOCK_COMMENT("Entry:");

    if (t == T_BYTE) {
      // Zero extend value
      __ and3(value, 0xff, value);
      __ sllx(value, 8, O3);
      __ or3(value, O3, value);
    }
    if (t == T_SHORT) {
      // Zero extend value
      __ sllx(value, 48, value);
      __ srlx(value, 48, value);
    }
    if (t == T_BYTE || t == T_SHORT) {
      __ sllx(value, 16, O3);
      __ or3(value, O3, value);
    }
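    // The replication above smears the fill value across the register
    // (illustrative sketch for T_BYTE; T_SHORT starts at the second step,
    // and the final 32->64 step happens just before the wide loops below):
    //
    //   v |= v << 8;    // 0x000000ab -> 0x0000abab
    //   v |= v << 16;   // 0x0000abab -> 0xabababab
    //   v |= v << 32;   // later: 0xabababab'abababab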
    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
    __ delayed()->andcc(count, 1, G0);

    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
      // align source address at 4 bytes address boundary
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays
        __ andcc(to, 1, G0);
        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
        __ delayed()->nop();
        __ stb(value, to, 0);
        __ inc(to, 1);
        __ dec(count, 1);
        __ BIND(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays
      __ andcc(to, 2, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
      __ delayed()->nop();
      __ sth(value, to, 0);
      __ inc(to, 2);
      __ dec(count, 1 << (shift - 1));
      __ BIND(L_skip_align2);
    }
    if (!aligned) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
      __ delayed()->nop();
      __ stw(value, to, 0);
      __ inc(to, 4);
      __ dec(count, 1 << shift);
      __ BIND(L_fill_32_bytes);
    }

    if (t == T_INT) {
      // Zero extend value
      __ srl(value, 0, value);
    }
    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
      __ sllx(value, 32, O3);
      __ or3(value, O3, value);
    }

    Label L_check_fill_8_bytes;
    // Fill 32-byte chunks
    __ subcc(count, 8 << shift, count);
    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
    __ delayed()->nop();

    Label L_fill_32_bytes_loop, L_fill_4_bytes;
    __ align(16);
    __ BIND(L_fill_32_bytes_loop);

    __ stx(value, to, 0);
    __ stx(value, to, 8);
    __ stx(value, to, 16);
    __ stx(value, to, 24);

    __ subcc(count, 8 << shift, count);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
    __ delayed()->add(to, 32, to);

    __ BIND(L_check_fill_8_bytes);
    __ addcc(count, 8 << shift, count);
    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
    __ delayed()->subcc(count, 1 << (shift + 1), count);
    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
    __ delayed()->andcc(count, 1<<shift, G0);

    //
    // length is too short, just fill 8 bytes at a time
    //
    Label L_fill_8_bytes_loop;
    __ BIND(L_fill_8_bytes_loop);
    __ stx(value, to, 0);
    __ subcc(count, 1 << (shift + 1), count);
    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
    __ delayed()->add(to, 8, to);

    // fill trailing 4 bytes
    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
    if (t == T_INT) {
      __ BIND(L_fill_elements);
    }
    __ BIND(L_fill_4_bytes);
    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
    if (t == T_BYTE || t == T_SHORT) {
      __ delayed()->andcc(count, 1<<(shift-1), G0);
    } else {
      __ delayed()->nop();
    }
    __ stw(value, to, 0);
    if (t == T_BYTE || t == T_SHORT) {
      __ inc(to, 4);
      // fill trailing 2 bytes
      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
      __ BIND(L_fill_2_bytes);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
      __ delayed()->andcc(count, 1, count);
      __ sth(value, to, 0);
      if (t == T_BYTE) {
        __ inc(to, 2);
        // fill trailing byte
        __ andcc(count, 1, count);  // in delay slot of branches
        __ BIND(L_fill_byte);
        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
        __ delayed()->nop();
        __ stb(value, to, 0);
      } else {
        __ BIND(L_fill_byte);
      }
    } else {
      __ BIND(L_fill_2_bytes);
    }
    __ BIND(L_exit);
    __ retl();
    __ delayed()->nop();

    // Handle fills of less than 8 bytes. Int is handled elsewhere.
    if (t == T_BYTE) {
      __ BIND(L_fill_elements);
      Label L_fill_2, L_fill_4;
      // in delay slot __ andcc(count, 1, G0);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
      __ delayed()->andcc(count, 2, G0);
      __ stb(value, to, 0);
      __ inc(to, 1);
      __ BIND(L_fill_2);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
      __ delayed()->andcc(count, 4, G0);
      __ stb(value, to, 0);
      __ stb(value, to, 1);
      __ inc(to, 2);
      __ BIND(L_fill_4);
      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
      __ delayed()->nop();
      __ stb(value, to, 0);
      __ stb(value, to, 1);
      __ stb(value, to, 2);
      __ retl();
      __ delayed()->stb(value, to, 3);
    }

    if (t == T_SHORT) {
      Label L_fill_2;
      __ BIND(L_fill_elements);
      // in delay slot __ andcc(count, 1, G0);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
      __ delayed()->andcc(count, 2, G0);
      __ sth(value, to, 0);
      __ inc(to, 2);
      __ BIND(L_fill_2);
      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
      __ delayed()->nop();
      __ sth(value, to, 0);
      __ retl();
      __ delayed()->sth(value, to, 2);
    }
    return start;
  }

  //
  //  Generate stub for conjoint short copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    // Do reverse copy.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    const Register byte_count = O3;  // bytes count to copy

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 1);

    __ sllx(count, LogBytesPerShort, byte_count);
    __ add(to, byte_count, end_to);  // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->add(from, byte_count, end_from);

    {
      // Align the ends of the arrays, since they may not be aligned even
      // when the arrays themselves are aligned.

      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
      __ andcc(end_to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(end_from, -2, O3);
      __ dec(end_from, 2);
      __ dec(end_to, 2);
      __ dec(count);
      __ sth(O3, end_to, 0);
      __ BIND(L_skip_alignment);

      // copy 2 elements to align 'end_to' on an 8 byte boundary
      __ andcc(end_to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(end_from, -2, O3);
      __ dec(count, 2);
      __ lduh(end_from, -4, O4);
      __ dec(end_from, 4);
      __ dec(end_to, 4);
      __ sth(O3, end_to, 2);
      __ sth(O4, end_to, 0);
      __ BIND(L_skip_alignment2);
    }
    if (aligned) {
      // Both arrays are aligned to 8 bytes in the 64-bit VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // only in the unaligned case.
      __ dec(count, 8);
    } else {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtract 8 from 'count' before the jump).
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.
1688 1689 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1690 L_aligned_copy, L_copy_2_bytes); 1691 } 1692 // copy 4 elements (16 bytes) at a time 1693 __ align(OptoLoopAlignment); 1694 __ BIND(L_aligned_copy); 1695 __ dec(end_from, 16); 1696 __ ldx(end_from, 8, O3); 1697 __ ldx(end_from, 0, O4); 1698 __ dec(end_to, 16); 1699 __ deccc(count, 8); 1700 __ stx(O3, end_to, 8); 1701 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1702 __ delayed()->stx(O4, end_to, 0); 1703 __ inc(count, 8); 1704 1705 // copy 1 element (2 bytes) at a time 1706 __ BIND(L_copy_2_bytes); 1707 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1708 __ BIND(L_copy_2_bytes_loop); 1709 __ dec(end_from, 2); 1710 __ dec(end_to, 2); 1711 __ lduh(end_from, 0, O4); 1712 __ deccc(count); 1713 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 1714 __ delayed()->sth(O4, end_to, 0); 1715 1716 __ BIND(L_exit); 1717 // O3, O4 are used as temp registers 1718 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1719 __ retl(); 1720 __ delayed()->mov(G0, O0); // return 0 1721 return start; 1722 } 1723 1724 // 1725 // Helper methods for generate_disjoint_int_copy_core() 1726 // 1727 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 1728 Label& L_loop, bool use_prefetch, bool use_bis) { 1729 1730 __ align(OptoLoopAlignment); 1731 __ BIND(L_loop); 1732 if (use_prefetch) { 1733 if (ArraycopySrcPrefetchDistance > 0) { 1734 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1735 } 1736 if (ArraycopyDstPrefetchDistance > 0) { 1737 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1738 } 1739 } 1740 __ ldx(from, 4, O4); 1741 __ ldx(from, 12, G4); 1742 __ inc(to, 16); 1743 __ inc(from, 16); 1744 __ deccc(count, 4); // Can we do the next iteration after this one? 1745 1746 __ srlx(O4, 32, G3); 1747 __ bset(G3, O3); 1748 __ sllx(O4, 32, O4); 1749 __ srlx(G4, 32, G3); 1750 __ bset(G3, O4); 1751 if (use_bis) { 1752 __ stxa(O3, to, -16); 1753 __ stxa(O4, to, -8); 1754 } else { 1755 __ stx(O3, to, -16); 1756 __ stx(O4, to, -8); 1757 } 1758 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1759 __ delayed()->sllx(G4, 32, O3); 1760 1761 } 1762 1763 // 1764 // Generate core code for disjoint int copy (and oop copy on 32-bit). 1765 // If "aligned" is true, the "from" and "to" addresses are assumed 1766 // to be heapword aligned. 1767 // 1768 // Arguments: 1769 // from: O0 1770 // to: O1 1771 // count: O2 treated as signed 1772 // 1773 void generate_disjoint_int_copy_core(bool aligned) { 1774 1775 Label L_skip_alignment, L_aligned_copy; 1776 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1777 1778 const Register from = O0; // source array address 1779 const Register to = O1; // destination array address 1780 const Register count = O2; // elements count 1781 const Register offset = O5; // offset from start of arrays 1782 // O3, O4, G3, G4 are used as temp registers 1783 1784 // 'aligned' == true when it is known statically during compilation 1785 // of this arraycopy call site that both 'from' and 'to' addresses 1786 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1787 // 1788 // Aligned arrays have 4-byte alignment in the 32-bit VM 1789 // and 8-byte alignment in the 64-bit VM.
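// (Consequently, with aligned == true the 64-bit VM guarantees 8-byte
// alignment of both addresses, so only the !aligned path below needs
// the alignment fix-ups and the shifted copy.)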
1790 // 1791 if (!aligned) 1792 { 1793 // The next check could be put under 'ifndef' since the code in 1794 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'. 1795 1796 // for short arrays, just do single element copy 1797 __ cmp(count, 5); // 4 + 1 (20 bytes) 1798 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 1799 __ delayed()->mov(G0, offset); 1800 1801 // copy 1 element to align 'to' on an 8-byte boundary 1802 __ andcc(to, 7, G0); 1803 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1804 __ delayed()->ld(from, 0, O3); 1805 __ inc(from, 4); 1806 __ inc(to, 4); 1807 __ dec(count); 1808 __ st(O3, to, -4); 1809 __ BIND(L_skip_alignment); 1810 1811 // if the arrays have the same alignment mod 8, copy 4 elements at a time 1812 __ andcc(from, 7, G0); 1813 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1814 __ delayed()->ld(from, 0, O3); 1815 1816 // 1817 // Load 2 aligned 8-byte chunks and use one from the previous iteration 1818 // to form 2 aligned 8-byte chunks to store. 1819 // 1820 // copy_16_bytes_forward_with_shift() is not used here since this 1821 // code is more efficient. 1822 1823 // copy with shift 4 elements (16 bytes) at a time 1824 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 1825 __ sllx(O3, 32, O3); 1826 1827 disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop); 1828 1829 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1830 __ delayed()->inc(count, 4); // restore 'count' 1831 1832 __ BIND(L_aligned_copy); 1833 } // !aligned 1834 1835 // copy 4 elements (16 bytes) at a time 1836 __ and3(count, 1, G4); // Save 1837 __ srl(count, 1, count); 1838 generate_disjoint_long_copy_core(aligned); 1839 __ mov(G4, count); // Restore 1840 1841 // copy 1 element at a time 1842 __ BIND(L_copy_4_bytes); 1843 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1844 __ BIND(L_copy_4_bytes_loop); 1845 __ ld(from, offset, O3); 1846 __ deccc(count); 1847 __ st(O3, to, offset); 1848 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 1849 __ delayed()->inc(offset, 4); 1850 __ BIND(L_exit); 1851 } 1852 1853 // 1854 // Generate stub for disjoint int copy. If "aligned" is true, the 1855 // "from" and "to" addresses are assumed to be heapword aligned. 1856 // 1857 // Arguments for generated stub: 1858 // from: O0 1859 // to: O1 1860 // count: O2 treated as signed 1861 // 1862 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 1863 __ align(CodeEntryAlignment); 1864 StubCodeMark mark(this, "StubRoutines", name); 1865 address start = __ pc(); 1866 1867 const Register count = O2; 1868 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1869 1870 if (entry != NULL) { 1871 *entry = __ pc(); 1872 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1873 BLOCK_COMMENT("Entry:"); 1874 } 1875 1876 generate_disjoint_int_copy_core(aligned); 1877 1878 // O3, O4 are used as temp registers 1879 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 1880 __ retl(); 1881 __ delayed()->mov(G0, O0); // return 0 1882 return start; 1883 } 1884 1885 // 1886 // Generate core code for conjoint int copy (and oop copy on 32-bit). 1887 // If "aligned" is true, the "from" and "to" addresses are assumed 1888 // to be heapword aligned.
1889 // 1890 // Arguments: 1891 // from: O0 1892 // to: O1 1893 // count: O2 treated as signed 1894 // 1895 void generate_conjoint_int_copy_core(bool aligned) { 1896 // Do reverse copy. 1897 1898 Label L_skip_alignment, L_aligned_copy; 1899 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1900 1901 const Register from = O0; // source array address 1902 const Register to = O1; // destination array address 1903 const Register count = O2; // elements count 1904 const Register end_from = from; // source array end address 1905 const Register end_to = to; // destination array end address 1906 // O3, O4, O5, G3 are used as temp registers 1907 1908 const Register byte_count = O3; // bytes count to copy 1909 1910 __ sllx(count, LogBytesPerInt, byte_count); 1911 __ add(to, byte_count, end_to); // offset after last copied element 1912 1913 __ cmp(count, 5); // for short arrays, just do single element copy 1914 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 1915 __ delayed()->add(from, byte_count, end_from); 1916 1917 // copy 1 element to align 'to' on an 8-byte boundary 1918 __ andcc(end_to, 7, G0); 1919 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1920 __ delayed()->nop(); 1921 __ dec(count); 1922 __ dec(end_from, 4); 1923 __ dec(end_to, 4); 1924 __ ld(end_from, 0, O4); 1925 __ st(O4, end_to, 0); 1926 __ BIND(L_skip_alignment); 1927 1928 // Check if 'end_from' and 'end_to' have the same alignment. 1929 __ andcc(end_from, 7, G0); 1930 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1931 __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4 1932 1933 // copy with shift 4 elements (16 bytes) at a time 1934 // 1935 // Load 2 aligned 8-byte chunks and use one from the previous iteration 1936 // to form 2 aligned 8-byte chunks to store. 1937 // 1938 __ ldx(end_from, -4, O3); 1939 __ align(OptoLoopAlignment); 1940 __ BIND(L_copy_16_bytes); 1941 __ ldx(end_from, -12, O4); 1942 __ deccc(count, 4); 1943 __ ldx(end_from, -20, O5); 1944 __ dec(end_to, 16); 1945 __ dec(end_from, 16); 1946 __ srlx(O3, 32, O3); 1947 __ sllx(O4, 32, G3); 1948 __ bset(G3, O3); 1949 __ stx(O3, end_to, 8); 1950 __ srlx(O4, 32, O4); 1951 __ sllx(O5, 32, G3); 1952 __ bset(O4, G3); 1953 __ stx(G3, end_to, 0); 1954 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 1955 __ delayed()->mov(O5, O3); 1956 1957 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1958 __ delayed()->inc(count, 4); 1959 1960 // copy 4 elements (16 bytes) at a time 1961 __ align(OptoLoopAlignment); 1962 __ BIND(L_aligned_copy); 1963 __ dec(end_from, 16); 1964 __ ldx(end_from, 8, O3); 1965 __ ldx(end_from, 0, O4); 1966 __ dec(end_to, 16); 1967 __ deccc(count, 4); 1968 __ stx(O3, end_to, 8); 1969 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1970 __ delayed()->stx(O4, end_to, 0); 1971 __ inc(count, 4); 1972 1973 // copy 1 element (4 bytes) at a time 1974 __ BIND(L_copy_4_bytes); 1975 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1976 __ BIND(L_copy_4_bytes_loop); 1977 __ dec(end_from, 4); 1978 __ dec(end_to, 4); 1979 __ ld(end_from, 0, O4); 1980 __ deccc(count); 1981 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 1982 __ delayed()->st(O4, end_to, 0); 1983 __ BIND(L_exit); 1984 } 1985 1986 // 1987 // Generate stub for conjoint int copy. If "aligned" is true, the 1988 // "from" and "to" addresses are assumed to be heapword aligned.
1989 // 1990 // Arguments for generated stub: 1991 // from: O0 1992 // to: O1 1993 // count: O2 treated as signed 1994 // 1995 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1996 address *entry, const char *name) { 1997 __ align(CodeEntryAlignment); 1998 StubCodeMark mark(this, "StubRoutines", name); 1999 address start = __ pc(); 2000 2001 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2002 2003 if (entry != NULL) { 2004 *entry = __ pc(); 2005 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2006 BLOCK_COMMENT("Entry:"); 2007 } 2008 2009 array_overlap_test(nooverlap_target, 2); 2010 2011 generate_conjoint_int_copy_core(aligned); 2012 2013 // O3, O4 are used as temp registers 2014 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2015 __ retl(); 2016 __ delayed()->mov(G0, O0); // return 0 2017 return start; 2018 } 2019 2020 // 2021 // Helper methods for generate_disjoint_long_copy_core() 2022 // 2023 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2024 Label& L_loop, bool use_prefetch, bool use_bis) { 2025 __ align(OptoLoopAlignment); 2026 __ BIND(L_loop); 2027 for (int off = 0; off < 64; off += 16) { 2028 if (use_prefetch && (off & 31) == 0) { 2029 if (ArraycopySrcPrefetchDistance > 0) { 2030 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 2031 } 2032 if (ArraycopyDstPrefetchDistance > 0) { 2033 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 2034 } 2035 } 2036 __ ldx(from, off+0, O4); 2037 __ ldx(from, off+8, O5); 2038 if (use_bis) { 2039 __ stxa(O4, to, off+0); 2040 __ stxa(O5, to, off+8); 2041 } else { 2042 __ stx(O4, to, off+0); 2043 __ stx(O5, to, off+8); 2044 } 2045 } 2046 __ deccc(count, 8); 2047 __ inc(from, 64); 2048 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2049 __ delayed()->inc(to, 64); 2050 } 2051 2052 // 2053 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2054 // "aligned" is ignored, because we must make the stronger 2055 // assumption that both addresses are always 64-bit aligned. 
2056 // 2057 // Arguments: 2058 // from: O0 2059 // to: O1 2060 // count: O2 treated as signed 2061 // 2062 // count -= 2; 2063 // if ( count >= 0 ) { // >= 2 elements 2064 // if ( count >= 6 ) { // >= 8 elements 2065 // count -= 6; // original count - 8 2066 // do { 2067 // copy_8_elements; 2068 // count -= 8; 2069 // } while ( count >= 0 ); 2070 // count += 6; 2071 // } 2072 // if ( count >= 0 ) { // >= 2 elements 2073 // do { 2074 // copy_2_elements; 2075 // } while ( (count=count-2) >= 0 ); 2076 // } 2077 // } 2078 // count += 2; 2079 // if ( count != 0 ) { // 1 element left 2080 // copy_1_element; 2081 // } 2082 // 2083 void generate_disjoint_long_copy_core(bool aligned) { 2084 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2085 const Register from = O0; // source array address 2086 const Register to = O1; // destination array address 2087 const Register count = O2; // elements count 2088 const Register offset0 = O4; // element offset 2089 const Register offset8 = O5; // next element offset 2090 2091 __ deccc(count, 2); 2092 __ mov(G0, offset0); // offset from start of arrays (0) 2093 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2094 __ delayed()->add(offset0, 8, offset8); 2095 2096 // Copy in 64-byte chunks 2097 2098 const Register from64 = O3; // source address 2099 const Register to64 = G3; // destination address 2100 __ subcc(count, 6, O3); 2101 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2102 __ delayed()->mov(to, to64); 2103 // Now we can use O4(offset0), O5(offset8) as temps 2104 __ mov(O3, count); 2105 // count >= 0 (original count - 8) 2106 __ mov(from, from64); 2107 2108 disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop); 2109 2110 // Restore O4(offset0), O5(offset8) 2111 __ sub(from64, from, offset0); 2112 __ inccc(count, 6); // restore count 2113 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2114 __ delayed()->add(offset0, 8, offset8); 2115 2116 // Copy in 16-byte chunks 2117 __ align(OptoLoopAlignment); 2118 __ BIND(L_copy_16_bytes); 2119 __ ldx(from, offset0, O3); 2120 __ ldx(from, offset8, G3); 2121 __ deccc(count, 2); 2122 __ stx(O3, to, offset0); 2123 __ inc(offset0, 16); 2124 __ stx(G3, to, offset8); 2125 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2126 __ delayed()->inc(offset8, 16); 2127 2128 // Copy last 8 bytes 2129 __ BIND(L_copy_8_bytes); 2130 __ inccc(count, 2); 2131 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2132 __ delayed()->mov(offset0, offset8); // Set O5 (used by other stubs) 2133 __ ldx(from, offset0, O3); 2134 __ stx(O3, to, offset0); 2135 __ BIND(L_exit); 2136 } 2137 2138 // 2139 // Generate stub for disjoint long copy. 2140 // "aligned" is ignored, because we must make the stronger 2141 // assumption that both addresses are always 64-bit aligned. 2142 // 2143 // Arguments for generated stub: 2144 // from: O0 2145 // to: O1 2146 // count: O2 treated as signed 2147 // 2148 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2149 __ align(CodeEntryAlignment); 2150 StubCodeMark mark(this, "StubRoutines", name); 2151 address start = __ pc(); 2152 2153 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
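// (A 'clean' int in a 64-bit register has bits 63..32 equal to the sign
// extension of bit 31; e.g. 5 is 0x0000000000000005 and -1 is
// 0xFFFFFFFFFFFFFFFF. assert_clean_int() is a debug-time verification of
// this invariant, not a repair of a dirty value.)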
2154 2155 if (entry != NULL) { 2156 *entry = __ pc(); 2157 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2158 BLOCK_COMMENT("Entry:"); 2159 } 2160 2161 generate_disjoint_long_copy_core(aligned); 2162 2163 // O3, O4 are used as temp registers 2164 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2165 __ retl(); 2166 __ delayed()->mov(G0, O0); // return 0 2167 return start; 2168 } 2169 2170 // 2171 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2172 // "aligned" is ignored, because we must make the stronger 2173 // assumption that both addresses are always 64-bit aligned. 2174 // 2175 // Arguments: 2176 // from: O0 2177 // to: O1 2178 // count: O2 treated as signed 2179 // 2180 void generate_conjoint_long_copy_core(bool aligned) { 2181 // Do reverse copy. 2182 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2183 const Register from = O0; // source array address 2184 const Register to = O1; // destination array address 2185 const Register count = O2; // elements count 2186 const Register offset8 = O4; // element offset 2187 const Register offset0 = O5; // previous element offset 2188 2189 __ subcc(count, 1, count); 2190 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2191 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2192 __ sub(offset8, 8, offset0); 2193 __ align(OptoLoopAlignment); 2194 __ BIND(L_copy_16_bytes); 2195 __ ldx(from, offset8, O2); 2196 __ ldx(from, offset0, O3); 2197 __ stx(O2, to, offset8); 2198 __ deccc(offset8, 16); // use offset8 as counter 2199 __ stx(O3, to, offset0); 2200 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2201 __ delayed()->dec(offset0, 16); 2202 2203 __ BIND(L_copy_8_bytes); 2204 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2205 __ delayed()->nop(); 2206 __ ldx(from, 0, O3); 2207 __ stx(O3, to, 0); 2208 __ BIND(L_exit); 2209 } 2210 2211 // Generate stub for conjoint long copy. 2212 // "aligned" is ignored, because we must make the stronger 2213 // assumption that both addresses are always 64-bit aligned. 2214 // 2215 // Arguments for generated stub: 2216 // from: O0 2217 // to: O1 2218 // count: O2 treated as signed 2219 // 2220 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2221 address *entry, const char *name) { 2222 __ align(CodeEntryAlignment); 2223 StubCodeMark mark(this, "StubRoutines", name); 2224 address start = __ pc(); 2225 2226 assert(aligned, "Should always be aligned"); 2227 2228 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2229 2230 if (entry != NULL) { 2231 *entry = __ pc(); 2232 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2233 BLOCK_COMMENT("Entry:"); 2234 } 2235 2236 array_overlap_test(nooverlap_target, 3); 2237 2238 generate_conjoint_long_copy_core(aligned); 2239 2240 // O3, O4 are used as temp registers 2241 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2242 __ retl(); 2243 __ delayed()->mov(G0, O0); // return 0 2244 return start; 2245 } 2246 2247 // Generate stub for disjoint oop copy. If "aligned" is true, the 2248 // "from" and "to" addresses are assumed to be heapword aligned. 
2249 // 2250 // Arguments for generated stub: 2251 // from: O0 2252 // to: O1 2253 // count: O2 treated as signed 2254 // 2255 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2256 bool dest_uninitialized = false) { 2257 2258 const Register from = O0; // source array address 2259 const Register to = O1; // destination array address 2260 const Register count = O2; // elements count 2261 2262 __ align(CodeEntryAlignment); 2263 StubCodeMark mark(this, "StubRoutines", name); 2264 address start = __ pc(); 2265 2266 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2267 2268 if (entry != NULL) { 2269 *entry = __ pc(); 2270 // caller can pass a 64-bit byte count here 2271 BLOCK_COMMENT("Entry:"); 2272 } 2273 2274 BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen(); 2275 DecoratorSet decorators = DEST_COVARIANT | DEST_DISJOINT; 2276 if (dest_uninitialized) { 2277 decorators |= DEST_NOT_INITIALIZED; 2278 2279 } 2280 if (aligned) { 2281 decorators |= ACCESS_ALIGNED; 2282 } 2283 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, from, to, count); 2284 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2285 if (UseCompressedOops) { 2286 generate_disjoint_int_copy_core(aligned); 2287 } else { 2288 generate_disjoint_long_copy_core(aligned); 2289 } 2290 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, from, to, count); 2291 2292 // O3, O4 are used as temp registers 2293 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2294 __ retl(); 2295 __ delayed()->mov(G0, O0); // return 0 2296 return start; 2297 } 2298 2299 // Generate stub for conjoint oop copy. If "aligned" is true, the 2300 // "from" and "to" addresses are assumed to be heapword aligned. 2301 // 2302 // Arguments for generated stub: 2303 // from: O0 2304 // to: O1 2305 // count: O2 treated as signed 2306 // 2307 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2308 address *entry, const char *name, 2309 bool dest_uninitialized = false) { 2310 2311 const Register from = O0; // source array address 2312 const Register to = O1; // destination array address 2313 const Register count = O2; // elements count 2314 2315 __ align(CodeEntryAlignment); 2316 StubCodeMark mark(this, "StubRoutines", name); 2317 address start = __ pc(); 2318 2319 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2320 2321 if (entry != NULL) { 2322 *entry = __ pc(); 2323 // caller can pass a 64-bit byte count here 2324 BLOCK_COMMENT("Entry:"); 2325 } 2326 2327 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2328 2329 BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen(); 2330 DecoratorSet decorators = DEST_COVARIANT | DEST_CONJOINT; 2331 if (dest_uninitialized) { 2332 decorators |= DEST_NOT_INITIALIZED; 2333 } 2334 if (aligned) { 2335 decorators |= ACCESS_ALIGNED; 2336 } 2337 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, from, to, count); 2338 if (UseCompressedOops) { 2339 generate_conjoint_int_copy_core(aligned); 2340 } else { 2341 generate_conjoint_long_copy_core(aligned); 2342 } 2343 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, from, to, count); 2344 2345 // O3, O4 are used as temp registers 2346 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2347 __ retl(); 2348 __ delayed()->mov(G0, O0); // return 0 2349 return start; 2350 } 2351 2352 2353 // Helper for generating a dynamic type check. 2354 // Smashes only the given temp registers. 
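// Roughly, the generated check is (illustrative sketch of the fast/slow
// split implemented by check_klass_subtype_fast_path/_slow_path; not the
// literal instruction sequence):
//
//   if (sub_klass == super_klass)                           goto L_success;
//   if (*(sub_klass + super_check_offset) == super_klass)   goto L_success;
//   if (secondary supers of sub_klass contain super_klass)  goto L_success;  // slow path, needs a frame
//   /* fall through to L_miss */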
2355 void generate_type_check(Register sub_klass, 2356 Register super_check_offset, 2357 Register super_klass, 2358 Register temp, 2359 Label& L_success) { 2360 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2361 2362 BLOCK_COMMENT("type_check:"); 2363 2364 Label L_miss, L_pop_to_miss; 2365 2366 assert_clean_int(super_check_offset, temp); 2367 2368 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2369 &L_success, &L_miss, NULL, 2370 super_check_offset); 2371 2372 BLOCK_COMMENT("type_check_slow_path:"); 2373 __ save_frame(0); 2374 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2375 super_klass->after_save(), 2376 L0, L1, L2, L4, 2377 NULL, &L_pop_to_miss); 2378 __ ba(L_success); 2379 __ delayed()->restore(); 2380 2381 __ bind(L_pop_to_miss); 2382 __ restore(); 2383 2384 // Fall through on failure! 2385 __ BIND(L_miss); 2386 } 2387 2388 2389 // Generate stub for checked oop copy. 2390 // 2391 // Arguments for generated stub: 2392 // from: O0 2393 // to: O1 2394 // count: O2 treated as signed 2395 // ckoff: O3 (super_check_offset) 2396 // ckval: O4 (super_klass) 2397 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2398 // 2399 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2400 2401 const Register O0_from = O0; // source array address 2402 const Register O1_to = O1; // destination array address 2403 const Register O2_count = O2; // elements count 2404 const Register O3_ckoff = O3; // super_check_offset 2405 const Register O4_ckval = O4; // super_klass 2406 2407 const Register O5_offset = O5; // loop var, with stride wordSize 2408 const Register G1_remain = G1; // loop var, with stride -1 2409 const Register G3_oop = G3; // actual oop copied 2410 const Register G4_klass = G4; // oop._klass 2411 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2412 2413 __ align(CodeEntryAlignment); 2414 StubCodeMark mark(this, "StubRoutines", name); 2415 address start = __ pc(); 2416 2417 #ifdef ASSERT 2418 // We sometimes save a frame (see generate_type_check below). 2419 // If this will cause trouble, let's fail now instead of later. 2420 __ save_frame(0); 2421 __ restore(); 2422 #endif 2423 2424 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 
2425 2426 #ifdef ASSERT 2427 // caller guarantees that the arrays really are different; 2428 // otherwise, we would have to make conjoint checks 2429 { Label L; 2430 __ mov(O3, G1); // spill: overlap test smashes O3 2431 __ mov(O4, G4); // spill: overlap test smashes O4 2432 array_overlap_test(L, LogBytesPerHeapOop); 2433 __ stop("checkcast_copy within a single array"); 2434 __ bind(L); 2435 __ mov(G1, O3); 2436 __ mov(G4, O4); 2437 } 2438 #endif //ASSERT 2439 2440 if (entry != NULL) { 2441 *entry = __ pc(); 2442 // caller can pass a 64-bit byte count here (from generic stub) 2443 BLOCK_COMMENT("Entry:"); 2444 } 2445 2446 BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen(); 2447 DecoratorSet decorators = DEST_DISJOINT; 2448 if (dest_uninitialized) { 2449 decorators |= DEST_NOT_INITIALIZED; 2450 } 2451 2452 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, O0_from, O1_to, O2_count); 2453 2454 Label load_element, store_element, do_epilogue, fail, done; 2455 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2456 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2457 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2458 2459 // Empty array: Nothing to do. 2460 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2461 __ retl(); 2462 __ delayed()->set(0, O0); // return 0 on (trivial) success 2463 2464 // ======== begin loop ======== 2465 // (Loop is rotated; its entry is load_element.) 2466 // Loop variables: 2467 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2468 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2469 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2470 __ align(OptoLoopAlignment); 2471 2472 __ BIND(store_element); 2473 __ deccc(G1_remain); // decrement the count 2474 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2475 __ inc(O5_offset, heapOopSize); // step to next offset 2476 __ brx(Assembler::zero, true, Assembler::pt, do_epilogue); 2477 __ delayed()->set(0, O0); // return 0 on success 2478 2479 // ======== loop entry is here ======== 2480 __ BIND(load_element); 2481 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2482 __ br_null_short(G3_oop, Assembler::pt, store_element); 2483 2484 __ load_klass(G3_oop, G4_klass); // query the object klass 2485 2486 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2487 // branch to this on success: 2488 store_element); 2489 // ======== end loop ======== 2490 2491 // It was a real error; we must depend on the caller to finish the job. 2492 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2493 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2494 // and report their number to the caller. 2495 __ BIND(fail); 2496 __ subcc(O2_count, G1_remain, O2_count); 2497 __ brx(Assembler::zero, false, Assembler::pt, done); 2498 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2499 2500 __ BIND(do_epilogue); 2501 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, O0_from, O1_to, O2_count); 2502 2503 __ BIND(done); 2504 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2505 __ retl(); 2506 __ delayed()->nop(); // return value in O0 2507 2508 return start; 2509 } 2510 2511 2512 // Generate 'unsafe' array copy stub 2513 // Though just as safe as the other stubs, it takes an unscaled 2514 // size_t argument instead of an element count.
2515 // 2516 // Arguments for generated stub: 2517 // from: O0 2518 // to: O1 2519 // count: O2 byte count, treated as ssize_t, can be zero 2520 // 2521 // Examines the alignment of the operands and dispatches 2522 // to a long, int, short, or byte copy loop. 2523 // 2524 address generate_unsafe_copy(const char* name, 2525 address byte_copy_entry, 2526 address short_copy_entry, 2527 address int_copy_entry, 2528 address long_copy_entry) { 2529 2530 const Register O0_from = O0; // source array address 2531 const Register O1_to = O1; // destination array address 2532 const Register O2_count = O2; // byte count (scaled to elements on dispatch) 2533 2534 const Register G1_bits = G1; // test copy of low bits 2535 2536 __ align(CodeEntryAlignment); 2537 StubCodeMark mark(this, "StubRoutines", name); 2538 address start = __ pc(); 2539 2540 // bump this on entry, not on exit: 2541 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2542 2543 __ or3(O0_from, O1_to, G1_bits); 2544 __ or3(O2_count, G1_bits, G1_bits); 2545 2546 __ btst(BytesPerLong-1, G1_bits); 2547 __ br(Assembler::zero, true, Assembler::pt, 2548 long_copy_entry, relocInfo::runtime_call_type); 2549 // scale the count on the way out: 2550 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2551 2552 __ btst(BytesPerInt-1, G1_bits); 2553 __ br(Assembler::zero, true, Assembler::pt, 2554 int_copy_entry, relocInfo::runtime_call_type); 2555 // scale the count on the way out: 2556 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2557 2558 __ btst(BytesPerShort-1, G1_bits); 2559 __ br(Assembler::zero, true, Assembler::pt, 2560 short_copy_entry, relocInfo::runtime_call_type); 2561 // scale the count on the way out: 2562 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2563 2564 __ br(Assembler::always, false, Assembler::pt, 2565 byte_copy_entry, relocInfo::runtime_call_type); 2566 __ delayed()->nop(); 2567 2568 return start; 2569 } 2570 2571 2572 // Perform range checks on the proposed arraycopy. 2573 // Kills the two temps, but nothing else. 2574 // Also, cleans the high (sign) bits of src_pos and dst_pos. 2575 void arraycopy_range_checks(Register src, // source array oop (O0) 2576 Register src_pos, // source position (O1) 2577 Register dst, // destination array oop (O2) 2578 Register dst_pos, // destination position (O3) 2579 Register length, // length of copy (O4) 2580 Register temp1, Register temp2, 2581 Label& L_failed) { 2582 BLOCK_COMMENT("arraycopy_range_checks:"); 2583 2584 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2585 2586 const Register array_length = temp1; // scratch 2587 const Register end_pos = temp2; // scratch 2588 2589 // Note: This next instruction may be in the delay slot of a branch: 2590 __ add(length, src_pos, end_pos); // src_pos + length 2591 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2592 __ cmp(end_pos, array_length); 2593 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2594 2595 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2596 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2597 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2598 __ cmp(end_pos, array_length); 2599 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2600 2601 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2602 // Move with sign extension can be used since they are positive.
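// (signx replaces bits 63..32 of the register with copies of bit 31, so
// for the positive positions validated above a possibly dirty upper
// half, e.g. 0xXXXXXXXX00000005, becomes a clean 0x0000000000000005.)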
__ delayed()->signx(src_pos, src_pos); 2604 __ signx(dst_pos, dst_pos); 2605 2606 BLOCK_COMMENT("arraycopy_range_checks done"); 2607 } 2608 2609 2610 // 2611 // Generate generic array copy stubs 2612 // 2613 // Input: 2614 // O0 - src oop 2615 // O1 - src_pos 2616 // O2 - dst oop 2617 // O3 - dst_pos 2618 // O4 - element count 2619 // 2620 // Output: 2621 // O0 == 0 - success 2622 // O0 == -1 - need to call System.arraycopy 2623 // 2624 address generate_generic_copy(const char *name, 2625 address entry_jbyte_arraycopy, 2626 address entry_jshort_arraycopy, 2627 address entry_jint_arraycopy, 2628 address entry_oop_arraycopy, 2629 address entry_jlong_arraycopy, 2630 address entry_checkcast_arraycopy) { 2631 Label L_failed, L_objArray; 2632 2633 // Input registers 2634 const Register src = O0; // source array oop 2635 const Register src_pos = O1; // source position 2636 const Register dst = O2; // destination array oop 2637 const Register dst_pos = O3; // destination position 2638 const Register length = O4; // elements count 2639 2640 // registers used as temps 2641 const Register G3_src_klass = G3; // source array klass 2642 const Register G4_dst_klass = G4; // destination array klass 2643 const Register G5_lh = G5; // layout helper 2644 const Register O5_temp = O5; 2645 2646 __ align(CodeEntryAlignment); 2647 StubCodeMark mark(this, "StubRoutines", name); 2648 address start = __ pc(); 2649 2650 // bump this on entry, not on exit: 2651 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2652 2653 // In principle, the int arguments could be dirty. 2654 //assert_clean_int(src_pos, G1); 2655 //assert_clean_int(dst_pos, G1); 2656 //assert_clean_int(length, G1); 2657 2658 //----------------------------------------------------------------------- 2659 // Assembler stubs will be used for this call to arraycopy 2660 // if the following conditions are met: 2661 // 2662 // (1) src and dst must not be null. 2663 // (2) src_pos must not be negative. 2664 // (3) dst_pos must not be negative. 2665 // (4) length must not be negative. 2666 // (5) src klass and dst klass should be the same and not NULL. 2667 // (6) src and dst should be arrays. 2668 // (7) src_pos + length must not exceed length of src. 2669 // (8) dst_pos + length must not exceed length of dst. 2670 BLOCK_COMMENT("arraycopy initial argument checks"); 2671 2672 // if (src == NULL) return -1; 2673 __ br_null(src, false, Assembler::pn, L_failed); 2674 2675 // if (src_pos < 0) return -1; 2676 __ delayed()->tst(src_pos); 2677 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2678 __ delayed()->nop(); 2679 2680 // if (dst == NULL) return -1; 2681 __ br_null(dst, false, Assembler::pn, L_failed); 2682 2683 // if (dst_pos < 0) return -1; 2684 __ delayed()->tst(dst_pos); 2685 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2686 2687 // if (length < 0) return -1; 2688 __ delayed()->tst(length); 2689 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2690 2691 BLOCK_COMMENT("arraycopy argument klass checks"); 2692 // get src->klass() 2693 if (UseCompressedClassPointers) { 2694 __ delayed()->nop(); // ???
not good 2695 __ load_klass(src, G3_src_klass); 2696 } else { 2697 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 2698 } 2699 2700 #ifdef ASSERT 2701 // assert(src->klass() != NULL); 2702 BLOCK_COMMENT("assert klasses not null"); 2703 { Label L_a, L_b; 2704 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 2705 __ bind(L_a); 2706 __ stop("broken null klass"); 2707 __ bind(L_b); 2708 __ load_klass(dst, G4_dst_klass); 2709 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 2710 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 2711 BLOCK_COMMENT("assert done"); 2712 } 2713 #endif 2714 2715 // Load layout helper 2716 // 2717 // |array_tag| | header_size | element_type | |log2_element_size| 2718 // 32 30 24 16 8 2 0 2719 // 2720 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2721 // 2722 2723 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2724 2725 // Load 32-bits signed value. Use br() instruction with it to check icc. 2726 __ lduw(G3_src_klass, lh_offset, G5_lh); 2727 2728 if (UseCompressedClassPointers) { 2729 __ load_klass(dst, G4_dst_klass); 2730 } 2731 // Handle objArrays completely differently... 2732 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2733 __ set(objArray_lh, O5_temp); 2734 __ cmp(G5_lh, O5_temp); 2735 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 2736 if (UseCompressedClassPointers) { 2737 __ delayed()->nop(); 2738 } else { 2739 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 2740 } 2741 2742 // if (src->klass() != dst->klass()) return -1; 2743 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 2744 2745 // if (!src->is_Array()) return -1; 2746 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 2747 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 2748 2749 // At this point, it is known to be a typeArray (array_tag 0x3). 
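// For reference, per the layout helper diagram above, the extraction
// performed below amounts to (illustrative; the constants are the
// Klass::_lh_* shift/mask values):
//
//   array_offset      = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
//   log2_element_size =  lh & _lh_log2_element_size_mask;   // e.g. 2 for T_INT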
2750 #ifdef ASSERT 2751 __ delayed()->nop(); 2752 { Label L; 2753 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2754 __ set(lh_prim_tag_in_place, O5_temp); 2755 __ cmp(G5_lh, O5_temp); 2756 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 2757 __ delayed()->nop(); 2758 __ stop("must be a primitive array"); 2759 __ bind(L); 2760 } 2761 #else 2762 __ delayed(); // match next insn to prev branch 2763 #endif 2764 2765 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2766 O5_temp, G4_dst_klass, L_failed); 2767 2768 // TypeArrayKlass 2769 // 2770 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2771 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2772 // 2773 2774 const Register G4_offset = G4_dst_klass; // array offset 2775 const Register G3_elsize = G3_src_klass; // log2 element size 2776 2777 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 2778 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 2779 __ add(src, G4_offset, src); // src array offset 2780 __ add(dst, G4_offset, dst); // dst array offset 2781 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 2782 2783 // next registers should be set before the jump to corresponding stub 2784 const Register from = O0; // source array address 2785 const Register to = O1; // destination array address 2786 const Register count = O2; // elements count 2787 2788 // 'from', 'to', 'count' registers should be set in this order 2789 // since they are the same as 'src', 'src_pos', 'dst'. 2790 2791 BLOCK_COMMENT("scale indexes to element size"); 2792 __ sll_ptr(src_pos, G3_elsize, src_pos); 2793 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 2794 __ add(src, src_pos, from); // src_addr 2795 __ add(dst, dst_pos, to); // dst_addr 2796 2797 BLOCK_COMMENT("choose copy loop based on element size"); 2798 __ cmp(G3_elsize, 0); 2799 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 2800 __ delayed()->signx(length, count); // length 2801 2802 __ cmp(G3_elsize, LogBytesPerShort); 2803 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 2804 __ delayed()->signx(length, count); // length 2805 2806 __ cmp(G3_elsize, LogBytesPerInt); 2807 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 2808 __ delayed()->signx(length, count); // length 2809 #ifdef ASSERT 2810 { Label L; 2811 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 2812 __ stop("must be long copy, but elsize is wrong"); 2813 __ bind(L); 2814 } 2815 #endif 2816 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 2817 __ delayed()->signx(length, count); // length 2818 2819 // ObjArrayKlass 2820 __ BIND(L_objArray); 2821 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 2822 2823 Label L_plain_copy, L_checkcast_copy; 2824 // test array classes for subtyping 2825 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 2826 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 2827 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 2828 2829 // Identically typed arrays can be copied without element-wise checks. 
2830 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2831 O5_temp, G5_lh, L_failed); 2832 2833 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 2834 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 2835 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 2836 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 2837 __ add(src, src_pos, from); // src_addr 2838 __ add(dst, dst_pos, to); // dst_addr 2839 __ BIND(L_plain_copy); 2840 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 2841 __ delayed()->signx(length, count); // length 2842 2843 __ BIND(L_checkcast_copy); 2844 // live at this point: G3_src_klass, G4_dst_klass 2845 { 2846 // Before looking at dst.length, make sure dst is also an objArray. 2847 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 2848 __ cmp(G5_lh, O5_temp); 2849 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 2850 2851 // It is safe to examine both src.length and dst.length. 2852 __ delayed(); // match next insn to prev branch 2853 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2854 O5_temp, G5_lh, L_failed); 2855 2856 // Marshal the base address arguments now, freeing registers. 2857 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 2858 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 2859 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 2860 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 2861 __ add(src, src_pos, from); // src_addr 2862 __ add(dst, dst_pos, to); // dst_addr 2863 __ signx(length, count); // length (reloaded) 2864 2865 Register sco_temp = O3; // this register is free now 2866 assert_different_registers(from, to, count, sco_temp, 2867 G4_dst_klass, G3_src_klass); 2868 2869 // Generate the type check. 2870 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2871 __ lduw(G4_dst_klass, sco_offset, sco_temp); 2872 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 2873 O5_temp, L_plain_copy); 2874 2875 // Fetch destination element klass from the ObjArrayKlass header. 2876 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2877 2878 // the checkcast_copy loop needs two extra arguments: 2879 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 2880 // lduw(O4, sco_offset, O3); // sco of elem klass 2881 2882 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 2883 __ delayed()->lduw(O4, sco_offset, O3); 2884 } 2885 2886 __ BIND(L_failed); 2887 __ retl(); 2888 __ delayed()->sub(G0, 1, O0); // return -1 2889 return start; 2890 } 2891 2892 // 2893 // Generate stub for heap zeroing. 2894 // "to" address is aligned to jlong (8 bytes). 
2895 // 2896 // Arguments for generated stub: 2897 // to: O0 2898 // count: O1 treated as signed (count of HeapWords) 2899 // count could be 0 2900 // 2901 address generate_zero_aligned_words(const char* name) { 2902 __ align(CodeEntryAlignment); 2903 StubCodeMark mark(this, "StubRoutines", name); 2904 address start = __ pc(); 2905 2906 const Register to = O0; // destination array address 2907 const Register count = O1; // HeapWords count 2908 const Register temp = O2; // scratch 2909 2910 Label Ldone; 2911 __ sllx(count, LogHeapWordSize, count); // convert to byte count 2912 // Use BIS for zeroing 2913 __ bis_zeroing(to, count, temp, Ldone); 2914 __ bind(Ldone); 2915 __ retl(); 2916 __ delayed()->nop(); 2917 return start; 2918 } 2919 2920 void generate_arraycopy_stubs() { 2921 address entry; 2922 address entry_jbyte_arraycopy; 2923 address entry_jshort_arraycopy; 2924 address entry_jint_arraycopy; 2925 address entry_oop_arraycopy; 2926 address entry_jlong_arraycopy; 2927 address entry_checkcast_arraycopy; 2928 2929 //*** jbyte 2930 // Always need aligned and unaligned versions 2931 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2932 "jbyte_disjoint_arraycopy"); 2933 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2934 &entry_jbyte_arraycopy, 2935 "jbyte_arraycopy"); 2936 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2937 "arrayof_jbyte_disjoint_arraycopy"); 2938 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2939 "arrayof_jbyte_arraycopy"); 2940 2941 //*** jshort 2942 // Always need aligned and unaligned versions 2943 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2944 "jshort_disjoint_arraycopy"); 2945 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2946 &entry_jshort_arraycopy, 2947 "jshort_arraycopy"); 2948 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2949 "arrayof_jshort_disjoint_arraycopy"); 2950 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2951 "arrayof_jshort_arraycopy"); 2952 2953 //*** jint 2954 // Aligned versions 2955 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2956 "arrayof_jint_disjoint_arraycopy"); 2957 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2958 "arrayof_jint_arraycopy"); 2959 // On 64-bit we need both aligned and unaligned versions of jint arraycopy. 2960 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
2961 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2962 "jint_disjoint_arraycopy"); 2963 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2964 &entry_jint_arraycopy, 2965 "jint_arraycopy"); 2966 2967 //*** jlong 2968 // It is always aligned 2969 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2970 "arrayof_jlong_disjoint_arraycopy"); 2971 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2972 "arrayof_jlong_arraycopy"); 2973 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2974 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2975 2976 2977 //*** oops 2978 // Aligned versions 2979 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 2980 "arrayof_oop_disjoint_arraycopy"); 2981 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 2982 "arrayof_oop_arraycopy"); 2983 // Aligned versions without pre-barriers 2984 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 2985 "arrayof_oop_disjoint_arraycopy_uninit", 2986 /*dest_uninitialized*/true); 2987 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 2988 "arrayof_oop_arraycopy_uninit", 2989 /*dest_uninitialized*/true); 2990 if (UseCompressedOops) { 2991 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 2992 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 2993 "oop_disjoint_arraycopy"); 2994 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 2995 "oop_arraycopy"); 2996 // Unaligned versions without pre-barriers 2997 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 2998 "oop_disjoint_arraycopy_uninit", 2999 /*dest_uninitialized*/true); 3000 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3001 "oop_arraycopy_uninit", 3002 /*dest_uninitialized*/true); 3003 } else 3004 { 3005 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3006 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3007 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3008 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3009 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3010 } 3011 3012 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3013 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3014 /*dest_uninitialized*/true); 3015 3016 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3017 entry_jbyte_arraycopy, 3018 entry_jshort_arraycopy, 3019 entry_jint_arraycopy, 3020 entry_jlong_arraycopy); 3021 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3022 entry_jbyte_arraycopy, 3023 entry_jshort_arraycopy, 3024 entry_jint_arraycopy, 3025 entry_oop_arraycopy, 3026 entry_jlong_arraycopy, 3027 entry_checkcast_arraycopy); 3028 3029 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3030 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, 
"jshort_fill"); 3031 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3032 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3033 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3034 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3035 3036 if (UseBlockZeroing) { 3037 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 3038 } 3039 } 3040 3041 address generate_aescrypt_encryptBlock() { 3042 // required since we read expanded key 'int' array starting first element without alignment considerations 3043 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3044 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3045 __ align(CodeEntryAlignment); 3046 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3047 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 3048 address start = __ pc(); 3049 Register from = O0; // source byte array 3050 Register to = O1; // destination byte array 3051 Register key = O2; // expanded key array 3052 const Register keylen = O4; //reg for storing expanded key array length 3053 3054 // read expanded key length 3055 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3056 3057 // Method to address arbitrary alignment for load instructions: 3058 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 3059 // If zero/aligned then continue with double FP load instructions 3060 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 3061 // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 3062 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 3063 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 3064 3065 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3066 __ andcc(from, 7, G0); 3067 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3068 __ delayed()->alignaddr(from, G0, from); 3069 3070 // aligned case: load input into F54-F56 3071 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3072 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3073 __ ba_short(L_load_expanded_key); 3074 3075 __ BIND(L_load_misaligned_input); 3076 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3077 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3078 __ ldf(FloatRegisterImpl::D, from, 16, F58); 3079 __ faligndata(F54, F56, F54); 3080 __ faligndata(F56, F58, F56); 3081 3082 __ BIND(L_load_expanded_key); 3083 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 3084 for ( int i = 0; i <= 38; i += 2 ) { 3085 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 3086 } 3087 3088 // perform cipher transformation 3089 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3090 __ fxor(FloatRegisterImpl::D, F2, F56, F56); 3091 // rounds 1 through 8 3092 for ( int i = 4; i <= 28; i += 8 ) { 3093 __ aes_eround01(as_FloatRegister(i), F54, F56, F58); 3094 __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); 3095 __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); 3096 __ aes_eround23(as_FloatRegister(i+6), F58, 
F60, F56); 3097 } 3098 __ aes_eround01(F36, F54, F56, F58); //round 9 3099 __ aes_eround23(F38, F54, F56, F60); 3100 3101 // 128-bit original key size 3102 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); 3103 3104 for ( int i = 40; i <= 50; i += 2 ) { 3105 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); 3106 } 3107 __ aes_eround01(F40, F58, F60, F54); //round 10 3108 __ aes_eround23(F42, F58, F60, F56); 3109 __ aes_eround01(F44, F54, F56, F58); //round 11 3110 __ aes_eround23(F46, F54, F56, F60); 3111 3112 // 192-bit original key size 3113 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); 3114 3115 __ ldf(FloatRegisterImpl::D, key, 208, F52); 3116 __ aes_eround01(F48, F58, F60, F54); //round 12 3117 __ aes_eround23(F50, F58, F60, F56); 3118 __ ldf(FloatRegisterImpl::D, key, 216, F46); 3119 __ ldf(FloatRegisterImpl::D, key, 224, F48); 3120 __ ldf(FloatRegisterImpl::D, key, 232, F50); 3121 __ aes_eround01(F52, F54, F56, F58); //round 13 3122 __ aes_eround23(F46, F54, F56, F60); 3123 __ ba_short(L_storeOutput); 3124 3125 __ BIND(L_doLast128bit); 3126 __ ldf(FloatRegisterImpl::D, key, 160, F48); 3127 __ ldf(FloatRegisterImpl::D, key, 168, F50); 3128 3129 __ BIND(L_storeOutput); 3130 // perform last round of encryption common for all key sizes 3131 __ aes_eround01_l(F48, F58, F60, F54); //last round 3132 __ aes_eround23_l(F50, F58, F60, F56); 3133 3134 // Method to address arbitrary alignment for store instructions: 3135 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 3136 // If zero/aligned then continue with double FP store instructions 3137 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 3138 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 3139 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 3140 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 
3141 // Set GSR.align to (8-n) using alignaddr 3142 // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf 3143 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 3144 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 3145 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address 3146 // We need to execute this process for both the 8-byte result values 3147 3148 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3149 __ andcc(to, 7, O5); 3150 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3151 __ delayed()->edge8n(to, G0, O3); 3152 3153 // aligned case: store output into the destination array 3154 __ stf(FloatRegisterImpl::D, F54, to, 0); 3155 __ retl(); 3156 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 3157 3158 __ BIND(L_store_misaligned_output); 3159 __ add(to, 8, O4); 3160 __ mov(8, O2); 3161 __ sub(O2, O5, O2); 3162 __ alignaddr(O2, G0, O2); 3163 __ faligndata(F54, F54, F54); 3164 __ faligndata(F56, F56, F56); 3165 __ and3(to, -8, to); 3166 __ and3(O4, -8, O4); 3167 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3168 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3169 __ add(to, 8, to); 3170 __ add(O4, 8, O4); 3171 __ orn(G0, O3, O3); 3172 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3173 __ retl(); 3174 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3175 3176 return start; 3177 } 3178 3179 address generate_aescrypt_decryptBlock() { 3180 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3181 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3182 // required since we read original key 'byte' array as well in the decryption stubs 3183 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3184 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3185 __ align(CodeEntryAlignment); 3186 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3187 address start = __ pc(); 3188 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 3189 Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 3190 Register from = O0; // source byte array 3191 Register to = O1; // destination byte array 3192 Register key = O2; // expanded key array 3193 Register original_key = O3; // original key array only required during decryption 3194 const Register keylen = O4; // reg for storing expanded key array length 3195 3196 // read expanded key array length 3197 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3198 3199 // save 'from' since we may need to recheck alignment in case of 256-bit decryption 3200 __ mov(from, G1); 3201 3202 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3203 __ andcc(from, 7, G0); 3204 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3205 __ delayed()->alignaddr(from, G0, from); 3206 3207 // aligned case: load input into F52-F54 3208 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3209 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3210 __ ba_short(L_load_original_key); 3211 3212 __ BIND(L_load_misaligned_input); 3213 __ 
ldf(FloatRegisterImpl::D, from, 0, F52); 3214 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3215 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3216 __ faligndata(F52, F54, F52); 3217 __ faligndata(F54, F56, F54); 3218 3219 __ BIND(L_load_original_key); 3220 // load original key from SunJCE expanded decryption key 3221 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3222 for ( int i = 0; i <= 3; i++ ) { 3223 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3224 } 3225 3226 // 256-bit original key size 3227 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 3228 3229 // 192-bit original key size 3230 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 3231 3232 // 128-bit original key size 3233 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3234 for ( int i = 0; i <= 36; i += 4 ) { 3235 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 3236 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 3237 } 3238 3239 // perform 128-bit key specific inverse cipher transformation 3240 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 3241 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 3242 __ ba_short(L_common_transform); 3243 3244 __ BIND(L_expand192bit); 3245 3246 // start loading rest of the 192-bit key 3247 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3248 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 3249 3250 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3251 for ( int i = 0; i <= 36; i += 6 ) { 3252 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 3253 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 3254 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3255 } 3256 __ aes_kexpand1(F42, F46, 7, F48); 3257 __ aes_kexpand2(F44, F48, F50); 3258 3259 // perform 192-bit key specific inverse cipher transformation 3260 __ fxor(FloatRegisterImpl::D, F50, F54, F54); 3261 __ fxor(FloatRegisterImpl::D, F48, F52, F52); 3262 __ aes_dround23(F46, F52, F54, F58); 3263 __ aes_dround01(F44, F52, F54, F56); 3264 __ aes_dround23(F42, F56, F58, F54); 3265 __ aes_dround01(F40, F56, F58, F52); 3266 __ ba_short(L_common_transform); 3267 3268 __ BIND(L_expand256bit); 3269 3270 // load rest of the 256-bit key 3271 for ( int i = 4; i <= 7; i++ ) { 3272 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3273 } 3274 3275 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3276 for ( int i = 0; i <= 40; i += 8 ) { 3277 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 3278 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3279 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 3280 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 3281 } 3282 __ aes_kexpand1(F48, F54, 6, F56); 3283 __ aes_kexpand2(F50, F56, F58); 3284 3285 for ( int i = 0; i <= 6; i += 2 ) { 3286 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 3287 } 3288 3289 // reload original 'from' address 3290 __ mov(G1, from); 3291 3292 // re-check 8-byte alignment 3293 
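// (the 256-bit key expansion above overwrites F52/F54, which held the loaded input block, so the block is reloaded from the restored 'from' address and the alignment test is repeated)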
__ andcc(from, 7, G0); 3294 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); 3295 __ delayed()->alignaddr(from, G0, from); 3296 3297 // aligned case: load input into F52-F54 3298 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3299 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3300 __ ba_short(L_256bit_transform); 3301 3302 __ BIND(L_reload_misaligned_input); 3303 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3304 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3305 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3306 __ faligndata(F52, F54, F52); 3307 __ faligndata(F54, F56, F54); 3308 3309 // perform 256-bit key specific inverse cipher transformation 3310 __ BIND(L_256bit_transform); 3311 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3312 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 3313 __ aes_dround23(F4, F52, F54, F58); 3314 __ aes_dround01(F6, F52, F54, F56); 3315 __ aes_dround23(F50, F56, F58, F54); 3316 __ aes_dround01(F48, F56, F58, F52); 3317 __ aes_dround23(F46, F52, F54, F58); 3318 __ aes_dround01(F44, F52, F54, F56); 3319 __ aes_dround23(F42, F56, F58, F54); 3320 __ aes_dround01(F40, F56, F58, F52); 3321 3322 for ( int i = 0; i <= 7; i++ ) { 3323 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3324 } 3325 3326 // perform inverse cipher transformations common for all key sizes 3327 __ BIND(L_common_transform); 3328 for ( int i = 38; i >= 6; i -= 8 ) { 3329 __ aes_dround23(as_FloatRegister(i), F52, F54, F58); 3330 __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); 3331 if ( i != 6) { 3332 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); 3333 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52); 3334 } else { 3335 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); 3336 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); 3337 } 3338 } 3339 3340 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3341 __ andcc(to, 7, O5); 3342 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3343 __ delayed()->edge8n(to, G0, O3); 3344 3345 // aligned case: store output into the destination array 3346 __ stf(FloatRegisterImpl::D, F52, to, 0); 3347 __ retl(); 3348 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); 3349 3350 __ BIND(L_store_misaligned_output); 3351 __ add(to, 8, O4); 3352 __ mov(8, O2); 3353 __ sub(O2, O5, O2); 3354 __ alignaddr(O2, G0, O2); 3355 __ faligndata(F52, F52, F52); 3356 __ faligndata(F54, F54, F54); 3357 __ and3(to, -8, to); 3358 __ and3(O4, -8, O4); 3359 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3360 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3361 __ add(to, 8, to); 3362 __ add(O4, 8, O4); 3363 __ orn(G0, O3, O3); 3364 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3365 __ retl(); 3366 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3367 3368 return start; 3369 } 3370 3371 address generate_cipherBlockChaining_encryptAESCrypt() { 3372 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3373 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3374 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3375 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3376 __ align(CodeEntryAlignment); 3377 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3378 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; 3379 Label 
L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; 3380 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; 3381 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; 3382 address start = __ pc(); 3383 Register from = I0; // source byte array 3384 Register to = I1; // destination byte array 3385 Register key = I2; // expanded key array 3386 Register rvec = I3; // init vector 3387 const Register len_reg = I4; // cipher length 3388 const Register keylen = I5; // reg for storing expanded key array length 3389 3390 __ save_frame(0); 3391 // save cipher len to return in the end 3392 __ mov(len_reg, L0); 3393 3394 // read expanded key length 3395 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3396 3397 // load initial vector, 8-byte alignment is guaranteed 3398 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 3399 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 3400 // load key, 8-byte alignment is guaranteed 3401 __ ldx(key,0,G1); 3402 __ ldx(key,8,G5); 3403 3404 // start loading expanded key, 8-byte alignment is guaranteed 3405 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 3406 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3407 } 3408 3409 // 128-bit original key size 3410 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); 3411 3412 for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { 3413 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3414 } 3415 3416 // 192-bit original key size 3417 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); 3418 3419 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { 3420 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3421 } 3422 3423 // 256-bit original key size 3424 __ ba_short(L_cbcenc256); 3425 3426 __ align(OptoLoopAlignment); 3427 __ BIND(L_cbcenc128); 3428 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3429 __ andcc(from, 7, G0); 3430 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 3431 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3432 3433 // aligned case: load input into G3 and G4 3434 __ ldx(from,0,G3); 3435 __ ldx(from,8,G4); 3436 __ ba_short(L_128bit_transform); 3437 3438 __ BIND(L_load_misaligned_input_128bit); 3439 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3440 __ alignaddr(from, G0, from); 3441 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3442 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3443 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3444 __ faligndata(F48, F50, F48); 3445 __ faligndata(F50, F52, F50); 3446 __ movdtox(F48, G3); 3447 __ movdtox(F50, G4); 3448 __ mov(L1, from); 3449 3450 __ BIND(L_128bit_transform); 3451 __ xor3(G1,G3,G3); 3452 __ xor3(G5,G4,G4); 3453 __ movxtod(G3,F56); 3454 __ movxtod(G4,F58); 3455 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3456 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3457 3458 // TEN_EROUNDS 3459 for ( int i = 0; i <= 32; i += 8 ) { 3460 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3461 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3462 if (i != 32 ) { 3463 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3464 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3465 } else { 3466 __
aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3467 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3468 } 3469 } 3470 3471 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3472 __ andcc(to, 7, L1); 3473 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 3474 __ delayed()->edge8n(to, G0, L2); 3475 3476 // aligned case: store output into the destination array 3477 __ stf(FloatRegisterImpl::D, F60, to, 0); 3478 __ stf(FloatRegisterImpl::D, F62, to, 8); 3479 __ ba_short(L_check_loop_end_128bit); 3480 3481 __ BIND(L_store_misaligned_output_128bit); 3482 __ add(to, 8, L3); 3483 __ mov(8, L4); 3484 __ sub(L4, L1, L4); 3485 __ alignaddr(L4, G0, L4); 3486 // save cipher text before circular right shift 3487 // as it needs to be stored as iv for next block (see code before next retl) 3488 __ movdtox(F60, L6); 3489 __ movdtox(F62, L7); 3490 __ faligndata(F60, F60, F60); 3491 __ faligndata(F62, F62, F62); 3492 __ mov(to, L5); 3493 __ and3(to, -8, to); 3494 __ and3(L3, -8, L3); 3495 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3496 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3497 __ add(to, 8, to); 3498 __ add(L3, 8, L3); 3499 __ orn(G0, L2, L2); 3500 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3501 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3502 __ mov(L5, to); 3503 __ movxtod(L6, F60); 3504 __ movxtod(L7, F62); 3505 3506 __ BIND(L_check_loop_end_128bit); 3507 __ add(from, 16, from); 3508 __ add(to, 16, to); 3509 __ subcc(len_reg, 16, len_reg); 3510 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 3511 __ delayed()->nop(); 3512 // re-init initial vector for next block, 8-byte alignment is guaranteed 3513 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3514 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3515 __ mov(L0, I0); 3516 __ ret(); 3517 __ delayed()->restore(); 3518 3519 __ align(OptoLoopAlignment); 3520 __ BIND(L_cbcenc192); 3521 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3522 __ andcc(from, 7, G0); 3523 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); 3524 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3525 3526 // aligned case: load input into G3 and G4 3527 __ ldx(from,0,G3); 3528 __ ldx(from,8,G4); 3529 __ ba_short(L_192bit_transform); 3530 3531 __ BIND(L_load_misaligned_input_192bit); 3532 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3533 __ alignaddr(from, G0, from); 3534 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3535 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3536 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3537 __ faligndata(F48, F50, F48); 3538 __ faligndata(F50, F52, F50); 3539 __ movdtox(F48, G3); 3540 __ movdtox(F50, G4); 3541 __ mov(L1, from); 3542 3543 __ BIND(L_192bit_transform); 3544 __ xor3(G1,G3,G3); 3545 __ xor3(G5,G4,G4); 3546 __ movxtod(G3,F56); 3547 __ movxtod(G4,F58); 3548 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3549 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3550 3551 // TWELVE_EROUNDS 3552 for ( int i = 0; i <= 40; i += 8 ) { 3553 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3554 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3555 if (i != 40 ) { 3556 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3557 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3558 } else { 3559 __
aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3560 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3561 } 3562 } 3563 3564 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3565 __ andcc(to, 7, L1); 3566 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); 3567 __ delayed()->edge8n(to, G0, L2); 3568 3569 // aligned case: store output into the destination array 3570 __ stf(FloatRegisterImpl::D, F60, to, 0); 3571 __ stf(FloatRegisterImpl::D, F62, to, 8); 3572 __ ba_short(L_check_loop_end_192bit); 3573 3574 __ BIND(L_store_misaligned_output_192bit); 3575 __ add(to, 8, L3); 3576 __ mov(8, L4); 3577 __ sub(L4, L1, L4); 3578 __ alignaddr(L4, G0, L4); 3579 __ movdtox(F60, L6); 3580 __ movdtox(F62, L7); 3581 __ faligndata(F60, F60, F60); 3582 __ faligndata(F62, F62, F62); 3583 __ mov(to, L5); 3584 __ and3(to, -8, to); 3585 __ and3(L3, -8, L3); 3586 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3587 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3588 __ add(to, 8, to); 3589 __ add(L3, 8, L3); 3590 __ orn(G0, L2, L2); 3591 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3592 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3593 __ mov(L5, to); 3594 __ movxtod(L6, F60); 3595 __ movxtod(L7, F62); 3596 3597 __ BIND(L_check_loop_end_192bit); 3598 __ add(from, 16, from); 3599 __ subcc(len_reg, 16, len_reg); 3600 __ add(to, 16, to); 3601 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 3602 __ delayed()->nop(); 3603 // re-init initial vector for next block, 8-byte alignment is guaranteed 3604 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3605 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3606 __ mov(L0, I0); 3607 __ ret(); 3608 __ delayed()->restore(); 3609 3610 __ align(OptoLoopAlignment); 3611 __ BIND(L_cbcenc256); 3612 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3613 __ andcc(from, 7, G0); 3614 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); 3615 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3616 3617 // aligned case: load input into G3 and G4 3618 __ ldx(from,0,G3); 3619 __ ldx(from,8,G4); 3620 __ ba_short(L_256bit_transform); 3621 3622 __ BIND(L_load_misaligned_input_256bit); 3623 // cannot clobber F48, F50 and F52. 
F56, F58 can be used though 3624 __ alignaddr(from, G0, from); 3625 __ movdtox(F60, L2); // save F60 before overwriting 3626 __ ldf(FloatRegisterImpl::D, from, 0, F56); 3627 __ ldf(FloatRegisterImpl::D, from, 8, F58); 3628 __ ldf(FloatRegisterImpl::D, from, 16, F60); 3629 __ faligndata(F56, F58, F56); 3630 __ faligndata(F58, F60, F58); 3631 __ movdtox(F56, G3); 3632 __ movdtox(F58, G4); 3633 __ mov(L1, from); 3634 __ movxtod(L2, F60); 3635 3636 __ BIND(L_256bit_transform); 3637 __ xor3(G1,G3,G3); 3638 __ xor3(G5,G4,G4); 3639 __ movxtod(G3,F56); 3640 __ movxtod(G4,F58); 3641 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3642 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3643 3644 // FOURTEEN_EROUNDS 3645 for ( int i = 0; i <= 48; i += 8 ) { 3646 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3647 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3648 if (i != 48 ) { 3649 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3650 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3651 } else { 3652 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3653 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3654 } 3655 } 3656 3657 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3658 __ andcc(to, 7, L1); 3659 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); 3660 __ delayed()->edge8n(to, G0, L2); 3661 3662 // aligned case: store output into the destination array 3663 __ stf(FloatRegisterImpl::D, F60, to, 0); 3664 __ stf(FloatRegisterImpl::D, F62, to, 8); 3665 __ ba_short(L_check_loop_end_256bit); 3666 3667 __ BIND(L_store_misaligned_output_256bit); 3668 __ add(to, 8, L3); 3669 __ mov(8, L4); 3670 __ sub(L4, L1, L4); 3671 __ alignaddr(L4, G0, L4); 3672 __ movdtox(F60, L6); 3673 __ movdtox(F62, L7); 3674 __ faligndata(F60, F60, F60); 3675 __ faligndata(F62, F62, F62); 3676 __ mov(to, L5); 3677 __ and3(to, -8, to); 3678 __ and3(L3, -8, L3); 3679 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3680 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3681 __ add(to, 8, to); 3682 __ add(L3, 8, L3); 3683 __ orn(G0, L2, L2); 3684 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3685 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3686 __ mov(L5, to); 3687 __ movxtod(L6, F60); 3688 __ movxtod(L7, F62); 3689 3690 __ BIND(L_check_loop_end_256bit); 3691 __ add(from, 16, from); 3692 __ subcc(len_reg, 16, len_reg); 3693 __ add(to, 16, to); 3694 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 3695 __ delayed()->nop(); 3696 // re-init initial vector for next block, 8-byte alignment is guaranteed 3697 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3698 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3699 __ mov(L0, I0); 3700 __ ret(); 3701 __ delayed()->restore(); 3702 3703 return start; 3704 } 3705 3706 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3707 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3708 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3709 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3710 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3711 __ align(CodeEntryAlignment); 3712 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3713 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 3714 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, 
L_dec_next2_blocks192, L_dec_next2_blocks256; 3715 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 3716 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 3717 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 3718 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 3719 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 3720 address start = __ pc(); 3721 Register from = I0; // source byte array 3722 Register to = I1; // destination byte array 3723 Register key = I2; // expanded key array 3724 Register rvec = I3; // init vector 3725 const Register len_reg = I4; // cipher length 3726 const Register original_key = I5; // original key array only required during decryption 3727 const Register keylen = L6; // reg for storing expanded key array length 3728 3729 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 3730 // save cipher len to return in the end 3731 __ mov(len_reg, L7); 3732 3733 // load original key from SunJCE expanded decryption key 3734 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3735 for ( int i = 0; i <= 3; i++ ) { 3736 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3737 } 3738 3739 // load initial vector, 8-byte alignment is guaranteed 3740 __ ldx(rvec,0,L0); 3741 __ ldx(rvec,8,L1); 3742 3743 // read expanded key array length 3744 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3745 3746 // 256-bit original key size 3747 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 3748 3749 // 192-bit original key size 3750 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 3751 3752 // 128-bit original key size 3753 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3754 for ( int i = 0; i <= 36; i += 4 ) { 3755 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 3756 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 3757 } 3758 3759 // load expanded key[last-1] and key[last] elements 3760 __ movdtox(F40,L2); 3761 __ movdtox(F42,L3); 3762 3763 __ and3(len_reg, 16, L4); 3764 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 3765 __ nop(); 3766 3767 __ ba_short(L_dec_first_block_start); 3768 3769 __ BIND(L_expand192bit); 3770 // load rest of the 192-bit key 3771 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3772 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 3773 3774 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3775 for ( int i = 0; i <= 36; i += 6 ) { 3776 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 3777 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 3778 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3779 } 3780 __ aes_kexpand1(F42, F46, 7, F48); 3781 __ aes_kexpand2(F44, F48, F50); 3782 3783 // load expanded key[last-1] and key[last] elements 3784 __ movdtox(F48,L2); 3785 __ movdtox(F50,L3); 3786 3787 
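// if the cipher length is an odd multiple of 16 bytes (bit 4 of len_reg set), fall through and decrypt a single first block so that the remaining length is a multiple of 32 for the two-blocks-per-iteration loop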
__ and3(len_reg, 16, L4); 3788 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 3789 __ nop(); 3790 3791 __ ba_short(L_dec_first_block_start); 3792 3793 __ BIND(L_expand256bit); 3794 // load rest of the 256-bit key 3795 for ( int i = 4; i <= 7; i++ ) { 3796 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3797 } 3798 3799 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3800 for ( int i = 0; i <= 40; i += 8 ) { 3801 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 3802 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3803 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 3804 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 3805 } 3806 __ aes_kexpand1(F48, F54, 6, F56); 3807 __ aes_kexpand2(F50, F56, F58); 3808 3809 // load expanded key[last-1] and key[last] elements 3810 __ movdtox(F56,L2); 3811 __ movdtox(F58,L3); 3812 3813 __ and3(len_reg, 16, L4); 3814 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 3815 3816 __ BIND(L_dec_first_block_start); 3817 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3818 __ andcc(from, 7, G0); 3819 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 3820 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 3821 3822 // aligned case: load input into L4 and L5 3823 __ ldx(from,0,L4); 3824 __ ldx(from,8,L5); 3825 __ ba_short(L_transform_first_block); 3826 3827 __ BIND(L_load_misaligned_input_first_block); 3828 __ alignaddr(from, G0, from); 3829 // F58, F60, F62 can be clobbered 3830 __ ldf(FloatRegisterImpl::D, from, 0, F58); 3831 __ ldf(FloatRegisterImpl::D, from, 8, F60); 3832 __ ldf(FloatRegisterImpl::D, from, 16, F62); 3833 __ faligndata(F58, F60, F58); 3834 __ faligndata(F60, F62, F60); 3835 __ movdtox(F58, L4); 3836 __ movdtox(F60, L5); 3837 __ mov(G1, from); 3838 3839 __ BIND(L_transform_first_block); 3840 __ xor3(L2,L4,G1); 3841 __ movxtod(G1,F60); 3842 __ xor3(L3,L5,G1); 3843 __ movxtod(G1,F62); 3844 3845 // 128-bit original key size 3846 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); 3847 3848 // 192-bit original key size 3849 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); 3850 3851 __ aes_dround23(F54, F60, F62, F58); 3852 __ aes_dround01(F52, F60, F62, F56); 3853 __ aes_dround23(F50, F56, F58, F62); 3854 __ aes_dround01(F48, F56, F58, F60); 3855 3856 __ BIND(L_dec_first_block192); 3857 __ aes_dround23(F46, F60, F62, F58); 3858 __ aes_dround01(F44, F60, F62, F56); 3859 __ aes_dround23(F42, F56, F58, F62); 3860 __ aes_dround01(F40, F56, F58, F60); 3861 3862 __ BIND(L_dec_first_block128); 3863 for ( int i = 38; i >= 6; i -= 8 ) { 3864 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 3865 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 3866 if ( i != 6) { 3867 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 3868 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 3869 } else { 3870 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 3871 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 3872 } 3873 } 3874 3875 __ movxtod(L0,F56); 3876 __ movxtod(L1,F58); 3877 __ mov(L4,L0); 3878 __ mov(L5,L1); 3879 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 3880 __ 
fxor(FloatRegisterImpl::D, F58, F62, F62); 3881 3882 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3883 __ andcc(to, 7, G1); 3884 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 3885 __ delayed()->edge8n(to, G0, G2); 3886 3887 // aligned case: store output into the destination array 3888 __ stf(FloatRegisterImpl::D, F60, to, 0); 3889 __ stf(FloatRegisterImpl::D, F62, to, 8); 3890 __ ba_short(L_check_decrypt_end); 3891 3892 __ BIND(L_store_misaligned_output_first_block); 3893 __ add(to, 8, G3); 3894 __ mov(8, G4); 3895 __ sub(G4, G1, G4); 3896 __ alignaddr(G4, G0, G4); 3897 __ faligndata(F60, F60, F60); 3898 __ faligndata(F62, F62, F62); 3899 __ mov(to, G1); 3900 __ and3(to, -8, to); 3901 __ and3(G3, -8, G3); 3902 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 3903 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 3904 __ add(to, 8, to); 3905 __ add(G3, 8, G3); 3906 __ orn(G0, G2, G2); 3907 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 3908 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 3909 __ mov(G1, to); 3910 3911 __ BIND(L_check_decrypt_end); 3912 __ add(from, 16, from); 3913 __ add(to, 16, to); 3914 __ subcc(len_reg, 16, len_reg); 3915 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); 3916 __ delayed()->nop(); 3917 3918 // 256-bit original key size 3919 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); 3920 3921 // 192-bit original key size 3922 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); 3923 3924 __ align(OptoLoopAlignment); 3925 __ BIND(L_dec_next2_blocks128); 3926 __ nop(); 3927 3928 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3929 __ andcc(from, 7, G0); 3930 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); 3931 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 3932 3933 // aligned case: load input into G4, G5, L4 and L5 3934 __ ldx(from,0,G4); 3935 __ ldx(from,8,G5); 3936 __ ldx(from,16,L4); 3937 __ ldx(from,24,L5); 3938 __ ba_short(L_transform_next2_blocks128); 3939 3940 __ BIND(L_load_misaligned_next2_blocks128); 3941 __ alignaddr(from, G0, from); 3942 // F40, F42, F58, F60, F62 can be clobbered 3943 __ ldf(FloatRegisterImpl::D, from, 0, F40); 3944 __ ldf(FloatRegisterImpl::D, from, 8, F42); 3945 __ ldf(FloatRegisterImpl::D, from, 16, F60); 3946 __ ldf(FloatRegisterImpl::D, from, 24, F62); 3947 __ ldf(FloatRegisterImpl::D, from, 32, F58); 3948 __ faligndata(F40, F42, F40); 3949 __ faligndata(F42, F60, F42); 3950 __ faligndata(F60, F62, F60); 3951 __ faligndata(F62, F58, F62); 3952 __ movdtox(F40, G4); 3953 __ movdtox(F42, G5); 3954 __ movdtox(F60, L4); 3955 __ movdtox(F62, L5); 3956 __ mov(G1, from); 3957 3958 __ BIND(L_transform_next2_blocks128); 3959 // F40:F42 used for first 16-bytes 3960 __ xor3(L2,G4,G1); 3961 __ movxtod(G1,F40); 3962 __ xor3(L3,G5,G1); 3963 __ movxtod(G1,F42); 3964 3965 // F60:F62 used for next 16-bytes 3966 __ xor3(L2,L4,G1); 3967 __ movxtod(G1,F60); 3968 __ xor3(L3,L5,G1); 3969 __ movxtod(G1,F62); 3970 3971 for ( int i = 38; i >= 6; i -= 8 ) { 3972 __ aes_dround23(as_FloatRegister(i), F40, F42, F44); 3973 __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); 3974 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 3975 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 3976 if (i != 6 ) 
{ 3977 __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); 3978 __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); 3979 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 3980 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 3981 } else { 3982 __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); 3983 __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); 3984 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 3985 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 3986 } 3987 } 3988 3989 __ movxtod(L0,F46); 3990 __ movxtod(L1,F44); 3991 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 3992 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 3993 3994 __ movxtod(G4,F56); 3995 __ movxtod(G5,F58); 3996 __ mov(L4,L0); 3997 __ mov(L5,L1); 3998 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 3999 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4000 4001 // For mis-aligned store of 32 bytes of result we can do: 4002 // Circular right-shift all 4 FP registers so that 'head' and 'tail' 4003 // parts that need to be stored starting at mis-aligned address are in a FP reg 4004 // the other 3 FP regs can thus be stored using regular store 4005 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 4006 4007 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4008 __ andcc(to, 7, G1); 4009 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 4010 __ delayed()->edge8n(to, G0, G2); 4011 4012 // aligned case: store output into the destination array 4013 __ stf(FloatRegisterImpl::D, F40, to, 0); 4014 __ stf(FloatRegisterImpl::D, F42, to, 8); 4015 __ stf(FloatRegisterImpl::D, F60, to, 16); 4016 __ stf(FloatRegisterImpl::D, F62, to, 24); 4017 __ ba_short(L_check_decrypt_loop_end128); 4018 4019 __ BIND(L_store_misaligned_output_next2_blocks128); 4020 __ mov(8, G4); 4021 __ sub(G4, G1, G4); 4022 __ alignaddr(G4, G0, G4); 4023 __ faligndata(F40, F42, F56); // F56 can be clobbered 4024 __ faligndata(F42, F60, F42); 4025 __ faligndata(F60, F62, F60); 4026 __ faligndata(F62, F40, F40); 4027 __ mov(to, G1); 4028 __ and3(to, -8, to); 4029 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4030 __ stf(FloatRegisterImpl::D, F56, to, 8); 4031 __ stf(FloatRegisterImpl::D, F42, to, 16); 4032 __ stf(FloatRegisterImpl::D, F60, to, 24); 4033 __ add(to, 32, to); 4034 __ orn(G0, G2, G2); 4035 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4036 __ mov(G1, to); 4037 4038 __ BIND(L_check_decrypt_loop_end128); 4039 __ add(from, 32, from); 4040 __ add(to, 32, to); 4041 __ subcc(len_reg, 32, len_reg); 4042 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 4043 __ delayed()->nop(); 4044 __ ba_short(L_cbcdec_end); 4045 4046 __ align(OptoLoopAlignment); 4047 __ BIND(L_dec_next2_blocks192); 4048 __ nop(); 4049 4050 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4051 __ andcc(from, 7, G0); 4052 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 4053 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4054 4055 // aligned case: load input into G4, G5, L4 and L5 4056 __ ldx(from,0,G4); 4057 __ ldx(from,8,G5); 4058 __ ldx(from,16,L4); 4059 __ ldx(from,24,L5); 4060 __ ba_short(L_transform_next2_blocks192); 4061 4062 __ BIND(L_load_misaligned_next2_blocks192); 4063 __ alignaddr(from, G0, from); 4064 // F48, F50, F52, F60, F62 can be 
clobbered 4065 __ ldf(FloatRegisterImpl::D, from, 0, F48); 4066 __ ldf(FloatRegisterImpl::D, from, 8, F50); 4067 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4068 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4069 __ ldf(FloatRegisterImpl::D, from, 32, F52); 4070 __ faligndata(F48, F50, F48); 4071 __ faligndata(F50, F60, F50); 4072 __ faligndata(F60, F62, F60); 4073 __ faligndata(F62, F52, F62); 4074 __ movdtox(F48, G4); 4075 __ movdtox(F50, G5); 4076 __ movdtox(F60, L4); 4077 __ movdtox(F62, L5); 4078 __ mov(G1, from); 4079 4080 __ BIND(L_transform_next2_blocks192); 4081 // F48:F50 used for first 16-bytes 4082 __ xor3(L2,G4,G1); 4083 __ movxtod(G1,F48); 4084 __ xor3(L3,G5,G1); 4085 __ movxtod(G1,F50); 4086 4087 // F60:F62 used for next 16-bytes 4088 __ xor3(L2,L4,G1); 4089 __ movxtod(G1,F60); 4090 __ xor3(L3,L5,G1); 4091 __ movxtod(G1,F62); 4092 4093 for ( int i = 46; i >= 6; i -= 8 ) { 4094 __ aes_dround23(as_FloatRegister(i), F48, F50, F52); 4095 __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); 4096 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4097 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4098 if (i != 6 ) { 4099 __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); 4100 __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); 4101 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4102 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4103 } else { 4104 __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); 4105 __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); 4106 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4107 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4108 } 4109 } 4110 4111 __ movxtod(L0,F54); 4112 __ movxtod(L1,F52); 4113 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 4114 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 4115 4116 __ movxtod(G4,F56); 4117 __ movxtod(G5,F58); 4118 __ mov(L4,L0); 4119 __ mov(L5,L1); 4120 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4121 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4122 4123 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4124 __ andcc(to, 7, G1); 4125 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); 4126 __ delayed()->edge8n(to, G0, G2); 4127 4128 // aligned case: store output into the destination array 4129 __ stf(FloatRegisterImpl::D, F48, to, 0); 4130 __ stf(FloatRegisterImpl::D, F50, to, 8); 4131 __ stf(FloatRegisterImpl::D, F60, to, 16); 4132 __ stf(FloatRegisterImpl::D, F62, to, 24); 4133 __ ba_short(L_check_decrypt_loop_end192); 4134 4135 __ BIND(L_store_misaligned_output_next2_blocks192); 4136 __ mov(8, G4); 4137 __ sub(G4, G1, G4); 4138 __ alignaddr(G4, G0, G4); 4139 __ faligndata(F48, F50, F56); // F56 can be clobbered 4140 __ faligndata(F50, F60, F50); 4141 __ faligndata(F60, F62, F60); 4142 __ faligndata(F62, F48, F48); 4143 __ mov(to, G1); 4144 __ and3(to, -8, to); 4145 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4146 __ stf(FloatRegisterImpl::D, F56, to, 8); 4147 __ stf(FloatRegisterImpl::D, F50, to, 16); 4148 __ stf(FloatRegisterImpl::D, F60, to, 24); 4149 __ add(to, 32, to); 4150 __ orn(G0, G2, G2); 4151 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4152 __ mov(G1, to); 4153 4154 __ BIND(L_check_decrypt_loop_end192); 4155 __ add(from, 32, from); 4156 __ add(to, 32, to); 4157 __ subcc(len_reg, 32, len_reg); 4158 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 4159 __ delayed()->nop(); 4160 __ 
ba_short(L_cbcdec_end); 4161 4162 __ align(OptoLoopAlignment); 4163 __ BIND(L_dec_next2_blocks256); 4164 __ nop(); 4165 4166 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4167 __ andcc(from, 7, G0); 4168 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); 4169 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4170 4171 // aligned case: load input into G4, G5, L4 and L5 4172 __ ldx(from,0,G4); 4173 __ ldx(from,8,G5); 4174 __ ldx(from,16,L4); 4175 __ ldx(from,24,L5); 4176 __ ba_short(L_transform_next2_blocks256); 4177 4178 __ BIND(L_load_misaligned_next2_blocks256); 4179 __ alignaddr(from, G0, from); 4180 // F0, F2, F4, F60, F62 can be clobbered 4181 __ ldf(FloatRegisterImpl::D, from, 0, F0); 4182 __ ldf(FloatRegisterImpl::D, from, 8, F2); 4183 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4184 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4185 __ ldf(FloatRegisterImpl::D, from, 32, F4); 4186 __ faligndata(F0, F2, F0); 4187 __ faligndata(F2, F60, F2); 4188 __ faligndata(F60, F62, F60); 4189 __ faligndata(F62, F4, F62); 4190 __ movdtox(F0, G4); 4191 __ movdtox(F2, G5); 4192 __ movdtox(F60, L4); 4193 __ movdtox(F62, L5); 4194 __ mov(G1, from); 4195 4196 __ BIND(L_transform_next2_blocks256); 4197 // F0:F2 used for first 16-bytes 4198 __ xor3(L2,G4,G1); 4199 __ movxtod(G1,F0); 4200 __ xor3(L3,G5,G1); 4201 __ movxtod(G1,F2); 4202 4203 // F60:F62 used for next 16-bytes 4204 __ xor3(L2,L4,G1); 4205 __ movxtod(G1,F60); 4206 __ xor3(L3,L5,G1); 4207 __ movxtod(G1,F62); 4208 4209 __ aes_dround23(F54, F0, F2, F4); 4210 __ aes_dround01(F52, F0, F2, F6); 4211 __ aes_dround23(F54, F60, F62, F58); 4212 __ aes_dround01(F52, F60, F62, F56); 4213 __ aes_dround23(F50, F6, F4, F2); 4214 __ aes_dround01(F48, F6, F4, F0); 4215 __ aes_dround23(F50, F56, F58, F62); 4216 __ aes_dround01(F48, F56, F58, F60); 4217 // save F48:F54 in temp registers 4218 __ movdtox(F54,G2); 4219 __ movdtox(F52,G3); 4220 __ movdtox(F50,G6); 4221 __ movdtox(F48,G1); 4222 for ( int i = 46; i >= 14; i -= 8 ) { 4223 __ aes_dround23(as_FloatRegister(i), F0, F2, F4); 4224 __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6); 4225 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4226 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4227 __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2); 4228 __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0); 4229 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4230 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4231 } 4232 // init F48:F54 with F0:F6 values (original key) 4233 __ ldf(FloatRegisterImpl::D, original_key, 0, F48); 4234 __ ldf(FloatRegisterImpl::D, original_key, 8, F50); 4235 __ ldf(FloatRegisterImpl::D, original_key, 16, F52); 4236 __ ldf(FloatRegisterImpl::D, original_key, 24, F54); 4237 __ aes_dround23(F54, F0, F2, F4); 4238 __ aes_dround01(F52, F0, F2, F6); 4239 __ aes_dround23(F54, F60, F62, F58); 4240 __ aes_dround01(F52, F60, F62, F56); 4241 __ aes_dround23_l(F50, F6, F4, F2); 4242 __ aes_dround01_l(F48, F6, F4, F0); 4243 __ aes_dround23_l(F50, F56, F58, F62); 4244 __ aes_dround01_l(F48, F56, F58, F60); 4245 // re-init F48:F54 with their original values 4246 __ movxtod(G2,F54); 4247 __ movxtod(G3,F52); 4248 __ movxtod(G6,F50); 4249 __ movxtod(G1,F48); 4250 4251 __ movxtod(L0,F6); 4252 __ movxtod(L1,F4); 4253 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 4254 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 4255 4256 __ movxtod(G4,F56); 4257 __ 
movxtod(G5,F58); 4258 __ mov(L4,L0); 4259 __ mov(L5,L1); 4260 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4261 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4262 4263 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4264 __ andcc(to, 7, G1); 4265 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); 4266 __ delayed()->edge8n(to, G0, G2); 4267 4268 // aligned case: store output into the destination array 4269 __ stf(FloatRegisterImpl::D, F0, to, 0); 4270 __ stf(FloatRegisterImpl::D, F2, to, 8); 4271 __ stf(FloatRegisterImpl::D, F60, to, 16); 4272 __ stf(FloatRegisterImpl::D, F62, to, 24); 4273 __ ba_short(L_check_decrypt_loop_end256); 4274 4275 __ BIND(L_store_misaligned_output_next2_blocks256); 4276 __ mov(8, G4); 4277 __ sub(G4, G1, G4); 4278 __ alignaddr(G4, G0, G4); 4279 __ faligndata(F0, F2, F56); // F56 can be clobbered 4280 __ faligndata(F2, F60, F2); 4281 __ faligndata(F60, F62, F60); 4282 __ faligndata(F62, F0, F0); 4283 __ mov(to, G1); 4284 __ and3(to, -8, to); 4285 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4286 __ stf(FloatRegisterImpl::D, F56, to, 8); 4287 __ stf(FloatRegisterImpl::D, F2, to, 16); 4288 __ stf(FloatRegisterImpl::D, F60, to, 24); 4289 __ add(to, 32, to); 4290 __ orn(G0, G2, G2); 4291 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4292 __ mov(G1, to); 4293 4294 __ BIND(L_check_decrypt_loop_end256); 4295 __ add(from, 32, from); 4296 __ add(to, 32, to); 4297 __ subcc(len_reg, 32, len_reg); 4298 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); 4299 __ delayed()->nop(); 4300 4301 __ BIND(L_cbcdec_end); 4302 // re-init initial vector for next block, 8-byte alignment is guaranteed 4303 __ stx(L0, rvec, 0); 4304 __ stx(L1, rvec, 8); 4305 __ mov(L7, I0); 4306 __ ret(); 4307 __ delayed()->restore(); 4308 4309 return start; 4310 } 4311 4312 address generate_sha1_implCompress(bool multi_block, const char *name) { 4313 __ align(CodeEntryAlignment); 4314 StubCodeMark mark(this, "StubRoutines", name); 4315 address start = __ pc(); 4316 4317 Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop; 4318 int i; 4319 4320 Register buf = O0; // byte[] source+offset 4321 Register state = O1; // int[] SHA.state 4322 Register ofs = O2; // int offset 4323 Register limit = O3; // int limit 4324 4325 // load state into F0-F4 4326 for (i = 0; i < 5; i++) { 4327 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4328 } 4329 4330 __ andcc(buf, 7, G0); 4331 __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input); 4332 __ delayed()->nop(); 4333 4334 __ BIND(L_sha1_loop); 4335 // load buf into F8-F22 4336 for (i = 0; i < 8; i++) { 4337 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4338 } 4339 __ sha1(); 4340 if (multi_block) { 4341 __ add(ofs, 64, ofs); 4342 __ add(buf, 64, buf); 4343 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop); 4344 __ mov(ofs, O0); // to be returned 4345 } 4346 4347 // store F0-F4 into state and return 4348 for (i = 0; i < 4; i++) { 4349 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4350 } 4351 __ retl(); 4352 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4353 4354 __ BIND(L_sha1_unaligned_input); 4355 __ alignaddr(buf, G0, buf); 4356 4357 __ BIND(L_sha1_unaligned_input_loop); 4358 // load buf into F8-F22 4359 for (i = 0; i < 9; i++) { 4360 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 
8)); 4361 } 4362 for (i = 0; i < 8; i++) { 4363 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4364 } 4365 __ sha1(); 4366 if (multi_block) { 4367 __ add(ofs, 64, ofs); 4368 __ add(buf, 64, buf); 4369 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop); 4370 __ mov(ofs, O0); // to be returned 4371 } 4372 4373 // store F0-F4 into state and return 4374 for (i = 0; i < 4; i++) { 4375 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4376 } 4377 __ retl(); 4378 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4379 4380 return start; 4381 } 4382 4383 address generate_sha256_implCompress(bool multi_block, const char *name) { 4384 __ align(CodeEntryAlignment); 4385 StubCodeMark mark(this, "StubRoutines", name); 4386 address start = __ pc(); 4387 4388 Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop; 4389 int i; 4390 4391 Register buf = O0; // byte[] source+offset 4392 Register state = O1; // int[] SHA2.state 4393 Register ofs = O2; // int offset 4394 Register limit = O3; // int limit 4395 4396 // load state into F0-F7 4397 for (i = 0; i < 8; i++) { 4398 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4399 } 4400 4401 __ andcc(buf, 7, G0); 4402 __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input); 4403 __ delayed()->nop(); 4404 4405 __ BIND(L_sha256_loop); 4406 // load buf into F8-F22 4407 for (i = 0; i < 8; i++) { 4408 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4409 } 4410 __ sha256(); 4411 if (multi_block) { 4412 __ add(ofs, 64, ofs); 4413 __ add(buf, 64, buf); 4414 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop); 4415 __ mov(ofs, O0); // to be returned 4416 } 4417 4418 // store F0-F7 into state and return 4419 for (i = 0; i < 7; i++) { 4420 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4421 } 4422 __ retl(); 4423 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4424 4425 __ BIND(L_sha256_unaligned_input); 4426 __ alignaddr(buf, G0, buf); 4427 4428 __ BIND(L_sha256_unaligned_input_loop); 4429 // load buf into F8-F22 4430 for (i = 0; i < 9; i++) { 4431 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4432 } 4433 for (i = 0; i < 8; i++) { 4434 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4435 } 4436 __ sha256(); 4437 if (multi_block) { 4438 __ add(ofs, 64, ofs); 4439 __ add(buf, 64, buf); 4440 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop); 4441 __ mov(ofs, O0); // to be returned 4442 } 4443 4444 // store F0-F7 into state and return 4445 for (i = 0; i < 7; i++) { 4446 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4447 } 4448 __ retl(); 4449 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4450 4451 return start; 4452 } 4453 4454 address generate_sha512_implCompress(bool multi_block, const char *name) { 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", name); 4457 address start = __ pc(); 4458 4459 Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop; 4460 int i; 4461 4462 Register buf = O0; // byte[] source+offset 4463 Register state = O1; // long[] SHA5.state 4464 Register ofs = O2; // int offset 4465 Register limit = O3; // int limit 4466 4467 // load state into F0-F14 4468 for (i = 0; i < 8; i++) { 4469 __ 
ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2)); 4470 } 4471 4472 __ andcc(buf, 7, G0); 4473 __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input); 4474 __ delayed()->nop(); 4475 4476 __ BIND(L_sha512_loop); 4477 // load buf into F16-F46 4478 for (i = 0; i < 16; i++) { 4479 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4480 } 4481 __ sha512(); 4482 if (multi_block) { 4483 __ add(ofs, 128, ofs); 4484 __ add(buf, 128, buf); 4485 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop); 4486 __ mov(ofs, O0); // to be returned 4487 } 4488 4489 // store F0-F14 into state and return 4490 for (i = 0; i < 7; i++) { 4491 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4492 } 4493 __ retl(); 4494 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4495 4496 __ BIND(L_sha512_unaligned_input); 4497 __ alignaddr(buf, G0, buf); 4498 4499 __ BIND(L_sha512_unaligned_input_loop); 4500 // load buf into F16-F46 4501 for (i = 0; i < 17; i++) { 4502 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4503 } 4504 for (i = 0; i < 16; i++) { 4505 __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16)); 4506 } 4507 __ sha512(); 4508 if (multi_block) { 4509 __ add(ofs, 128, ofs); 4510 __ add(buf, 128, buf); 4511 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop); 4512 __ mov(ofs, O0); // to be returned 4513 } 4514 4515 // store F0-F14 into state and return 4516 for (i = 0; i < 7; i++) { 4517 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4518 } 4519 __ retl(); 4520 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4521 4522 return start; 4523 } 4524 4525 /* Single and multi-block ghash operations */ 4526 address generate_ghash_processBlocks() { 4527 __ align(CodeEntryAlignment); 4528 Label L_ghash_loop, L_aligned, L_main; 4529 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4530 address start = __ pc(); 4531 4532 Register state = I0; 4533 Register subkeyH = I1; 4534 Register data = I2; 4535 Register len = I3; 4536 4537 __ save_frame(0); 4538 4539 __ ldx(state, 0, O0); 4540 __ ldx(state, 8, O1); 4541 4542 // Loop label for multiblock operations 4543 __ BIND(L_ghash_loop); 4544 4545 // Check if 'data' is unaligned 4546 __ andcc(data, 7, G1); 4547 __ br(Assembler::zero, false, Assembler::pt, L_aligned); 4548 __ delayed()->nop(); 4549 4550 Register left_shift = L1; 4551 Register right_shift = L2; 4552 Register data_ptr = L3; 4553 4554 // Get left and right shift values in bits 4555 __ sll(G1, LogBitsPerByte, left_shift); 4556 __ mov(64, right_shift); 4557 __ sub(right_shift, left_shift, right_shift); 4558 4559 // Align to read 'data' 4560 __ sub(data, G1, data_ptr); 4561 4562 // Load first 8 bytes of 'data' 4563 __ ldx(data_ptr, 0, O4); 4564 __ sllx(O4, left_shift, O4); 4565 __ ldx(data_ptr, 8, O5); 4566 __ srlx(O5, right_shift, G4); 4567 __ bset(G4, O4); 4568 4569 // Load second 8 bytes of 'data' 4570 __ sllx(O5, left_shift, O5); 4571 __ ldx(data_ptr, 16, G4); 4572 __ srlx(G4, right_shift, G4); 4573 __ ba(L_main); 4574 __ delayed()->bset(G4, O5); 4575 4576 // If 'data' is aligned, load normally 4577 __ BIND(L_aligned); 4578 __ ldx(data, 0, O4); 4579 __ ldx(data, 8, O5); 4580 4581 __ BIND(L_main); 4582 __ ldx(subkeyH, 0, O2); 4583 __ ldx(subkeyH, 8, O3); 4584 4585 __ xor3(O0, O4, O0); 4586 __ xor3(O1, O5, O1); 4587 4588 __ xmulxhi(O0, O3, G3); 4589 __ xmulx(O0, O2, 
O5); 4590 __ xmulxhi(O1, O2, G4); 4591 __ xmulxhi(O1, O3, G5); 4592 __ xmulx(O0, O3, G1); 4593 __ xmulx(O1, O3, G2); 4594 __ xmulx(O1, O2, O3); 4595 __ xmulxhi(O0, O2, O4); 4596 4597 __ mov(0xE1, O0); 4598 __ sllx(O0, 56, O0); 4599 4600 __ xor3(O5, G3, O5); 4601 __ xor3(O5, G4, O5); 4602 __ xor3(G5, G1, G1); 4603 __ xor3(G1, O3, G1); 4604 __ srlx(G2, 63, O1); 4605 __ srlx(G1, 63, G3); 4606 __ sllx(G2, 63, O3); 4607 __ sllx(G2, 58, O2); 4608 __ xor3(O3, O2, O2); 4609 4610 __ sllx(G1, 1, G1); 4611 __ or3(G1, O1, G1); 4612 4613 __ xor3(G1, O2, G1); 4614 4615 __ sllx(G2, 1, G2); 4616 4617 __ xmulxhi(G1, O0, O1); 4618 __ xmulx(G1, O0, O2); 4619 __ xmulxhi(G2, O0, O3); 4620 __ xmulx(G2, O0, G1); 4621 4622 __ xor3(O4, O1, O4); 4623 __ xor3(O5, O2, O5); 4624 __ xor3(O5, O3, O5); 4625 4626 __ sllx(O4, 1, O2); 4627 __ srlx(O5, 63, O3); 4628 4629 __ or3(O2, O3, O0); 4630 4631 __ sllx(O5, 1, O1); 4632 __ srlx(G1, 63, O2); 4633 __ or3(O1, O2, O1); 4634 __ xor3(O1, G3, O1); 4635 4636 __ deccc(len); 4637 __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop); 4638 __ delayed()->add(data, 16, data); 4639 4640 __ stx(O0, I0, 0); 4641 __ stx(O1, I0, 8); 4642 4643 __ ret(); 4644 __ delayed()->restore(); 4645 4646 return start; 4647 } 4648 4649 /** 4650 * Arguments: 4651 * 4652 * Inputs: 4653 * O0 - int crc 4654 * O1 - byte* buf 4655 * O2 - int len 4656 * O3 - int* table 4657 * 4658 * Output: 4659 * O0 - int crc result 4660 */ 4661 address generate_updateBytesCRC32C() { 4662 assert(UseCRC32CIntrinsics, "need CRC32C instruction"); 4663 4664 __ align(CodeEntryAlignment); 4665 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4666 address start = __ pc(); 4667 4668 const Register crc = O0; // crc 4669 const Register buf = O1; // source java byte array address 4670 const Register len = O2; // number of bytes 4671 const Register table = O3; // byteTable 4672 4673 __ kernel_crc32c(crc, buf, len, table); 4674 4675 __ retl(); 4676 __ delayed()->nop(); 4677 4678 return start; 4679 } 4680 4681 #define ADLER32_NUM_TEMPS 16 4682 4683 /** 4684 * Arguments: 4685 * 4686 * Inputs: 4687 * O0 - int adler 4688 * O1 - byte* buff 4689 * O2 - int len 4690 * 4691 * Output: 4692 * O0 - int adler result 4693 */ 4694 address generate_updateBytesAdler32() { 4695 __ align(CodeEntryAlignment); 4696 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4697 address start = __ pc(); 4698 4699 Label L_cleanup_loop, L_cleanup_loop_check; 4700 Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check; 4701 Label L_nmax_check_done; 4702 4703 // Aliases 4704 Register s1 = O0; 4705 Register s2 = O3; 4706 Register buff = O1; 4707 Register len = O2; 4708 Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7}; 4709 4710 // Max number of bytes we can process before having to take the mod 4711 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4712 unsigned long NMAX = 0x15B0; 4713 4714 // Zero-out the upper bits of len 4715 __ clruwu(len); 4716 4717 // Create the mask 0xFFFF 4718 __ set64(0x00FFFF, O4, O5); // O5 is the temp register 4719 4720 // s1 is initialized to the lower 16 bits of adler 4721 // s2 is initialized to the upper 16 bits of adler 4722 __ srlx(O0, 16, O5); // adler >> 16 4723 __ and3(O0, O4, s1); // s1 = (adler & 0xFFFF) 4724 __ and3(O5, O4, s2); // s2 = ((adler >> 16) & 0xFFFF) 4725 4726 // The pipelined loop needs at least 16 elements for 1 iteration 4727 // It does check this, but it is more effective to 
skip to the cleanup loop 4728 // Setup the constant for cutoff checking 4729 __ mov(15, O4); 4730 4731 // Check if we are above the cutoff, if not go to the cleanup loop immediately 4732 __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check); 4733 4734 // Free up some registers for our use 4735 for (int i = 0; i < ADLER32_NUM_TEMPS; i++) { 4736 __ movxtod(temp[i], as_FloatRegister(2*i)); 4737 } 4738 4739 // Loop maintenance stuff is done at the end of the loop, so skip to there 4740 __ ba_short(L_main_loop_check); 4741 4742 __ BIND(L_main_loop); 4743 4744 // Prologue for inner loop 4745 __ ldub(buff, 0, L0); 4746 __ dec(O5); 4747 4748 for (int i = 1; i < 8; i++) { 4749 __ ldub(buff, i, temp[i]); 4750 } 4751 4752 __ inc(buff, 8); 4753 4754 // Inner loop processes 16 elements at a time, might never execute if only 16 elements 4755 // to be processed by the outer loop 4756 __ ba_short(L_inner_loop_check); 4757 4758 __ BIND(L_inner_loop); 4759 4760 for (int i = 0; i < 8; i++) { 4761 __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]); 4762 __ add(s1, temp[i], s1); 4763 __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]); 4764 __ add(s2, s1, s2); 4765 } 4766 4767 // Original temp 0-7 used and new loads to temp 0-7 issued 4768 // temp 8-15 ready to be consumed 4769 __ add(s1, I0, s1); 4770 __ dec(O5); 4771 __ add(s2, s1, s2); 4772 __ add(s1, I1, s1); 4773 __ inc(buff, 16); 4774 __ add(s2, s1, s2); 4775 4776 for (int i = 0; i < 6; i++) { 4777 __ add(s1, temp[10+i], s1); 4778 __ add(s2, s1, s2); 4779 } 4780 4781 __ BIND(L_inner_loop_check); 4782 __ nop(); 4783 __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop); 4784 4785 // Epilogue 4786 for (int i = 0; i < 4; i++) { 4787 __ ldub(buff, (2*i), temp[8+(2*i)]); 4788 __ add(s1, temp[i], s1); 4789 __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]); 4790 __ add(s2, s1, s2); 4791 } 4792 4793 __ add(s1, temp[4], s1); 4794 __ inc(buff, 8); 4795 4796 for (int i = 0; i < 11; i++) { 4797 __ add(s2, s1, s2); 4798 __ add(s1, temp[5+i], s1); 4799 } 4800 4801 __ add(s2, s1, s2); 4802 4803 // Take the mod for s1 and s2 4804 __ set64(0xFFF1, L0, L1); 4805 __ udivx(s1, L0, L1); 4806 __ udivx(s2, L0, L2); 4807 __ mulx(L0, L1, L1); 4808 __ mulx(L0, L2, L2); 4809 __ sub(s1, L1, s1); 4810 __ sub(s2, L2, s2); 4811 4812 // Make sure there is something left to process 4813 __ BIND(L_main_loop_check); 4814 __ set64(NMAX, L0, L1); 4815 // k = len < NMAX ? 
#define ADLER32_NUM_TEMPS 16

  /**
   * Arguments:
   *
   * Inputs:
   *   O0 - int   adler
   *   O1 - byte* buff
   *   O2 - int   len
   *
   * Output:
   *   O0 - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_cleanup_loop, L_cleanup_loop_check;
    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
    Label L_nmax_check_done;

    // Aliases
    Register s1   = O0;
    Register s2   = O3;
    Register buff = O1;
    Register len  = O2;
    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long NMAX = 0x15B0;

    // Zero-out the upper bits of len
    __ clruwu(len);

    // Create the mask 0xFFFF
    __ set64(0x00FFFF, O4, O5);  // O5 is the temp register

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srlx(O0, 16, O5);  // adler >> 16
    __ and3(O0, O4, s1);  // s1 = (adler & 0xFFFF)
    __ and3(O5, O4, s2);  // s2 = ((adler >> 16) & 0xFFFF)

    // The pipelined loop needs at least 16 elements per iteration. The loop
    // checks this itself, but it is more efficient to skip straight to the
    // cleanup loop for short inputs.
    // Set up the constant for the cutoff check.
    __ mov(15, O4);

    // Check if we are above the cutoff; if not, go to the cleanup loop immediately.
    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);

    // Free up some registers for our use
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movxtod(temp[i], as_FloatRegister(2*i));
    }

    // Loop maintenance is done at the end of the loop, so skip to there.
    __ ba_short(L_main_loop_check);

    __ BIND(L_main_loop);

    // Prologue for inner loop
    __ ldub(buff, 0, L0);
    __ dec(O5);

    for (int i = 1; i < 8; i++) {
      __ ldub(buff, i, temp[i]);
    }

    __ inc(buff, 8);

    // The inner loop processes 16 elements at a time and might never execute
    // if the outer loop has only 16 elements left to process.
    __ ba_short(L_inner_loop_check);

    __ BIND(L_inner_loop);

    for (int i = 0; i < 8; i++) {
      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
      __ add(s2, s1, s2);
    }

    // Original temps 0-7 consumed and new loads to temps 0-7 issued;
    // temps 8-15 are ready to be consumed.
    __ add(s1, I0, s1);
    __ dec(O5);
    __ add(s2, s1, s2);
    __ add(s1, I1, s1);
    __ inc(buff, 16);
    __ add(s2, s1, s2);

    for (int i = 0; i < 6; i++) {
      __ add(s1, temp[10+i], s1);
      __ add(s2, s1, s2);
    }

    __ BIND(L_inner_loop_check);
    __ nop();
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);

    // Epilogue
    for (int i = 0; i < 4; i++) {
      __ ldub(buff, (2*i), temp[8+(2*i)]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
      __ add(s2, s1, s2);
    }

    __ add(s1, temp[4], s1);
    __ inc(buff, 8);

    for (int i = 0; i < 11; i++) {
      __ add(s2, s1, s2);
      __ add(s1, temp[5+i], s1);
    }

    __ add(s2, s1, s2);

    // Take the mod for s1 and s2 (BASE = 0xFFF1 = 65521)
    __ set64(0xFFF1, L0, L1);
    __ udivx(s1, L0, L1);
    __ udivx(s2, L0, L2);
    __ mulx(L0, L1, L1);
    __ mulx(L0, L2, L2);
    __ sub(s1, L1, s1);  // s1 -= (s1 / BASE) * BASE, i.e. s1 %= BASE
    __ sub(s2, L2, s2);

    // Make sure there is something left to process
    __ BIND(L_main_loop_check);
    __ set64(NMAX, L0, L1);
    // k = len < NMAX ? len : NMAX
    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
    __ andn(len, 0x0F, L0);  // only loop a multiple of 16 times
    __ BIND(L_nmax_check_done);
    __ mov(L0, O5);
    __ sub(len, L0, len);  // len -= k

    __ srlx(O5, 4, O5);  // multiples of 16
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);

    // Restore the registers we saved, take the mod one last time, combine and return
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movdtox(as_FloatRegister(2*i), temp[i]);
    }

    // There might be nothing left to process
    __ ba_short(L_cleanup_loop_check);

    __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4);  // load a single byte from the buffer
    __ inc(buff);          // buff++
    __ add(s1, O4, s1);    // s1 += *buff++;
    __ dec(len);           // len--
    __ add(s1, s2, s2);    // s2 += s1;
    __ BIND(L_cleanup_loop_check);
    __ nop();
    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);

    // Take the mod one last time
    __ set64(0xFFF1, O1, O2);
    __ udivx(s1, O1, O2);
    __ udivx(s2, O1, O5);
    __ mulx(O1, O2, O2);
    __ mulx(O1, O5, O5);
    __ sub(s1, O2, s1);
    __ sub(s2, O5, s2);

    // Combine lower bits and higher bits
    __ sllx(s2, 16, s2);  // s2 = s2 << 16
    __ or3(s1, s2, s1);   // adler = s2 | s1
    // Final return value is in O0
    __ retl();
    __ delayed()->nop();

    return start;
  }
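  // For reference only: a straight-line scalar Adler-32 with the same NMAX
  // batching as the stub above. Summing at most NMAX = 5552 bytes between
  // mods keeps s2 <= 255*n*(n+1)/2 + (n+1)*(BASE-1) below 2^32-1, so a 32-bit
  // accumulator cannot overflow (we use 64-bit here for extra slack). The
  // helper name adler32_sketch is ours, not a HotSpot API, and this is a
  // sketch of the algorithm rather than the stub's exact code.
  static juint adler32_sketch(juint adler, const jubyte* buff, int len) {
    const juint BASE = 65521;            // 0xFFF1, largest prime below 2^16
    const int   NMAX = 5552;             // 0x15B0, see the bound above
    julong s1 = adler & 0xFFFF;
    julong s2 = (adler >> 16) & 0xFFFF;
    while (len > 0) {
      int k = len < NMAX ? len : NMAX;   // k = min(len, NMAX)
      len -= k;
      while (k-- > 0) {
        s1 += *buff++;                   // s1 is the running byte sum
        s2 += s1;                        // s2 is the running sum of s1
      }
      s1 %= BASE;                        // deferred mods, once per batch
      s2 %= BASE;
    }
    return (juint)((s2 << 16) | s1);     // combine halves as the stub does
  }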
  /**
   * Arguments:
   *
   * Inputs:
   *   O0 - int   crc
   *   O1 - byte* buf
   *   O2 - int   len
   *   O3 - int*  table
   *
   * Output:
   *   O0 - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need VIS3 instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
    address start = __ pc();

    const Register crc   = O0;  // crc
    const Register buf   = O1;  // source java byte array address
    const Register len   = O2;  // length
    const Register table = O3;  // crc_table address (reuse register)

    __ kernel_crc32(crc, buf, len, table);

    __ retl();
    __ delayed()->nop();

    return start;
  }
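  // The register interface documented above (arguments in O0-O3, result in
  // O0, leaf return) lines up with the SPARC C calling convention, so the
  // generated code can be reached through a plain function pointer. An
  // illustration only; the typedef name is ours, not a HotSpot API.
  typedef juint (*updateBytesCRC32_stub_t)(juint crc, const jubyte* buf,
                                           jint len, const juint* table);
  // Hypothetical call site, once the stub has been generated:
  //   updateBytesCRC32_stub_t f =
  //       CAST_TO_FN_PTR(updateBytesCRC32_stub_t, StubRoutines::_updateBytesCRC32);
  //   juint crc = f(initial_crc, bytes, n, (const juint*)StubRoutines::_crc_table_adr);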
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
    // than the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry                 = generate_forward_exception();

    StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry                   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
            generate_throw_exception("StackOverflowError throw_exception",
                                     CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
            generate_throw_exception("delayed StackOverflowError throw_exception",
                                     CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));

    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }
  }


  void generate_all() {
    // Generates all stubs and initializes the entry points

    // Generate partial_subtype_check first here since its code depends on
    // UseZeroBaseCompressedOops which is defined after heap initialization.
    StubRoutines::Sparc::_partial_subtype_check             = generate_partial_subtype_check();
    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    StubRoutines::_throw_AbstractMethodError_entry          = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry = generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Don't initialize the platform math functions since sparc
    // doesn't have intrinsics for these operations.

    // SafeFetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

    // generate AES intrinsics code
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock               = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock               = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // generate SHA1/SHA256/SHA512 intrinsics code
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
  }
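  // The SafeFetch stubs registered above provide loads that tolerate faulting
  // addresses: if the load at the recorded fault pc traps, the signal handler
  // resumes at the continuation pc and the caller sees the supplied error
  // value instead of a crash. SafeFetch32 itself is a real HotSpot helper
  // declared in stubRoutines.hpp; the wrapper below is only our illustrative
  // sketch of how it is meant to be used.
  static int probe_int_sketch(int* addr) {
    const int sentinel = -1;  // hypothetical "could not read" marker
    // Returns *addr, or sentinel if reading *addr would fault.
    return SafeFetch32(addr, sentinel);
  }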
 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
  }


 private:
  int _stub_count;
  void stub_prolog(StubCodeDesc* cdesc) {
# ifdef ASSERT
    // Put extra information in the stub code, to make it more readable.
    // Write the high part of the address.
    // [RGV] Check if there is a dependency on the size of this prolog.
    __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
    __ emit_data((intptr_t)cdesc,       relocInfo::none);
    __ emit_data(++_stub_count,         relocInfo::none);
# endif
    align(true);
  }

  void align(bool at_header = false) {
    // %%%%% move this constant somewhere else
    // UltraSPARC cache line size is 8 instructions:
    const unsigned int icache_line_size      = 32;
    const unsigned int icache_half_line_size = 16;

    if (at_header) {
      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
        __ emit_data(0, relocInfo::none);
      }
    } else {
      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
        __ nop();
      }
    }
  }

}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
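// For context: StubGenerator_generate is driven from stubRoutines.cpp in two
// phases, once before and once after universe initialization. A simplified
// sketch of the call site, with buffer sizing and error handling elided (see
// StubRoutines::initialize1() and initialize2() for the real code):
//
//   BufferBlob* blob = BufferBlob::create("StubRoutines (1)", code_size1);
//   CodeBuffer buffer(blob);
//   StubGenerator_generate(&buffer, false);  // 'initial' stubs
//   ...
//   StubGenerator_generate(&buffer2, true);  // everything else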