/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note:  The register L7 is used as L7_thread_cache, and may not be used
//        any other way within this module.

static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    //     +---------------+ <--- sp + 0
    //     |               |
    //     . reg save area .
    //     |               |
    //     +---------------+ <--- sp + 0x40
    //     |               |
    //     . extra 7 slots .
    //     |               |
    //     +---------------+ <--- sp + 0x5c
    //     | param. size   |
    //     +---------------+ <--- sp + 0x60
    //     | thread        |
    //     +---------------+
    //     |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // | empty slot    |   (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // | param. size   |
    // +---------------+ <--- fp + 0x60
    // | thread        |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);           // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t);  // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);     // compute number of bytes
    __ sub(FP, t, Gargs);                               // setup parameter pointer
    __ add( Gargs, STACK_BIAS, Gargs );                 // Account for LP64 stack bias
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert on the change of SP.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);  // store entire long
    }
    return start;
  }

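  // For reference, the C side reaches the stub above through the CallStub
  // function pointer type (a sketch of the shape declared in
  // stubRoutines.hpp; the incoming-argument list at the top of
  // generate_call_stub() mirrors these parameters one-for-one):
  //
  //   typedef void (*CallStub)(address   link,            // O0: call wrapper
  //                            intptr_t* result,          // O1
  //                            BasicType result_type,     // O2
  //                            Method*   method,          // O3
  //                            address   entry_point,     // O4
  //                            intptr_t* parameters,      // O5
  //                            int       size_of_parameters,
  //                            TRAPS);
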
  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   o0 = adr
    //   o1 = errValue
    //
    // result:
    //   o0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    __ align(CodeEntryAlignment);
    *entry = __ pc();

    __ mov(O0, G1);  // g1 = o0
    __ mov(O1, O0);  // o0 = o1
    // Load *adr into O0; this load may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldsw(G1, 0, O0);  // o0 = [g1]
        break;
      case 8:
        // int64_t
        __ ldx(G1, 0, O0);   // o0 = [g1]
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    // By convention with the trap handler we ensure there is a non-CTI
    // instruction in the trap shadow.
    __ nop();
    __ retl();
    __ delayed()->nop();
  }
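  // Minimal usage sketch for the stubs above: probe a possibly-unmapped
  // address without crashing the VM. If the load at fault_pc traps, the
  // signal handler resumes execution at continuation_pc with the error
  // value still in O0.
  //
  //   int probe(int* adr) {
  //     return SafeFetch32(adr, -1);   // *adr, or -1 if the load faults
  //   }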
  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size = 32;

    CodeBuffer code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();               // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }

#undef __
#define __ _masm->


  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20; i < 32; i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0; i < 8; ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flushw();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }
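  // Note on the stub above: flushw forces all in-use register windows out to
  // their stack save areas, so the caller's window contents become visible
  // in memory at the returned (bias-adjusted) frame pointer. Stack walkers
  // depend on this before inspecting caller frames.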
  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller
    } else {
      __ retl(false);
      __ delayed()->swap(O1, 0, O0);
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas(O1, O2, O0);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0, O1, O0);   // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);   // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }
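  // Packing sketch for the stub above: each jlong arrives split across a
  // register pair and is reassembled for the 64-bit casx, roughly:
  //
  //   uint64_t exchange = ((uint64_t)O0 << 32) | (uint32_t)O1; // sllx/srl/or3
  //   uint64_t compare  = ((uint64_t)O3 << 32) | (uint32_t)O4;
  //   uint64_t old = *dest;                   // casx performs these three
  //   if (old == compare) *dest = exchange;   // steps atomically
  //   return old;                             // re-split into O1:O0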
  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites: O3
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    Label retry;
    __ BIND(retry);

    __ lduw(O1, 0, O2);
    __ add(O0, O2, O3);
    __ cas(O1, O2, O3);
    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
    __ retl(false);
    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs
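  // The stub above is the classic fetch-and-add CAS retry loop; in C-like
  // pseudo-code (a sketch only; note the stub returns the *new* value):
  //
  //   do {
  //     old = *dest;                                     // lduw
  //   } while (cas(dest, old, old + add_value) != old);  // cas + compare
  //   return old + add_value;                            // delayed add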
  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);   // set Z flags, Z result

    __ ret();              // Result in Rret is zero; flags set to Z
    __ delayed()->restore();

    __ BIND(miss);
    __ addcc(G0,1,Rret);   // set NZ flags, NZ result

    __ ret();              // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  // Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
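  // The two unsigned branches above encode the "forward copy is safe"
  // condition; as a C predicate (sketch):
  //
  //   bool no_overlap(char* from, char* to, size_t byte_count) {
  //     return to <= from                          // lessEqualUnsigned branch
  //         || (size_t)(to - from) >= byte_count;  // dest begins past the source
  //   }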
  // Generate code for an array load barrier
  //
  //     addr  -  starting address
  //     count -  element count
  //
  //     Destroys no registers!
  //
  void gen_load_ref_array_barrier(Register addr, Register count) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::Z:
        __ save_frame_and_mov(0, addr, O0, count, O1);
        // Save the necessary global regs... will be used after.
        __ call(CAST_FROM_FN_PTR(address, static_cast<void (*)(volatile oop*, size_t)>(ZBarrier::load_barrier_on_oop_array)));
        __ delayed()->nop();
        __ restore();
        break;
      case BarrierSet::G1BarrierSet:
      case BarrierSet::CardTableModRef:
        // No barrier
        break;
      default:
        ShouldNotReachHere();
        break;
    }
  }

  //
  // Generate pre-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1BarrierSet:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          Register tmp = O5;
          assert_different_registers(addr, count, tmp);
          Label filtered;
          // Load the marking-active flag
          if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
            __ ld(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
          } else {
            guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1,
                      "Assumption");
            __ ldsb(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
          }
          // Is marking active?
          __ cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);

          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();

          __ bind(filtered);
          DEBUG_ONLY(__ set(0xDEADC0DE, tmp);) // we have killed tmp
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::Z:
        break;
      default:
        ShouldNotReachHere();
    }
  }
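  // VM-side sketch of the pre-barrier call above, assuming the usual G1
  // SATB (snapshot-at-the-beginning) semantics: while concurrent marking is
  // active, the oops about to be overwritten are enqueued so the marker
  // still visits them.
  //
  //   // static_write_ref_array_pre, roughly:
  //   for (size_t i = 0; i < count; i++)
  //     if (dst[i] != NULL) satb_enqueue(dst[i]);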
  //
  // Generate post-write barrier for array.
  //
  // Input:
  //   addr     - register containing starting address
  //   count    - register containing element count
  //   tmp      - scratch register
  //
  // The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1BarrierSet:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
        {
          CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
          CardTable* ct = ctbs->card_table();
          assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop, L_done;

          __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_done); // zero count - nothing to do

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
          __ srl_ptr(addr, CardTable::card_shift, addr);
          __ srl_ptr(count, CardTable::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base());
          __ set(rs, tmp);
          __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
          __ BIND(L_done);
        }
        break;
      case BarrierSet::ModRef:
      case BarrierSet::Z:
        break;
      default:
        ShouldNotReachHere();
    }
  }
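  // Card-table sketch: the CardTableModRef case above dirties every card
  // spanned by the stored oops; the equivalent C logic (base and card_shift
  // from CardTable) is roughly:
  //
  //   jbyte* first = base + ( (uintptr_t)addr                                       >> card_shift);
  //   jbyte* last  = base + (((uintptr_t)addr + count*BytesPerHeapOop
  //                                           - BytesPerHeapOop)                    >> card_shift);
  //   do { *first++ = 0; } while (first <= last);   // 0 = dirty (the stb of G0)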
  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes+iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count remains non-negative

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, O4, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ set(prefetch_count, O4);
        __ add(count, O4, count);

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, O4, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ set(prefetch_count, O4);
      __ add(count, O4, count);

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift, O3);
  }
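  // Shift-merge sketch for the loop above: when source and destination
  // differ in alignment mod 8, each aligned 8-byte store combines bits from
  // two consecutive aligned 8-byte loads:
  //
  //   next  = *aligned_from++;                           // ldx
  //   *to++ = (prev << left_shift) | (next >> right_shift);
  //   prev  = next;                                      // carried in O3
  //
  // with left_shift + right_shift == 64.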
  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source arrays
  //   to        - destination array aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do an 8-byte aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift, O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source arrays end address
  //   end_to    - destination array end address aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                     Register count, int count_dec,
                     Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do an 8-byte aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ andn(end_from, 7, end_from); // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }

  //
  // Generate stub for disjoint byte copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in a 32-bit VM
      // and 8-byte alignment in a 64-bit VM, so the alignment code below
      // matters only for the 32-bit VM.
      //
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
      __ BIND(L_skip_alignment);
    }
    if (!aligned) {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
    }

    // Both arrays are 8-byte aligned; copy 16 bytes at a time
    __ and3(count, 7, G4); // Save count
    __ srl(count, 3, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ ldub(from, offset, O3);
    __ deccc(count);
    __ stb(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->inc(offset);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  // Generate stub for conjoint byte copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align the ends of the arrays, since they may not be aligned even
      // when the arrays themselves are.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
      __ BIND(L_skip_alignment);
    }
    if (aligned) {
      // Both arrays are aligned to 8-bytes in 64-bits VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // in unaligned case.
      __ dec(count, 16);
    } else {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 16 elements (16 bytes) at a time
    __ align(OptoLoopAlignment);
    __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
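  // Dispatch sketch for the conjoint stubs: array_overlap_test branches to
  // the disjoint (forward-copying) stub whenever a forward copy cannot
  // clobber unread source bytes; otherwise the code above copies backward
  // from the array ends. In C terms:
  //
  //   if (to <= from || to >= from + byte_count)
  //     return disjoint_copy(from, to, count);   // nooverlap_target
  //   // else: copy backward, from high addresses to low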
  //
  // Generate stub for disjoint short copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4-byte alignment in a 32-bit VM
      // and 8-byte alignment in a 64-bit VM.
      //
    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
      __ andcc(to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(from, 0, O3);
      __ inc(from, 2);
      __ inc(to, 2);
      __ dec(count);
      __ sth(O3, to, -2);
      __ BIND(L_skip_alignment);

      // copy 2 elements to align 'to' on an 8 byte boundary
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(from, 0, O3);
      __ dec(count, 2);
      __ lduh(from, 2, O4);
      __ inc(from, 4);
      __ inc(to, 4);
      __ sth(O3, to, -4);
      __ sth(O4, to, -2);
      __ BIND(L_skip_alignment2);
    }
    if (!aligned) {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
    }

    // Both arrays are 8-byte aligned; copy 16 bytes at a time
    __ and3(count, 3, G4); // Save count
    __ srl(count, 2, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // restore count

    // copy 1 element at a time
    __ BIND(L_copy_2_bytes);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_2_bytes_loop);
    __ lduh(from, offset, O3);
    __ deccc(count);
    __ sth(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
    __ delayed()->inc(offset, 2);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }
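  // Strategy note for the element copies above: once both addresses are
  // 8-byte aligned, the stub reinterprets the arrays as longs and reuses
  // the long-copy core, e.g. for shorts (sketch):
  //
  //   leftover = count & 3;            // and3(count, 3, G4)
  //   count  >>= 2;                    // 4 shorts per 8-byte long
  //   copy_longs(from, to, count);     // generate_disjoint_long_copy_core()
  //   // then copy 'leftover' shorts one element at a time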
  //
  // Generate stub for disjoint short fill.  If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      to:    O0
  //      value: O1
  //      count: O2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char* name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register to        = O0;   // destination array address
    const Register value     = O1;   // fill value
    const Register count     = O2;   // elements count
    // O3 is used as a temp register

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
    Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 2;
        break;
      case T_SHORT:
        shift = 1;
        break;
      case T_INT:
        shift = 0;
        break;
      default: ShouldNotReachHere();
    }

    BLOCK_COMMENT("Entry:");

    if (t == T_BYTE) {
      // Zero extend value
      __ and3(value, 0xff, value);
      __ sllx(value, 8, O3);
      __ or3(value, O3, value);
    }
    if (t == T_SHORT) {
      // Zero extend value
      __ sllx(value, 48, value);
      __ srlx(value, 48, value);
    }
    if (t == T_BYTE || t == T_SHORT) {
      __ sllx(value, 16, O3);
      __ or3(value, O3, value);
    }

    __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
    __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
    __ delayed()->andcc(count, 1, G0);

    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
      // align destination address at a 4-byte address boundary
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays
        __ andcc(to, 1, G0);
        __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
        __ delayed()->nop();
        __ stb(value, to, 0);
        __ inc(to, 1);
        __ dec(count, 1);
        __ BIND(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays
      __ andcc(to, 2, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
      __ delayed()->nop();
      __ sth(value, to, 0);
      __ inc(to, 2);
      __ dec(count, 1 << (shift - 1));
      __ BIND(L_skip_align2);
    }
    if (!aligned) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
      __ delayed()->nop();
      __ stw(value, to, 0);
      __ inc(to, 4);
      __ dec(count, 1 << shift);
      __ BIND(L_fill_32_bytes);
    }

    if (t == T_INT) {
      // Zero extend value
      __ srl(value, 0, value);
    }
    if (t == T_BYTE || t == T_SHORT || t == T_INT) {
      __ sllx(value, 32, O3);
      __ or3(value, O3, value);
    }

    Label L_check_fill_8_bytes;
    // Fill 32-byte chunks
    __ subcc(count, 8 << shift, count);
    __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
    __ delayed()->nop();

    Label L_fill_32_bytes_loop, L_fill_4_bytes;
    __ align(16);
    __ BIND(L_fill_32_bytes_loop);

    __ stx(value, to, 0);
    __ stx(value, to, 8);
    __ stx(value, to, 16);
    __ stx(value, to, 24);

    __ subcc(count, 8 << shift, count);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
    __ delayed()->add(to, 32, to);

    __ BIND(L_check_fill_8_bytes);
    __ addcc(count, 8 << shift, count);
    __ brx(Assembler::zero, false, Assembler::pn, L_exit);
    __ delayed()->subcc(count, 1 << (shift + 1), count);
    __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
    __ delayed()->andcc(count, 1<<shift, G0);

    //
    // length is too short, just fill 8 bytes at a time
    //
    Label L_fill_8_bytes_loop;
    __ BIND(L_fill_8_bytes_loop);
    __ stx(value, to, 0);
    __ subcc(count, 1 << (shift + 1), count);
    __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
    __ delayed()->add(to, 8, to);

    // fill trailing 4 bytes
    __ andcc(count, 1<<shift, G0);  // in delay slot of branches
    if (t == T_INT) {
      __ BIND(L_fill_elements);
    }
    __ BIND(L_fill_4_bytes);
    __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
    if (t == T_BYTE || t == T_SHORT) {
      __ delayed()->andcc(count, 1<<(shift-1), G0);
    } else {
      __ delayed()->nop();
    }
    __ stw(value, to, 0);
    if (t == T_BYTE || t == T_SHORT) {
      __ inc(to, 4);
      // fill trailing 2 bytes
      __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
      __ BIND(L_fill_2_bytes);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
      __ delayed()->andcc(count, 1, count);
      __ sth(value, to, 0);
      if (t == T_BYTE) {
        __ inc(to, 2);
        // fill trailing byte
        __ andcc(count, 1, count);  // in delay slot of branches
        __ BIND(L_fill_byte);
        __ brx(Assembler::zero, false, Assembler::pt, L_exit);
        __ delayed()->nop();
        __ stb(value, to, 0);
      } else {
        __ BIND(L_fill_byte);
      }
    } else {
      __ BIND(L_fill_2_bytes);
    }
    __ BIND(L_exit);
    __ retl();
    __ delayed()->nop();

    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
    if (t == T_BYTE) {
      __ BIND(L_fill_elements);
      Label L_fill_2, L_fill_4;
      // in delay slot __ andcc(count, 1, G0);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
      __ delayed()->andcc(count, 2, G0);
      __ stb(value, to, 0);
      __ inc(to, 1);
      __ BIND(L_fill_2);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
      __ delayed()->andcc(count, 4, G0);
      __ stb(value, to, 0);
      __ stb(value, to, 1);
      __ inc(to, 2);
      __ BIND(L_fill_4);
      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
      __ delayed()->nop();
      __ stb(value, to, 0);
      __ stb(value, to, 1);
      __ stb(value, to, 2);
      __ retl();
      __ delayed()->stb(value, to, 3);
    }

    if (t == T_SHORT) {
      Label L_fill_2;
      __ BIND(L_fill_elements);
      // in delay slot __ andcc(count, 1, G0);
      __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
      __ delayed()->andcc(count, 2, G0);
      __ sth(value, to, 0);
      __ inc(to, 2);
      __ BIND(L_fill_2);
      __ brx(Assembler::zero, false, Assembler::pt, L_exit);
      __ delayed()->nop();
      __ sth(value, to, 0);
      __ retl();
      __ delayed()->sth(value, to, 2);
    }
    return start;
  }
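  // Value replication sketch for generate_fill(): the element value is
  // widened to 64 bits so the main loops can store 8 bytes per stx, e.g.
  // for T_BYTE:
  //
  //   v &= 0xff;
  //   v |= v << 8;    // 2 bytes
  //   v |= v << 16;   // 4 bytes
  //   v |= v << 32;   // 8 bytes; the unrolled loop stores 32 bytes per pass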
1764 1765 __ align(CodeEntryAlignment); 1766 StubCodeMark mark(this, "StubRoutines", name); 1767 address start = __ pc(); 1768 1769 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1770 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1771 1772 const Register from = O0; // source array address 1773 const Register to = O1; // destination array address 1774 const Register count = O2; // elements count 1775 const Register end_from = from; // source array end address 1776 const Register end_to = to; // destination array end address 1777 1778 const Register byte_count = O3; // bytes count to copy 1779 1780 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1781 1782 if (entry != NULL) { 1783 *entry = __ pc(); 1784 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1785 BLOCK_COMMENT("Entry:"); 1786 } 1787 1788 array_overlap_test(nooverlap_target, 1); 1789 1790 __ sllx(count, LogBytesPerShort, byte_count); 1791 __ add(to, byte_count, end_to); // offset after last copied element 1792 1793 // for short arrays, just do single element copy 1794 __ cmp(count, 11); // 8 + 3 (22 bytes) 1795 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1796 __ delayed()->add(from, byte_count, end_from); 1797 1798 { 1799 // Align the ends of the arrays since they might not be aligned even 1800 // when the arrays themselves are aligned. 1801 1802 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary 1803 __ andcc(end_to, 3, G0); 1804 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1805 __ delayed()->lduh(end_from, -2, O3); 1806 __ dec(end_from, 2); 1807 __ dec(end_to, 2); 1808 __ dec(count); 1809 __ sth(O3, end_to, 0); 1810 __ BIND(L_skip_alignment); 1811 1812 // copy 2 elements to align 'end_to' on an 8-byte boundary 1813 __ andcc(end_to, 7, G0); 1814 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1815 __ delayed()->lduh(end_from, -2, O3); 1816 __ dec(count, 2); 1817 __ lduh(end_from, -4, O4); 1818 __ dec(end_from, 4); 1819 __ dec(end_to, 4); 1820 __ sth(O3, end_to, 2); 1821 __ sth(O4, end_to, 0); 1822 __ BIND(L_skip_alignment2); 1823 } 1824 if (aligned) { 1825 // Both arrays are aligned to 8 bytes in the 64-bit VM. 1826 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 1827 // in the unaligned case. 1828 __ dec(count, 8); 1829 } else { 1830 // Copy with shift 16 bytes per iteration if arrays do not have 1831 // the same alignment mod 8, otherwise jump to the next 1832 // code for aligned copy (subtracting 8 from 'count' before the jump). 1833 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 1834 // Also jump over aligned copy after the copy with shift completed.
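    // Assumed shape of copy_16_bytes_backward_with_shift() (its body is
    // defined elsewhere in this file); shown here only as an illustration:
    // each iteration performs aligned 8-byte loads and merges shifted halves
    // of adjacent chunks so that every 8-byte store is aligned, mirroring
    // the forward-shifting variant spelled out in the conjoint int copy.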
1835 1836 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 1837 L_aligned_copy, L_copy_2_bytes); 1838 } 1839 // copy 4 elements (16 bytes) at a time 1840 __ align(OptoLoopAlignment); 1841 __ BIND(L_aligned_copy); 1842 __ dec(end_from, 16); 1843 __ ldx(end_from, 8, O3); 1844 __ ldx(end_from, 0, O4); 1845 __ dec(end_to, 16); 1846 __ deccc(count, 8); 1847 __ stx(O3, end_to, 8); 1848 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 1849 __ delayed()->stx(O4, end_to, 0); 1850 __ inc(count, 8); 1851 1852 // copy 1 element (2 bytes) at a time 1853 __ BIND(L_copy_2_bytes); 1854 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1855 __ BIND(L_copy_2_bytes_loop); 1856 __ dec(end_from, 2); 1857 __ dec(end_to, 2); 1858 __ lduh(end_from, 0, O4); 1859 __ deccc(count); 1860 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 1861 __ delayed()->sth(O4, end_to, 0); 1862 1863 __ BIND(L_exit); 1864 // O3, O4 are used as temp registers 1865 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1866 __ retl(); 1867 __ delayed()->mov(G0, O0); // return 0 1868 return start; 1869 } 1870 1871 // 1872 // Helper methods for generate_disjoint_int_copy_core() 1873 // 1874 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 1875 Label& L_loop, bool use_prefetch, bool use_bis) { 1876 1877 __ align(OptoLoopAlignment); 1878 __ BIND(L_loop); 1879 if (use_prefetch) { 1880 if (ArraycopySrcPrefetchDistance > 0) { 1881 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 1882 } 1883 if (ArraycopyDstPrefetchDistance > 0) { 1884 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 1885 } 1886 } 1887 __ ldx(from, 4, O4); 1888 __ ldx(from, 12, G4); 1889 __ inc(to, 16); 1890 __ inc(from, 16); 1891 __ deccc(count, 4); // Can we do next iteration after this one? 1892 1893 __ srlx(O4, 32, G3); 1894 __ bset(G3, O3); 1895 __ sllx(O4, 32, O4); 1896 __ srlx(G4, 32, G3); 1897 __ bset(G3, O4); 1898 if (use_bis) { 1899 __ stxa(O3, to, -16); 1900 __ stxa(O4, to, -8); 1901 } else { 1902 __ stx(O3, to, -16); 1903 __ stx(O4, to, -8); 1904 } 1905 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 1906 __ delayed()->sllx(G4, 32, O3); 1907 1908 } 1909 1910 // 1911 // Generate core code for disjoint int copy (and oop copy on 32-bit). 1912 // If "aligned" is true, the "from" and "to" addresses are assumed 1913 // to be heapword aligned. 1914 // 1915 // Arguments: 1916 // from: O0 1917 // to: O1 1918 // count: O2 treated as signed 1919 // 1920 void generate_disjoint_int_copy_core(bool aligned) { 1921 1922 Label L_skip_alignment, L_aligned_copy; 1923 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 1924 1925 const Register from = O0; // source array address 1926 const Register to = O1; // destination array address 1927 const Register count = O2; // elements count 1928 const Register offset = O5; // offset from start of arrays 1929 // O3, O4, G3, G4 are used as temp registers 1930 1931 // 'aligned' == true when it is known statically during compilation 1932 // of this arraycopy call site that both 'from' and 'to' addresses 1933 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 1934 // 1935 // Aligned arrays have 4-byte alignment in the 32-bit VM 1936 // and 8-byte alignment in the 64-bit VM.
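// Sketch of the unaligned path below (illustrative): copy at most one
// element so 'to' becomes 8-byte aligned; if 'from' then has the same
// alignment mod 8, use the plain 16-bytes-per-iteration copy, otherwise
// merge shifted halves of consecutive aligned 8-byte loads before storing.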
1937 // 1938 if (!aligned) { 1939 // The next check could be put under 'ifndef' since the code in 1940 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'. 1941 1942 // for short arrays, just do single element copy 1943 __ cmp(count, 5); // 4 + 1 (20 bytes) 1944 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 1945 __ delayed()->mov(G0, offset); 1946 1947 // copy 1 element to align 'to' on an 8-byte boundary 1948 __ andcc(to, 7, G0); 1949 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1950 __ delayed()->ld(from, 0, O3); 1951 __ inc(from, 4); 1952 __ inc(to, 4); 1953 __ dec(count); 1954 __ st(O3, to, -4); 1955 __ BIND(L_skip_alignment); 1956 1957 // if arrays have same alignment mod 8, do 4 elements copy 1958 __ andcc(from, 7, G0); 1959 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 1960 __ delayed()->ld(from, 0, O3); 1961 1962 // 1963 // Load 2 aligned 8-byte chunks and use one from the previous iteration 1964 // to form 2 aligned 8-byte chunks to store. 1965 // 1966 // copy_16_bytes_forward_with_shift() is not used here since this 1967 // code is more efficient. 1968 1969 // copy with shift 4 elements (16 bytes) at a time 1970 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 1971 __ sllx(O3, 32, O3); 1972 1973 disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop); 1974 1975 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 1976 __ delayed()->inc(count, 4); // restore 'count' 1977 1978 __ BIND(L_aligned_copy); 1979 } // !aligned 1980 1981 // copy 4 elements (16 bytes) at a time 1982 __ and3(count, 1, G4); // Save 1983 __ srl(count, 1, count); 1984 generate_disjoint_long_copy_core(aligned); 1985 __ mov(G4, count); // Restore 1986 1987 // copy 1 element at a time 1988 __ BIND(L_copy_4_bytes); 1989 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1990 __ BIND(L_copy_4_bytes_loop); 1991 __ ld(from, offset, O3); 1992 __ deccc(count); 1993 __ st(O3, to, offset); 1994 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 1995 __ delayed()->inc(offset, 4); 1996 __ BIND(L_exit); 1997 } 1998 1999 // 2000 // Generate stub for disjoint int copy. If "aligned" is true, the 2001 // "from" and "to" addresses are assumed to be heapword aligned. 2002 // 2003 // Arguments for generated stub: 2004 // from: O0 2005 // to: O1 2006 // count: O2 treated as signed 2007 // 2008 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 2009 __ align(CodeEntryAlignment); 2010 StubCodeMark mark(this, "StubRoutines", name); 2011 address start = __ pc(); 2012 2013 const Register count = O2; 2014 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2015 2016 if (entry != NULL) { 2017 *entry = __ pc(); 2018 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2019 BLOCK_COMMENT("Entry:"); 2020 } 2021 2022 generate_disjoint_int_copy_core(aligned); 2023 2024 // O3, O4 are used as temp registers 2025 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2026 __ retl(); 2027 __ delayed()->mov(G0, O0); // return 0 2028 return start; 2029 } 2030 2031 // 2032 // Generate core code for conjoint int copy (and oop copy on 32-bit). 2033 // If "aligned" is true, the "from" and "to" addresses are assumed 2034 // to be heapword aligned.
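// Equivalent C sketch of the net effect (illustrative only):
//
//   while (count-- > 0) to[count] = from[count];   // highest index first
//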
2035 // 2036 // Arguments: 2037 // from: O0 2038 // to: O1 2039 // count: O2 treated as signed 2040 // 2041 void generate_conjoint_int_copy_core(bool aligned) { 2042 // Do reverse copy. 2043 2044 Label L_skip_alignment, L_aligned_copy; 2045 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2046 2047 const Register from = O0; // source array address 2048 const Register to = O1; // destination array address 2049 const Register count = O2; // elements count 2050 const Register end_from = from; // source array end address 2051 const Register end_to = to; // destination array end address 2052 // O3, O4, O5, G3 are used as temp registers 2053 2054 const Register byte_count = O3; // bytes count to copy 2055 2056 __ sllx(count, LogBytesPerInt, byte_count); 2057 __ add(to, byte_count, end_to); // offset after last copied element 2058 2059 __ cmp(count, 5); // for short arrays, just do single element copy 2060 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2061 __ delayed()->add(from, byte_count, end_from); 2062 2063 // copy 1 element to align 'to' on an 8-byte boundary 2064 __ andcc(end_to, 7, G0); 2065 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2066 __ delayed()->nop(); 2067 __ dec(count); 2068 __ dec(end_from, 4); 2069 __ dec(end_to, 4); 2070 __ ld(end_from, 0, O4); 2071 __ st(O4, end_to, 0); 2072 __ BIND(L_skip_alignment); 2073 2074 // Check if 'end_from' and 'end_to' have the same alignment. 2075 __ andcc(end_from, 7, G0); 2076 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2077 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4 2078 2079 // copy with shift 4 elements (16 bytes) at a time 2080 // 2081 // Load 2 aligned 8-byte chunks and use one from the previous iteration 2082 // to form 2 aligned 8-byte chunks to store. 2083 // 2084 __ ldx(end_from, -4, O3); 2085 __ align(OptoLoopAlignment); 2086 __ BIND(L_copy_16_bytes); 2087 __ ldx(end_from, -12, O4); 2088 __ deccc(count, 4); 2089 __ ldx(end_from, -20, O5); 2090 __ dec(end_to, 16); 2091 __ dec(end_from, 16); 2092 __ srlx(O3, 32, O3); 2093 __ sllx(O4, 32, G3); 2094 __ bset(G3, O3); 2095 __ stx(O3, end_to, 8); 2096 __ srlx(O4, 32, O4); 2097 __ sllx(O5, 32, G3); 2098 __ bset(O4, G3); 2099 __ stx(G3, end_to, 0); 2100 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2101 __ delayed()->mov(O5, O3); 2102 2103 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2104 __ delayed()->inc(count, 4); 2105 2106 // copy 4 elements (16 bytes) at a time 2107 __ align(OptoLoopAlignment); 2108 __ BIND(L_aligned_copy); 2109 __ dec(end_from, 16); 2110 __ ldx(end_from, 8, O3); 2111 __ ldx(end_from, 0, O4); 2112 __ dec(end_to, 16); 2113 __ deccc(count, 4); 2114 __ stx(O3, end_to, 8); 2115 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2116 __ delayed()->stx(O4, end_to, 0); 2117 __ inc(count, 4); 2118 2119 // copy 1 element (4 bytes) at a time 2120 __ BIND(L_copy_4_bytes); 2121 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2122 __ BIND(L_copy_4_bytes_loop); 2123 __ dec(end_from, 4); 2124 __ dec(end_to, 4); 2125 __ ld(end_from, 0, O4); 2126 __ deccc(count); 2127 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 2128 __ delayed()->st(O4, end_to, 0); 2129 __ BIND(L_exit); 2130 } 2131 2132 // 2133 // Generate stub for conjoint int copy. If "aligned" is true, the 2134 // "from" and "to" addresses are assumed to be heapword aligned.
2135 // 2136 // Arguments for generated stub: 2137 // from: O0 2138 // to: O1 2139 // count: O2 treated as signed 2140 // 2141 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 2142 address *entry, const char *name) { 2143 __ align(CodeEntryAlignment); 2144 StubCodeMark mark(this, "StubRoutines", name); 2145 address start = __ pc(); 2146 2147 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2148 2149 if (entry != NULL) { 2150 *entry = __ pc(); 2151 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2152 BLOCK_COMMENT("Entry:"); 2153 } 2154 2155 array_overlap_test(nooverlap_target, 2); 2156 2157 generate_conjoint_int_copy_core(aligned); 2158 2159 // O3, O4 are used as temp registers 2160 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2161 __ retl(); 2162 __ delayed()->mov(G0, O0); // return 0 2163 return start; 2164 } 2165 2166 // 2167 // Helper methods for generate_disjoint_long_copy_core() 2168 // 2169 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2170 Label& L_loop, bool use_prefetch, bool use_bis) { 2171 __ align(OptoLoopAlignment); 2172 __ BIND(L_loop); 2173 for (int off = 0; off < 64; off += 16) { 2174 if (use_prefetch && (off & 31) == 0) { 2175 if (ArraycopySrcPrefetchDistance > 0) { 2176 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 2177 } 2178 if (ArraycopyDstPrefetchDistance > 0) { 2179 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 2180 } 2181 } 2182 __ ldx(from, off+0, O4); 2183 __ ldx(from, off+8, O5); 2184 if (use_bis) { 2185 __ stxa(O4, to, off+0); 2186 __ stxa(O5, to, off+8); 2187 } else { 2188 __ stx(O4, to, off+0); 2189 __ stx(O5, to, off+8); 2190 } 2191 } 2192 __ deccc(count, 8); 2193 __ inc(from, 64); 2194 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2195 __ delayed()->inc(to, 64); 2196 } 2197 2198 // 2199 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2200 // "aligned" is ignored, because we must make the stronger 2201 // assumption that both addresses are always 64-bit aligned. 
2202 // 2203 // Arguments: 2204 // from: O0 2205 // to: O1 2206 // count: O2 treated as signed 2207 // 2208 // count -= 2; 2209 // if ( count >= 0 ) { // >= 2 elements 2210 // if ( count > 6) { // >= 8 elements 2211 // count -= 6; // original count - 8 2212 // do { 2213 // copy_8_elements; 2214 // count -= 8; 2215 // } while ( count >= 0 ); 2216 // count += 6; 2217 // } 2218 // if ( count >= 0 ) { // >= 2 elements 2219 // do { 2220 // copy_2_elements; 2221 // } while ( (count=count-2) >= 0 ); 2222 // } 2223 // } 2224 // count += 2; 2225 // if ( count != 0 ) { // 1 element left 2226 // copy_1_element; 2227 // } 2228 // 2229 void generate_disjoint_long_copy_core(bool aligned) { 2230 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2231 const Register from = O0; // source array address 2232 const Register to = O1; // destination array address 2233 const Register count = O2; // elements count 2234 const Register offset0 = O4; // element offset 2235 const Register offset8 = O5; // next element offset 2236 2237 __ deccc(count, 2); 2238 __ mov(G0, offset0); // offset from start of arrays (0) 2239 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2240 __ delayed()->add(offset0, 8, offset8); 2241 2242 // Copy by 64 bytes chunks 2243 2244 const Register from64 = O3; // source address 2245 const Register to64 = G3; // destination address 2246 __ subcc(count, 6, O3); 2247 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2248 __ delayed()->mov(to, to64); 2249 // Now we can use O4(offset0), O5(offset8) as temps 2250 __ mov(O3, count); 2251 // count >= 0 (original count - 8) 2252 __ mov(from, from64); 2253 2254 disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop); 2255 2256 // Restore O4(offset0), O5(offset8) 2257 __ sub(from64, from, offset0); 2258 __ inccc(count, 6); // restore count 2259 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2260 __ delayed()->add(offset0, 8, offset8); 2261 2262 // Copy by 16 bytes chunks 2263 __ align(OptoLoopAlignment); 2264 __ BIND(L_copy_16_bytes); 2265 __ ldx(from, offset0, O3); 2266 __ ldx(from, offset8, G3); 2267 __ deccc(count, 2); 2268 __ stx(O3, to, offset0); 2269 __ inc(offset0, 16); 2270 __ stx(G3, to, offset8); 2271 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2272 __ delayed()->inc(offset8, 16); 2273 2274 // Copy last 8 bytes 2275 __ BIND(L_copy_8_bytes); 2276 __ inccc(count, 2); 2277 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2278 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 2279 __ ldx(from, offset0, O3); 2280 __ stx(O3, to, offset0); 2281 __ BIND(L_exit); 2282 } 2283 2284 // 2285 // Generate stub for disjoint long copy. 2286 // "aligned" is ignored, because we must make the stronger 2287 // assumption that both addresses are always 64-bit aligned. 2288 // 2289 // Arguments for generated stub: 2290 // from: O0 2291 // to: O1 2292 // count: O2 treated as signed 2293 // 2294 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2295 __ align(CodeEntryAlignment); 2296 StubCodeMark mark(this, "StubRoutines", name); 2297 address start = __ pc(); 2298 2299 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 
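    // Note: a 'clean int' means the upper 32 bits of the 64-bit register are
    // the sign extension of the lower 32 bits, so 'count' can be used
    // directly in the 64-bit address arithmetic below.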
2300 2301 if (entry != NULL) { 2302 *entry = __ pc(); 2303 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2304 BLOCK_COMMENT("Entry:"); 2305 } 2306 2307 generate_disjoint_long_copy_core(aligned); 2308 2309 // O3, O4 are used as temp registers 2310 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2311 __ retl(); 2312 __ delayed()->mov(G0, O0); // return 0 2313 return start; 2314 } 2315 2316 // 2317 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2318 // "aligned" is ignored, because we must make the stronger 2319 // assumption that both addresses are always 64-bit aligned. 2320 // 2321 // Arguments: 2322 // from: O0 2323 // to: O1 2324 // count: O2 treated as signed 2325 // 2326 void generate_conjoint_long_copy_core(bool aligned) { 2327 // Do reverse copy. 2328 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2329 const Register from = O0; // source array address 2330 const Register to = O1; // destination array address 2331 const Register count = O2; // elements count 2332 const Register offset8 = O4; // element offset 2333 const Register offset0 = O5; // previous element offset 2334 2335 __ subcc(count, 1, count); 2336 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2337 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2338 __ sub(offset8, 8, offset0); 2339 __ align(OptoLoopAlignment); 2340 __ BIND(L_copy_16_bytes); 2341 __ ldx(from, offset8, O2); 2342 __ ldx(from, offset0, O3); 2343 __ stx(O2, to, offset8); 2344 __ deccc(offset8, 16); // use offset8 as counter 2345 __ stx(O3, to, offset0); 2346 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2347 __ delayed()->dec(offset0, 16); 2348 2349 __ BIND(L_copy_8_bytes); 2350 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2351 __ delayed()->nop(); 2352 __ ldx(from, 0, O3); 2353 __ stx(O3, to, 0); 2354 __ BIND(L_exit); 2355 } 2356 2357 // Generate stub for conjoint long copy. 2358 // "aligned" is ignored, because we must make the stronger 2359 // assumption that both addresses are always 64-bit aligned. 2360 // 2361 // Arguments for generated stub: 2362 // from: O0 2363 // to: O1 2364 // count: O2 treated as signed 2365 // 2366 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2367 address *entry, const char *name) { 2368 __ align(CodeEntryAlignment); 2369 StubCodeMark mark(this, "StubRoutines", name); 2370 address start = __ pc(); 2371 2372 assert(aligned, "Should always be aligned"); 2373 2374 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2375 2376 if (entry != NULL) { 2377 *entry = __ pc(); 2378 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2379 BLOCK_COMMENT("Entry:"); 2380 } 2381 2382 array_overlap_test(nooverlap_target, 3); 2383 2384 generate_conjoint_long_copy_core(aligned); 2385 2386 // O3, O4 are used as temp registers 2387 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2388 __ retl(); 2389 __ delayed()->mov(G0, O0); // return 0 2390 return start; 2391 } 2392 2393 // Generate stub for disjoint oop copy. If "aligned" is true, the 2394 // "from" and "to" addresses are assumed to be heapword aligned. 
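// The generated stub brackets a primitive copy core with GC barriers,
// roughly (illustrative ordering; the barrier helpers are defined elsewhere
// in this file):
//
//   load_ref_array_barrier(src or dst, count);
//   write_ref_array_pre_barrier(dst, count, dest_uninitialized);
//   copy as ints (compressed oops) or longs (uncompressed);
//   write_ref_array_post_barrier(dst, count);   // e.g. card marks
//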
2395 // 2396 // Arguments for generated stub: 2397 // from: O0 2398 // to: O1 2399 // count: O2 treated as signed 2400 // 2401 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2402 bool dest_uninitialized = false) { 2403 2404 const Register from = O0; // source array address 2405 const Register to = O1; // destination array address 2406 const Register count = O2; // elements count 2407 2408 __ align(CodeEntryAlignment); 2409 StubCodeMark mark(this, "StubRoutines", name); 2410 address start = __ pc(); 2411 2412 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2413 2414 if (entry != NULL) { 2415 *entry = __ pc(); 2416 // caller can pass a 64-bit byte count here 2417 BLOCK_COMMENT("Entry:"); 2418 } 2419 2420 // save arguments for barrier generation 2421 if (UseZGC) { 2422 __ mov(from, G1); 2423 } else { 2424 __ mov(to, G1); 2425 } 2426 __ mov(count, G5); 2427 gen_load_ref_array_barrier(G1, G5); 2428 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2429 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2430 if (UseCompressedOops) { 2431 generate_disjoint_int_copy_core(aligned); 2432 } else { 2433 generate_disjoint_long_copy_core(aligned); 2434 } 2435 // O0 is used as temp register 2436 gen_write_ref_array_post_barrier(G1, G5, O0); 2437 2438 // O3, O4 are used as temp registers 2439 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2440 __ retl(); 2441 __ delayed()->mov(G0, O0); // return 0 2442 return start; 2443 } 2444 2445 // Generate stub for conjoint oop copy. If "aligned" is true, the 2446 // "from" and "to" addresses are assumed to be heapword aligned. 2447 // 2448 // Arguments for generated stub: 2449 // from: O0 2450 // to: O1 2451 // count: O2 treated as signed 2452 // 2453 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2454 address *entry, const char *name, 2455 bool dest_uninitialized = false) { 2456 2457 const Register from = O0; // source array address 2458 const Register to = O1; // destination array address 2459 const Register count = O2; // elements count 2460 2461 __ align(CodeEntryAlignment); 2462 StubCodeMark mark(this, "StubRoutines", name); 2463 address start = __ pc(); 2464 2465 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2466 2467 if (entry != NULL) { 2468 *entry = __ pc(); 2469 // caller can pass a 64-bit byte count here 2470 BLOCK_COMMENT("Entry:"); 2471 } 2472 2473 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2474 2475 // save arguments for barrier generation 2476 if (UseZGC) { 2477 __ mov(from, G1); 2478 } else { 2479 __ mov(to, G1); 2480 } 2481 __ mov(count, G5); 2482 gen_load_ref_array_barrier(G1, G5); 2483 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2484 2485 if (UseCompressedOops) { 2486 generate_conjoint_int_copy_core(aligned); 2487 } else { 2488 generate_conjoint_long_copy_core(aligned); 2489 } 2490 2491 // O0 is used as temp register 2492 gen_write_ref_array_post_barrier(G1, G5, O0); 2493 2494 // O3, O4 are used as temp registers 2495 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2496 __ retl(); 2497 __ delayed()->mov(G0, O0); // return 0 2498 return start; 2499 } 2500 2501 2502 // Helper for generating a dynamic type check. 2503 // Smashes only the given temp registers. 
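// The fast path uses check_klass_subtype_fast_path() (super type cache and
// primary supers); only an inconclusive answer falls into the slow path,
// which saves a frame, runs the full subtype search, and restores on exit.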
2504 void generate_type_check(Register sub_klass, 2505 Register super_check_offset, 2506 Register super_klass, 2507 Register temp, 2508 Label& L_success) { 2509 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2510 2511 BLOCK_COMMENT("type_check:"); 2512 2513 Label L_miss, L_pop_to_miss; 2514 2515 assert_clean_int(super_check_offset, temp); 2516 2517 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2518 &L_success, &L_miss, NULL, 2519 super_check_offset); 2520 2521 BLOCK_COMMENT("type_check_slow_path:"); 2522 __ save_frame(0); 2523 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2524 super_klass->after_save(), 2525 L0, L1, L2, L4, 2526 NULL, &L_pop_to_miss); 2527 __ ba(L_success); 2528 __ delayed()->restore(); 2529 2530 __ bind(L_pop_to_miss); 2531 __ restore(); 2532 2533 // Fall through on failure! 2534 __ BIND(L_miss); 2535 } 2536 2537 2538 // Generate stub for checked oop copy. 2539 // 2540 // Arguments for generated stub: 2541 // from: O0 2542 // to: O1 2543 // count: O2 treated as signed 2544 // ckoff: O3 (super_check_offset) 2545 // ckval: O4 (super_klass) 2546 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2547 // 2548 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2549 2550 const Register O0_from = O0; // source array address 2551 const Register O1_to = O1; // destination array address 2552 const Register O2_count = O2; // elements count 2553 const Register O3_ckoff = O3; // super_check_offset 2554 const Register O4_ckval = O4; // super_klass 2555 2556 const Register O5_offset = O5; // loop var, with stride wordSize 2557 const Register G1_remain = G1; // loop var, with stride -1 2558 const Register G3_oop = G3; // actual oop copied 2559 const Register G4_klass = G4; // oop._klass 2560 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2561 2562 __ align(CodeEntryAlignment); 2563 StubCodeMark mark(this, "StubRoutines", name); 2564 address start = __ pc(); 2565 2566 #ifdef ASSERT 2567 // We sometimes save a frame (see generate_type_check below). 2568 // If this will cause trouble, let's fail now instead of later. 2569 __ save_frame(0); 2570 __ restore(); 2571 #endif 2572 2573 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 2574 2575 #ifdef ASSERT 2576 // caller guarantees that the arrays really are different 2577 // otherwise, we would have to make conjoint checks 2578 { Label L; 2579 __ mov(O3, G1); // spill: overlap test smashes O3 2580 __ mov(O4, G4); // spill: overlap test smashes O4 2581 array_overlap_test(L, LogBytesPerHeapOop); 2582 __ stop("checkcast_copy within a single array"); 2583 __ bind(L); 2584 __ mov(G1, O3); 2585 __ mov(G4, O4); 2586 } 2587 #endif //ASSERT 2588 2589 if (entry != NULL) { 2590 *entry = __ pc(); 2591 // caller can pass a 64-bit byte count here (from generic stub) 2592 BLOCK_COMMENT("Entry:"); 2593 } 2594 gen_load_ref_array_barrier(O0_from, O2_count); 2595 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 2596 2597 Label load_element, store_element, do_card_marks, fail, done; 2598 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2599 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2600 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2601 2602 // Empty array: Nothing to do. 
2603 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2604 __ retl(); 2605 __ delayed()->set(0, O0); // return 0 on (trivial) success 2606 2607 // ======== begin loop ======== 2608 // (Loop is rotated; its entry is load_element.) 2609 // Loop variables: 2610 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2611 // (G1 = len; G1 != 0; G1--) --- number of oops *remaining* 2612 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2613 __ align(OptoLoopAlignment); 2614 2615 __ BIND(store_element); 2616 __ deccc(G1_remain); // decrement the count 2617 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2618 __ inc(O5_offset, heapOopSize); // step to next offset 2619 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 2620 __ delayed()->set(0, O0); // return 0 on success 2621 2622 // ======== loop entry is here ======== 2623 __ BIND(load_element); 2624 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2625 __ br_null_short(G3_oop, Assembler::pt, store_element); 2626 2627 __ load_klass(G3_oop, G4_klass); // query the object klass 2628 2629 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2630 // branch to this on success: 2631 store_element); 2632 // ======== end loop ======== 2633 2634 // It was a real error; we must depend on the caller to finish the job. 2635 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2636 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2637 // and report their number to the caller. 2638 __ BIND(fail); 2639 __ subcc(O2_count, G1_remain, O2_count); 2640 __ brx(Assembler::zero, false, Assembler::pt, done); 2641 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2642 2643 __ BIND(do_card_marks); 2644 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 2645 2646 __ BIND(done); 2647 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2648 __ retl(); 2649 __ delayed()->nop(); // return value in O0 2650 2651 return start; 2652 } 2653 2654 2655 // Generate 'unsafe' array copy stub 2656 // Though just as safe as the other stubs, it takes an unscaled 2657 // size_t argument instead of an element count. 2658 // 2659 // Arguments for generated stub: 2660 // from: O0 2661 // to: O1 2662 // count: O2 byte count, treated as ssize_t, can be zero 2663 // 2664 // Examines the alignment of the operands and dispatches 2665 // to a long, int, short, or byte copy loop.
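// C sketch of the dispatch performed below (illustrative):
//
//   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
//   if      ((bits & (BytesPerLong-1))  == 0) long_copy (from, to, count >> LogBytesPerLong);
//   else if ((bits & (BytesPerInt-1))   == 0) int_copy  (from, to, count >> LogBytesPerInt);
//   else if ((bits & (BytesPerShort-1)) == 0) short_copy(from, to, count >> LogBytesPerShort);
//   else                                      byte_copy (from, to, count);
//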
2666 // 2667 address generate_unsafe_copy(const char* name, 2668 address byte_copy_entry, 2669 address short_copy_entry, 2670 address int_copy_entry, 2671 address long_copy_entry) { 2672 2673 const Register O0_from = O0; // source array address 2674 const Register O1_to = O1; // destination array address 2675 const Register O2_count = O2; // elements count 2676 2677 const Register G1_bits = G1; // test copy of low bits 2678 2679 __ align(CodeEntryAlignment); 2680 StubCodeMark mark(this, "StubRoutines", name); 2681 address start = __ pc(); 2682 2683 // bump this on entry, not on exit: 2684 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2685 2686 __ or3(O0_from, O1_to, G1_bits); 2687 __ or3(O2_count, G1_bits, G1_bits); 2688 2689 __ btst(BytesPerLong-1, G1_bits); 2690 __ br(Assembler::zero, true, Assembler::pt, 2691 long_copy_entry, relocInfo::runtime_call_type); 2692 // scale the count on the way out: 2693 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2694 2695 __ btst(BytesPerInt-1, G1_bits); 2696 __ br(Assembler::zero, true, Assembler::pt, 2697 int_copy_entry, relocInfo::runtime_call_type); 2698 // scale the count on the way out: 2699 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2700 2701 __ btst(BytesPerShort-1, G1_bits); 2702 __ br(Assembler::zero, true, Assembler::pt, 2703 short_copy_entry, relocInfo::runtime_call_type); 2704 // scale the count on the way out: 2705 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2706 2707 __ br(Assembler::always, false, Assembler::pt, 2708 byte_copy_entry, relocInfo::runtime_call_type); 2709 __ delayed()->nop(); 2710 2711 return start; 2712 } 2713 2714 2715 // Perform range checks on the proposed arraycopy. 2716 // Kills the two temps, but nothing else. 2717 // Also, clean the sign bits of src_pos and dst_pos. 2718 void arraycopy_range_checks(Register src, // source array oop (O0) 2719 Register src_pos, // source position (O1) 2720 Register dst, // destination array oop (O2) 2721 Register dst_pos, // destination position (O3) 2722 Register length, // length of copy (O4) 2723 Register temp1, Register temp2, 2724 Label& L_failed) { 2725 BLOCK_COMMENT("arraycopy_range_checks:"); 2726 2727 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2728 2729 const Register array_length = temp1; // scratch 2730 const Register end_pos = temp2; // scratch 2731 2732 // Note: This next instruction may be in the delay slot of a branch: 2733 __ add(length, src_pos, end_pos); // src_pos + length 2734 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2735 __ cmp(end_pos, array_length); 2736 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2737 2738 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2739 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2740 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2741 __ cmp(end_pos, array_length); 2742 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2743 2744 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2745 // Move with sign extension can be used since they are positive.
2746 __ delayed()->signx(src_pos, src_pos); 2747 __ signx(dst_pos, dst_pos); 2748 2749 BLOCK_COMMENT("arraycopy_range_checks done"); 2750 } 2751 2752 2753 // 2754 // Generate generic array copy stubs 2755 // 2756 // Input: 2757 // O0 - src oop 2758 // O1 - src_pos 2759 // O2 - dst oop 2760 // O3 - dst_pos 2761 // O4 - element count 2762 // 2763 // Output: 2764 // O0 == 0 - success 2765 // O0 == -1 - need to call System.arraycopy 2766 // 2767 address generate_generic_copy(const char *name, 2768 address entry_jbyte_arraycopy, 2769 address entry_jshort_arraycopy, 2770 address entry_jint_arraycopy, 2771 address entry_oop_arraycopy, 2772 address entry_jlong_arraycopy, 2773 address entry_checkcast_arraycopy) { 2774 Label L_failed, L_objArray; 2775 2776 // Input registers 2777 const Register src = O0; // source array oop 2778 const Register src_pos = O1; // source position 2779 const Register dst = O2; // destination array oop 2780 const Register dst_pos = O3; // destination position 2781 const Register length = O4; // elements count 2782 2783 // registers used as temp 2784 const Register G3_src_klass = G3; // source array klass 2785 const Register G4_dst_klass = G4; // destination array klass 2786 const Register G5_lh = G5; // layout helper 2787 const Register O5_temp = O5; 2788 2789 __ align(CodeEntryAlignment); 2790 StubCodeMark mark(this, "StubRoutines", name); 2791 address start = __ pc(); 2792 2793 // bump this on entry, not on exit: 2794 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2795 2796 // In principle, the int arguments could be dirty. 2797 //assert_clean_int(src_pos, G1); 2798 //assert_clean_int(dst_pos, G1); 2799 //assert_clean_int(length, G1); 2800 2801 //----------------------------------------------------------------------- 2802 // Assembler stubs will be used for this call to arraycopy 2803 // if the following conditions are met: 2804 // 2805 // (1) src and dst must not be null. 2806 // (2) src_pos must not be negative. 2807 // (3) dst_pos must not be negative. 2808 // (4) length must not be negative. 2809 // (5) src klass and dst klass should be the same and not NULL. 2810 // (6) src and dst should be arrays. 2811 // (7) src_pos + length must not exceed length of src. 2812 // (8) dst_pos + length must not exceed length of dst. 2813 BLOCK_COMMENT("arraycopy initial argument checks"); 2814 2815 // if (src == NULL) return -1; 2816 __ br_null(src, false, Assembler::pn, L_failed); 2817 2818 // if (src_pos < 0) return -1; 2819 __ delayed()->tst(src_pos); 2820 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2821 __ delayed()->nop(); 2822 2823 // if (dst == NULL) return -1; 2824 __ br_null(dst, false, Assembler::pn, L_failed); 2825 2826 // if (dst_pos < 0) return -1; 2827 __ delayed()->tst(dst_pos); 2828 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2829 2830 // if (length < 0) return -1; 2831 __ delayed()->tst(length); 2832 __ br(Assembler::negative, false, Assembler::pn, L_failed); 2833 2834 BLOCK_COMMENT("arraycopy argument klass checks"); 2835 // get src->klass() 2836 if (UseCompressedClassPointers) { 2837 __ delayed()->nop(); // load_klass does not fit in the delay slot
2838 __ load_klass(src, G3_src_klass); 2839 } else { 2840 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 2841 } 2842 2843 #ifdef ASSERT 2844 // assert(src->klass() != NULL); 2845 BLOCK_COMMENT("assert klasses not null"); 2846 { Label L_a, L_b; 2847 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 2848 __ bind(L_a); 2849 __ stop("broken null klass"); 2850 __ bind(L_b); 2851 __ load_klass(dst, G4_dst_klass); 2852 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 2853 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 2854 BLOCK_COMMENT("assert done"); 2855 } 2856 #endif 2857 2858 // Load layout helper 2859 // 2860 // |array_tag| | header_size | element_type | |log2_element_size| 2861 // 32 30 24 16 8 2 0 2862 // 2863 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2864 // 2865 2866 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2867 2868 // Load 32-bit signed value. Use br() instruction with it to check icc. 2869 __ lduw(G3_src_klass, lh_offset, G5_lh); 2870 2871 if (UseCompressedClassPointers) { 2872 __ load_klass(dst, G4_dst_klass); 2873 } 2874 // Handle objArrays completely differently... 2875 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2876 __ set(objArray_lh, O5_temp); 2877 __ cmp(G5_lh, O5_temp); 2878 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 2879 if (UseCompressedClassPointers) { 2880 __ delayed()->nop(); 2881 } else { 2882 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 2883 } 2884 2885 // if (src->klass() != dst->klass()) return -1; 2886 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 2887 2888 // if (!src->is_Array()) return -1; 2889 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 2890 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 2891 2892 // At this point, it is known to be a typeArray (array_tag 0x3).
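    // The layout helper is decoded below roughly as (illustrative):
    //
    //   header_size    = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   log2_elem_size =  lh & _lh_log2_element_size_mask;
    //   addr           =  oop + header_size + (pos << log2_elem_size);
    //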
2893 #ifdef ASSERT 2894 __ delayed()->nop(); 2895 { Label L; 2896 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2897 __ set(lh_prim_tag_in_place, O5_temp); 2898 __ cmp(G5_lh, O5_temp); 2899 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 2900 __ delayed()->nop(); 2901 __ stop("must be a primitive array"); 2902 __ bind(L); 2903 } 2904 #else 2905 __ delayed(); // match next insn to prev branch 2906 #endif 2907 2908 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2909 O5_temp, G4_dst_klass, L_failed); 2910 2911 // TypeArrayKlass 2912 // 2913 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2914 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2915 // 2916 2917 const Register G4_offset = G4_dst_klass; // array offset 2918 const Register G3_elsize = G3_src_klass; // log2 element size 2919 2920 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 2921 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 2922 __ add(src, G4_offset, src); // src array offset 2923 __ add(dst, G4_offset, dst); // dst array offset 2924 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 2925 2926 // next registers should be set before the jump to corresponding stub 2927 const Register from = O0; // source array address 2928 const Register to = O1; // destination array address 2929 const Register count = O2; // elements count 2930 2931 // 'from', 'to', 'count' registers should be set in this order 2932 // since they are the same as 'src', 'src_pos', 'dst'. 2933 2934 BLOCK_COMMENT("scale indexes to element size"); 2935 __ sll_ptr(src_pos, G3_elsize, src_pos); 2936 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 2937 __ add(src, src_pos, from); // src_addr 2938 __ add(dst, dst_pos, to); // dst_addr 2939 2940 BLOCK_COMMENT("choose copy loop based on element size"); 2941 __ cmp(G3_elsize, 0); 2942 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 2943 __ delayed()->signx(length, count); // length 2944 2945 __ cmp(G3_elsize, LogBytesPerShort); 2946 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 2947 __ delayed()->signx(length, count); // length 2948 2949 __ cmp(G3_elsize, LogBytesPerInt); 2950 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 2951 __ delayed()->signx(length, count); // length 2952 #ifdef ASSERT 2953 { Label L; 2954 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 2955 __ stop("must be long copy, but elsize is wrong"); 2956 __ bind(L); 2957 } 2958 #endif 2959 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 2960 __ delayed()->signx(length, count); // length 2961 2962 // ObjArrayKlass 2963 __ BIND(L_objArray); 2964 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 2965 2966 Label L_plain_copy, L_checkcast_copy; 2967 // test array classes for subtyping 2968 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 2969 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 2970 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 2971 2972 // Identically typed arrays can be copied without element-wise checks. 
2973 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2974 O5_temp, G5_lh, L_failed); 2975 2976 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 2977 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 2978 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 2979 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 2980 __ add(src, src_pos, from); // src_addr 2981 __ add(dst, dst_pos, to); // dst_addr 2982 __ BIND(L_plain_copy); 2983 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 2984 __ delayed()->signx(length, count); // length 2985 2986 __ BIND(L_checkcast_copy); 2987 // live at this point: G3_src_klass, G4_dst_klass 2988 { 2989 // Before looking at dst.length, make sure dst is also an objArray. 2990 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 2991 __ cmp(G5_lh, O5_temp); 2992 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 2993 2994 // It is safe to examine both src.length and dst.length. 2995 __ delayed(); // match next insn to prev branch 2996 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2997 O5_temp, G5_lh, L_failed); 2998 2999 // Marshal the base address arguments now, freeing registers. 3000 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3001 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3002 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3003 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3004 __ add(src, src_pos, from); // src_addr 3005 __ add(dst, dst_pos, to); // dst_addr 3006 __ signx(length, count); // length (reloaded) 3007 3008 Register sco_temp = O3; // this register is free now 3009 assert_different_registers(from, to, count, sco_temp, 3010 G4_dst_klass, G3_src_klass); 3011 3012 // Generate the type check. 3013 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3014 __ lduw(G4_dst_klass, sco_offset, sco_temp); 3015 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 3016 O5_temp, L_plain_copy); 3017 3018 // Fetch destination element klass from the ObjArrayKlass header. 3019 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3020 3021 // the checkcast_copy loop needs two extra arguments: 3022 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 3023 // lduw(O4, sco_offset, O3); // sco of elem klass 3024 3025 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 3026 __ delayed()->lduw(O4, sco_offset, O3); 3027 } 3028 3029 __ BIND(L_failed); 3030 __ retl(); 3031 __ delayed()->sub(G0, 1, O0); // return -1 3032 return start; 3033 } 3034 3035 // 3036 // Generate stub for heap zeroing. 3037 // "to" address is aligned to jlong (8 bytes). 
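// Roughly (illustrative): the HeapWord count is scaled to bytes and the
// region is cleared with block-initializing stores:
//
//   count <<= LogHeapWordSize;     // HeapWords -> bytes
//   bis_zeroing(to, count);        // BIS-based clear, defined in MacroAssembler
//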
3038 // 3039 // Arguments for generated stub: 3040 // to: O0 3041 // count: O1 treated as signed (count of HeapWords) 3042 // count could be 0 3043 // 3044 address generate_zero_aligned_words(const char* name) { 3045 __ align(CodeEntryAlignment); 3046 StubCodeMark mark(this, "StubRoutines", name); 3047 address start = __ pc(); 3048 3049 const Register to = O0; // destination array address 3050 const Register count = O1; // HeapWords count 3051 const Register temp = O2; // scratch 3052 3053 Label Ldone; 3054 __ sllx(count, LogHeapWordSize, count); // to bytes count 3055 // Use BIS for zeroing 3056 __ bis_zeroing(to, count, temp, Ldone); 3057 __ bind(Ldone); 3058 __ retl(); 3059 __ delayed()->nop(); 3060 return start; 3061 } 3062 3063 void generate_arraycopy_stubs() { 3064 address entry; 3065 address entry_jbyte_arraycopy; 3066 address entry_jshort_arraycopy; 3067 address entry_jint_arraycopy; 3068 address entry_oop_arraycopy; 3069 address entry_jlong_arraycopy; 3070 address entry_checkcast_arraycopy; 3071 3072 //*** jbyte 3073 // Always need aligned and unaligned versions 3074 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3075 "jbyte_disjoint_arraycopy"); 3076 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 3077 &entry_jbyte_arraycopy, 3078 "jbyte_arraycopy"); 3079 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 3080 "arrayof_jbyte_disjoint_arraycopy"); 3081 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 3082 "arrayof_jbyte_arraycopy"); 3083 3084 //*** jshort 3085 // Always need aligned and unaligned versions 3086 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3087 "jshort_disjoint_arraycopy"); 3088 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 3089 &entry_jshort_arraycopy, 3090 "jshort_arraycopy"); 3091 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 3092 "arrayof_jshort_disjoint_arraycopy"); 3093 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 3094 "arrayof_jshort_arraycopy"); 3095 3096 //*** jint 3097 // Aligned versions 3098 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 3099 "arrayof_jint_disjoint_arraycopy"); 3100 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 3101 "arrayof_jint_arraycopy"); 3102 // In the 64-bit VM we need both aligned and unaligned versions of jint arraycopy. 3103 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3104 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 3105 "jint_disjoint_arraycopy"); 3106 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 3107 &entry_jint_arraycopy, 3108 "jint_arraycopy"); 3109 3110 //*** jlong 3111 // It is always aligned 3112 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 3113 "arrayof_jlong_disjoint_arraycopy"); 3114 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 3115 "arrayof_jlong_arraycopy"); 3116 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 3117 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 3118 3119 3120 //*** oops 3121 // Aligned versions 3122 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 3123 "arrayof_oop_disjoint_arraycopy"); 3124 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 3125 "arrayof_oop_arraycopy"); 3126 // Aligned versions without pre-barriers 3127 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 3128 "arrayof_oop_disjoint_arraycopy_uninit", 3129 /*dest_uninitialized*/true); 3130 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 3131 "arrayof_oop_arraycopy_uninit", 3132 /*dest_uninitialized*/true); 3133 if (UseCompressedOops) { 3134 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 3135 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 3136 "oop_disjoint_arraycopy"); 3137 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 3138 "oop_arraycopy"); 3139 // Unaligned versions without pre-barriers 3140 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 3141 "oop_disjoint_arraycopy_uninit", 3142 /*dest_uninitialized*/true); 3143 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3144 "oop_arraycopy_uninit", 3145 /*dest_uninitialized*/true); 3146 } else { 3147 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3148 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3149 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3150 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3151 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3152 } 3153 3154 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3155 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3156 /*dest_uninitialized*/true); 3157 3158 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3159 entry_jbyte_arraycopy, 3160 entry_jshort_arraycopy, 3161 entry_jint_arraycopy, 3162 entry_jlong_arraycopy); 3163 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3164 entry_jbyte_arraycopy, 3165 entry_jshort_arraycopy, 3166 entry_jint_arraycopy, 3167 entry_oop_arraycopy, 3168 entry_jlong_arraycopy, 3169 entry_checkcast_arraycopy); 3170 3171 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3172 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, 
"jshort_fill"); 3173 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3174 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3175 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3176 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3177 3178 if (UseBlockZeroing) { 3179 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words"); 3180 } 3181 } 3182 3183 address generate_aescrypt_encryptBlock() { 3184 // required since we read expanded key 'int' array starting first element without alignment considerations 3185 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3186 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3187 __ align(CodeEntryAlignment); 3188 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3189 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 3190 address start = __ pc(); 3191 Register from = O0; // source byte array 3192 Register to = O1; // destination byte array 3193 Register key = O2; // expanded key array 3194 const Register keylen = O4; //reg for storing expanded key array length 3195 3196 // read expanded key length 3197 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3198 3199 // Method to address arbitrary alignment for load instructions: 3200 // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 3201 // If zero/aligned then continue with double FP load instructions 3202 // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 3203 // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 3204 // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 3205 // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 3206 3207 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3208 __ andcc(from, 7, G0); 3209 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3210 __ delayed()->alignaddr(from, G0, from); 3211 3212 // aligned case: load input into F54-F56 3213 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3214 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3215 __ ba_short(L_load_expanded_key); 3216 3217 __ BIND(L_load_misaligned_input); 3218 __ ldf(FloatRegisterImpl::D, from, 0, F54); 3219 __ ldf(FloatRegisterImpl::D, from, 8, F56); 3220 __ ldf(FloatRegisterImpl::D, from, 16, F58); 3221 __ faligndata(F54, F56, F54); 3222 __ faligndata(F56, F58, F56); 3223 3224 __ BIND(L_load_expanded_key); 3225 // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 3226 for ( int i = 0; i <= 38; i += 2 ) { 3227 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 3228 } 3229 3230 // perform cipher transformation 3231 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3232 __ fxor(FloatRegisterImpl::D, F2, F56, F56); 3233 // rounds 1 through 8 3234 for ( int i = 4; i <= 28; i += 8 ) { 3235 __ aes_eround01(as_FloatRegister(i), F54, F56, F58); 3236 __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); 3237 __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); 3238 __ aes_eround23(as_FloatRegister(i+6), F58, 
F60, F56); 3239 } 3240 __ aes_eround01(F36, F54, F56, F58); //round 9 3241 __ aes_eround23(F38, F54, F56, F60); 3242 3243 // 128-bit original key size 3244 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); 3245 3246 for ( int i = 40; i <= 50; i += 2 ) { 3247 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); 3248 } 3249 __ aes_eround01(F40, F58, F60, F54); //round 10 3250 __ aes_eround23(F42, F58, F60, F56); 3251 __ aes_eround01(F44, F54, F56, F58); //round 11 3252 __ aes_eround23(F46, F54, F56, F60); 3253 3254 // 192-bit original key size 3255 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); 3256 3257 __ ldf(FloatRegisterImpl::D, key, 208, F52); 3258 __ aes_eround01(F48, F58, F60, F54); //round 12 3259 __ aes_eround23(F50, F58, F60, F56); 3260 __ ldf(FloatRegisterImpl::D, key, 216, F46); 3261 __ ldf(FloatRegisterImpl::D, key, 224, F48); 3262 __ ldf(FloatRegisterImpl::D, key, 232, F50); 3263 __ aes_eround01(F52, F54, F56, F58); //round 13 3264 __ aes_eround23(F46, F54, F56, F60); 3265 __ ba_short(L_storeOutput); 3266 3267 __ BIND(L_doLast128bit); 3268 __ ldf(FloatRegisterImpl::D, key, 160, F48); 3269 __ ldf(FloatRegisterImpl::D, key, 168, F50); 3270 3271 __ BIND(L_storeOutput); 3272 // perform last round of encryption common for all key sizes 3273 __ aes_eround01_l(F48, F58, F60, F54); //last round 3274 __ aes_eround23_l(F50, F58, F60, F56); 3275 3276 // Method to address arbitrary alignment for store instructions: 3277 // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 3278 // If zero/aligned then continue with double FP store instructions 3279 // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 3280 // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 3281 // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 3282 // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 
3283 // Set GSR.align to (8-n) using alignaddr 3284 // Circular byte shift the store values by n places so that the original bytes are at the correct position for stpartialf 3285 // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 3286 // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 3287 // Negate the edge mask so that the subsequent stpartialf can store the original (8-n+1)th through 8th bytes at appropriate address 3288 // We need to execute this process for both the 8-byte result values 3289
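 // Illustrative note (not part of the generated stub): hypothetical C models
 // of the two VIS alignment techniques used by these stubs.
 //
 //   // Misaligned load: faligndata(a, b) extracts 8 bytes of the 16-byte
 //   // concatenation a:b starting at byte offset skew = addr & 7 (GSR.align):
 //   uint64_t faligndata_model(uint64_t a, uint64_t b, unsigned skew) {
 //     return skew ? (a << (8 * skew)) | (b >> (64 - 8 * skew)) : a;
 //   }
 //
 //   // Misaligned store of one 8-byte value v at 'to', per the steps above:
 //   unsigned n = (uintptr_t)to & 7;                  // from the andcc, kept in O5
 //   uint8_t  mask  = 0xFF >> n;                      // edge8n: dest 0x07 -> 00000001
 //   uint64_t v_rot = rotr64(v, 8 * n);               // faligndata(F,F,F), GSR.align = 8-n
 //   masked_store(to - n,     mask,           v_rot); // stpartialf: first 8-n bytes
 //   masked_store(to - n + 8, (uint8_t)~mask, v_rot); // orn + stpartialf: last n bytes
 //
 // rotr64/masked_store are stand-ins for the circular byte shift and the
 // ASI_PST8_PRIMARY partial stores; the stub runs the store sequence for both
 // 8-byte halves of the 16-byte result.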
3290 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3291 __ andcc(to, 7, O5); 3292 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3293 __ delayed()->edge8n(to, G0, O3); 3294 3295 // aligned case: store output into the destination array 3296 __ stf(FloatRegisterImpl::D, F54, to, 0); 3297 __ retl(); 3298 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 3299 3300 __ BIND(L_store_misaligned_output); 3301 __ add(to, 8, O4); 3302 __ mov(8, O2); 3303 __ sub(O2, O5, O2); 3304 __ alignaddr(O2, G0, O2); 3305 __ faligndata(F54, F54, F54); 3306 __ faligndata(F56, F56, F56); 3307 __ and3(to, -8, to); 3308 __ and3(O4, -8, O4); 3309 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3310 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3311 __ add(to, 8, to); 3312 __ add(O4, 8, O4); 3313 __ orn(G0, O3, O3); 3314 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 3315 __ retl(); 3316 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 3317 3318 return start; 3319 } 3320 3321 address generate_aescrypt_decryptBlock() { 3322 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3323 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3324 // required since we read original key 'byte' array as well in the decryption stubs 3325 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3326 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3327 __ align(CodeEntryAlignment); 3328 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3329 address start = __ pc(); 3330 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 3331 Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 3332 Register from = O0; // source byte array 3333 Register to = O1; // destination byte array 3334 Register key = O2; // expanded key array 3335 Register original_key = O3; // original key array only required during decryption 3336 const Register keylen = O4; // reg for storing expanded key array length 3337 3338 // read expanded key array length 3339 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3340 3341 // save 'from' since we may need to recheck alignment in case of 256-bit decryption 3342 __ mov(from, G1); 3343 3344 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3345 __ andcc(from, 7, G0); 3346 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 3347 __ delayed()->alignaddr(from, G0, from); 3348 3349 // aligned case: load input into F52-F54 3350 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3351 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3352 __ ba_short(L_load_original_key); 3353 3354 __ BIND(L_load_misaligned_input); 3355 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3356 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3357 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3358 __ faligndata(F52, F54, F52); 3359 __ faligndata(F54, F56, F54); 3360 3361 __ BIND(L_load_original_key); 3362 // load original key from SunJCE expanded decryption key 3363 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3364 for ( int i = 0; i <= 3; i++ ) { 3365 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3366 } 3367 3368 // 256-bit original key size 3369 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 3370 3371 // 192-bit original key size 3372 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 3373 3374 // 128-bit original key size 3375 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3376 for ( int i = 0; i <= 36; i += 4 ) { 3377 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 3378 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 3379 } 3380 3381 // perform 128-bit key specific inverse cipher transformation 3382 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 3383 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 3384 __ ba_short(L_common_transform); 3385 3386 __ BIND(L_expand192bit); 3387 3388 // start loading rest of the 192-bit key 3389 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3390 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 3391 3392 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3393 for ( int i = 0; i <= 36; i += 6 ) { 3394 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 3395 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 3396 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3397 } 3398 __ aes_kexpand1(F42, F46, 7, F48); 3399 __ aes_kexpand2(F44, F48, F50); 3400 3401 // perform 192-bit key specific inverse cipher transformation 3402 __ fxor(FloatRegisterImpl::D, F50, F54, F54); 3403 __ fxor(FloatRegisterImpl::D, F48, F52, F52); 3404 __ aes_dround23(F46, F52, F54, F58); 3405 __ aes_dround01(F44, F52, F54, F56); 3406 __ aes_dround23(F42, F56, F58, F54); 3407 __ aes_dround01(F40, F56, F58, F52); 3408 __ ba_short(L_common_transform); 3409 3410 __ BIND(L_expand256bit); 3411 3412 // load rest of the 256-bit key 3413 for ( int i = 4; i <= 7; i++ ) { 3414 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3415 } 3416 3417 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3418 for ( int i = 0; i <= 40; i += 8 ) { 3419 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 3420 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3421 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 3422 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 3423 } 3424 __ aes_kexpand1(F48, F54, 6, F56); 3425 __ aes_kexpand2(F50, F56, F58); 3426 3427 for ( int i = 0; i <= 6; i += 2 ) { 3428 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 3429 } 3430 3431 // reload original 'from' address 3432 __ mov(G1, from); 3433 3434 // re-check 8-byte alignment 3435
__ andcc(from, 7, G0); 3436 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); 3437 __ delayed()->alignaddr(from, G0, from); 3438 3439 // aligned case: load input into F52-F54 3440 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3441 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3442 __ ba_short(L_256bit_transform); 3443 3444 __ BIND(L_reload_misaligned_input); 3445 __ ldf(FloatRegisterImpl::D, from, 0, F52); 3446 __ ldf(FloatRegisterImpl::D, from, 8, F54); 3447 __ ldf(FloatRegisterImpl::D, from, 16, F56); 3448 __ faligndata(F52, F54, F52); 3449 __ faligndata(F54, F56, F54); 3450 3451 // perform 256-bit key specific inverse cipher transformation 3452 __ BIND(L_256bit_transform); 3453 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 3454 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 3455 __ aes_dround23(F4, F52, F54, F58); 3456 __ aes_dround01(F6, F52, F54, F56); 3457 __ aes_dround23(F50, F56, F58, F54); 3458 __ aes_dround01(F48, F56, F58, F52); 3459 __ aes_dround23(F46, F52, F54, F58); 3460 __ aes_dround01(F44, F52, F54, F56); 3461 __ aes_dround23(F42, F56, F58, F54); 3462 __ aes_dround01(F40, F56, F58, F52); 3463 3464 for ( int i = 0; i <= 7; i++ ) { 3465 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3466 } 3467 3468 // perform inverse cipher transformations common for all key sizes 3469 __ BIND(L_common_transform); 3470 for ( int i = 38; i >= 6; i -= 8 ) { 3471 __ aes_dround23(as_FloatRegister(i), F52, F54, F58); 3472 __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); 3473 if ( i != 6) { 3474 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); 3475 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52); 3476 } else { 3477 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); 3478 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); 3479 } 3480 } 3481 3482 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3483 __ andcc(to, 7, O5); 3484 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 3485 __ delayed()->edge8n(to, G0, O3); 3486 3487 // aligned case: store output into the destination array 3488 __ stf(FloatRegisterImpl::D, F52, to, 0); 3489 __ retl(); 3490 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); 3491 3492 __ BIND(L_store_misaligned_output); 3493 __ add(to, 8, O4); 3494 __ mov(8, O2); 3495 __ sub(O2, O5, O2); 3496 __ alignaddr(O2, G0, O2); 3497 __ faligndata(F52, F52, F52); 3498 __ faligndata(F54, F54, F54); 3499 __ and3(to, -8, to); 3500 __ and3(O4, -8, O4); 3501 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3502 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3503 __ add(to, 8, to); 3504 __ add(O4, 8, O4); 3505 __ orn(G0, O3, O3); 3506 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 3507 __ retl(); 3508 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 3509 3510 return start; 3511 } 3512 3513 address generate_cipherBlockChaining_encryptAESCrypt() { 3514 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3515 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3516 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3517 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3518 __ align(CodeEntryAlignment); 3519 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3520 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; 3521 Label 
L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; 3522 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; 3523 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; 3524 address start = __ pc(); 3525 Register from = I0; // source byte array 3526 Register to = I1; // destination byte array 3527 Register key = I2; // expanded key array 3528 Register rvec = I3; // init vector 3529 const Register len_reg = I4; // cipher length 3530 const Register keylen = I5; // reg for storing expanded key array length 3531 3532 __ save_frame(0); 3533 // save cipher len to return in the end 3534 __ mov(len_reg, L0); 3535 3536 // read expanded key length 3537 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3538 3539 // load initial vector, 8-byte alignment is guaranteed 3540 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 3541 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 3542 // load key, 8-byte alignment is guaranteed 3543 __ ldx(key,0,G1); 3544 __ ldx(key,8,G5); 3545 3546 // start loading expanded key, 8-byte alignment is guaranteed 3547 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 3548 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3549 } 3550 3551 // 128-bit original key size 3552 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); 3553 3554 for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { 3555 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3556 } 3557 3558 // 192-bit original key size 3559 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); 3560 3561 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { 3562 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 3563 } 3564 3565 // 256-bit original key size 3566 __ ba_short(L_cbcenc256); 3567 3568 __ align(OptoLoopAlignment); 3569 __ BIND(L_cbcenc128); 3570 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3571 __ andcc(from, 7, G0); 3572 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 3573 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3574 3575 // aligned case: load input into G3 and G4 3576 __ ldx(from,0,G3); 3577 __ ldx(from,8,G4); 3578 __ ba_short(L_128bit_transform); 3579 3580 __ BIND(L_load_misaligned_input_128bit); 3581 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3582 __ alignaddr(from, G0, from); 3583 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3584 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3585 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3586 __ faligndata(F48, F50, F48); 3587 __ faligndata(F50, F52, F50); 3588 __ movdtox(F48, G3); 3589 __ movdtox(F50, G4); 3590 __ mov(L1, from); 3591 3592 __ BIND(L_128bit_transform); 3593 __ xor3(G1,G3,G3); 3594 __ xor3(G5,G4,G4); 3595 __ movxtod(G3,F56); 3596 __ movxtod(G4,F58); 3597 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3598 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3599
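 // For reference: together with the xor3 against G1:G5 (round key 0), the
 // fxor above implements the CBC chaining step C_i = E_K(P_i ^ C_{i-1}),
 // with F60:F62 holding the previous ciphertext (the IV for block 0).
 // A C-like model of this block's setup:
 //
 //   x0 = p0 ^ k0;  x1 = p1 ^ k1;   // xor3: fold in round key 0
 //   f60 ^= x0;     f62 ^= x1;      // fxor: chain with IV / previous C_i
 //   // the eround loop below then applies the remaining AES rounds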
3600 // TEN_EROUNDS 3601 for ( int i = 0; i <= 32; i += 8 ) { 3602 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3603 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3604 if (i != 32 ) { 3605 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3606 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3607 } else { 3608 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3609 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3610 } 3611 } 3612 3613 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3614 __ andcc(to, 7, L1); 3615 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 3616 __ delayed()->edge8n(to, G0, L2); 3617 3618 // aligned case: store output into the destination array 3619 __ stf(FloatRegisterImpl::D, F60, to, 0); 3620 __ stf(FloatRegisterImpl::D, F62, to, 8); 3621 __ ba_short(L_check_loop_end_128bit); 3622 3623 __ BIND(L_store_misaligned_output_128bit); 3624 __ add(to, 8, L3); 3625 __ mov(8, L4); 3626 __ sub(L4, L1, L4); 3627 __ alignaddr(L4, G0, L4); 3628 // save cipher text before circular right shift 3629 // as it needs to be stored as iv for next block (see code before next retl) 3630 __ movdtox(F60, L6); 3631 __ movdtox(F62, L7); 3632 __ faligndata(F60, F60, F60); 3633 __ faligndata(F62, F62, F62); 3634 __ mov(to, L5); 3635 __ and3(to, -8, to); 3636 __ and3(L3, -8, L3); 3637 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3638 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3639 __ add(to, 8, to); 3640 __ add(L3, 8, L3); 3641 __ orn(G0, L2, L2); 3642 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3643 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3644 __ mov(L5, to); 3645 __ movxtod(L6, F60); 3646 __ movxtod(L7, F62); 3647 3648 __ BIND(L_check_loop_end_128bit); 3649 __ add(from, 16, from); 3650 __ add(to, 16, to); 3651 __ subcc(len_reg, 16, len_reg); 3652 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 3653 __ delayed()->nop(); 3654 // re-init initial vector for next block, 8-byte alignment is guaranteed 3655 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3656 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3657 __ mov(L0, I0); 3658 __ ret(); 3659 __ delayed()->restore(); 3660
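 // Note on the keylen constants (44/52/60) tested in these stubs: the
 // expanded AES key holds 4 * (rounds + 1) 32-bit words, so 44 words
 // implies AES-128 (10 rounds, TEN_EROUNDS), 52 implies AES-192
 // (12 rounds, TWELVE_EROUNDS), and 60 implies AES-256 (14 rounds,
 // FOURTEEN_EROUNDS).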
3661 __ align(OptoLoopAlignment); 3662 __ BIND(L_cbcenc192); 3663 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3664 __ andcc(from, 7, G0); 3665 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); 3666 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3667 3668 // aligned case: load input into G3 and G4 3669 __ ldx(from,0,G3); 3670 __ ldx(from,8,G4); 3671 __ ba_short(L_192bit_transform); 3672 3673 __ BIND(L_load_misaligned_input_192bit); 3674 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 3675 __ alignaddr(from, G0, from); 3676 __ ldf(FloatRegisterImpl::D, from, 0, F48); 3677 __ ldf(FloatRegisterImpl::D, from, 8, F50); 3678 __ ldf(FloatRegisterImpl::D, from, 16, F52); 3679 __ faligndata(F48, F50, F48); 3680 __ faligndata(F50, F52, F50); 3681 __ movdtox(F48, G3); 3682 __ movdtox(F50, G4); 3683 __ mov(L1, from); 3684 3685 __ BIND(L_192bit_transform); 3686 __ xor3(G1,G3,G3); 3687 __ xor3(G5,G4,G4); 3688 __ movxtod(G3,F56); 3689 __ movxtod(G4,F58); 3690 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3691 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3692 3693 // TWELVE_EROUNDS 3694 for ( int i = 0; i <= 40; i += 8 ) { 3695 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3696 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3697 if (i != 40 ) { 3698 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3699 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3700 } else { 3701 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3702 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3703 } 3704 } 3705 3706 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3707 __ andcc(to, 7, L1); 3708 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); 3709 __ delayed()->edge8n(to, G0, L2); 3710 3711 // aligned case: store output into the destination array 3712 __ stf(FloatRegisterImpl::D, F60, to, 0); 3713 __ stf(FloatRegisterImpl::D, F62, to, 8); 3714 __ ba_short(L_check_loop_end_192bit); 3715 3716 __ BIND(L_store_misaligned_output_192bit); 3717 __ add(to, 8, L3); 3718 __ mov(8, L4); 3719 __ sub(L4, L1, L4); 3720 __ alignaddr(L4, G0, L4); 3721 __ movdtox(F60, L6); 3722 __ movdtox(F62, L7); 3723 __ faligndata(F60, F60, F60); 3724 __ faligndata(F62, F62, F62); 3725 __ mov(to, L5); 3726 __ and3(to, -8, to); 3727 __ and3(L3, -8, L3); 3728 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3729 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3730 __ add(to, 8, to); 3731 __ add(L3, 8, L3); 3732 __ orn(G0, L2, L2); 3733 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3734 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3735 __ mov(L5, to); 3736 __ movxtod(L6, F60); 3737 __ movxtod(L7, F62); 3738 3739 __ BIND(L_check_loop_end_192bit); 3740 __ add(from, 16, from); 3741 __ subcc(len_reg, 16, len_reg); 3742 __ add(to, 16, to); 3743 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 3744 __ delayed()->nop(); 3745 // re-init initial vector for next block, 8-byte alignment is guaranteed 3746 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3747 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3748 __ mov(L0, I0); 3749 __ ret(); 3750 __ delayed()->restore(); 3751
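 // A note on the conditional-branch idiom used throughout: passing 'true'
 // as the second argument to br() sets the SPARC annul bit, so the
 // delayed()-> instruction executes only when the branch is taken, e.g.
 //
 //   __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
 //   __ delayed()->mov(from, L1);   // runs only on the misaligned path
 //
 // whereas br(..., false, ...) leaves the delay-slot instruction unconditional.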
3752 __ align(OptoLoopAlignment); 3753 __ BIND(L_cbcenc256); 3754 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3755 __ andcc(from, 7, G0); 3756 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); 3757 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 3758 3759 // aligned case: load input into G3 and G4 3760 __ ldx(from,0,G3); 3761 __ ldx(from,8,G4); 3762 __ ba_short(L_256bit_transform); 3763 3764 __ BIND(L_load_misaligned_input_256bit); 3765 // cannot clobber F48, F50 and F52. F56, F58 can be used though 3766 __ alignaddr(from, G0, from); 3767 __ movdtox(F60, L2); // save F60 before overwriting 3768 __ ldf(FloatRegisterImpl::D, from, 0, F56); 3769 __ ldf(FloatRegisterImpl::D, from, 8, F58); 3770 __ ldf(FloatRegisterImpl::D, from, 16, F60); 3771 __ faligndata(F56, F58, F56); 3772 __ faligndata(F58, F60, F58); 3773 __ movdtox(F56, G3); 3774 __ movdtox(F58, G4); 3775 __ mov(L1, from); 3776 __ movxtod(L2, F60); 3777 3778 __ BIND(L_256bit_transform); 3779 __ xor3(G1,G3,G3); 3780 __ xor3(G5,G4,G4); 3781 __ movxtod(G3,F56); 3782 __ movxtod(G4,F58); 3783 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 3784 __ fxor(FloatRegisterImpl::D, F62, F58, F62); 3785 3786 // FOURTEEN_EROUNDS 3787 for ( int i = 0; i <= 48; i += 8 ) { 3788 __ aes_eround01(as_FloatRegister(i), F60, F62, F56); 3789 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); 3790 if (i != 48 ) { 3791 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); 3792 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); 3793 } else { 3794 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); 3795 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); 3796 } 3797 } 3798 3799 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 3800 __ andcc(to, 7, L1); 3801 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); 3802 __ delayed()->edge8n(to, G0, L2); 3803 3804 // aligned case: store output into the destination array 3805 __ stf(FloatRegisterImpl::D, F60, to, 0); 3806 __ stf(FloatRegisterImpl::D, F62, to, 8); 3807 __ ba_short(L_check_loop_end_256bit); 3808 3809 __ BIND(L_store_misaligned_output_256bit); 3810 __ add(to, 8, L3); 3811 __ mov(8, L4); 3812 __ sub(L4, L1, L4); 3813 __ alignaddr(L4, G0, L4); 3814 __ movdtox(F60, L6); 3815 __ movdtox(F62, L7); 3816 __ faligndata(F60, F60, F60); 3817 __ faligndata(F62, F62, F62); 3818 __ mov(to, L5); 3819 __ and3(to, -8, to); 3820 __ and3(L3, -8, L3); 3821 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3822 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3823 __ add(to, 8, to); 3824 __ add(L3, 8, L3); 3825 __ orn(G0, L2, L2); 3826 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 3827 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 3828 __ mov(L5, to); 3829 __ movxtod(L6, F60); 3830 __ movxtod(L7, F62); 3831 3832 __ BIND(L_check_loop_end_256bit); 3833 __ add(from, 16, from); 3834 __ subcc(len_reg, 16, len_reg); 3835 __ add(to, 16, to); 3836 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 3837 __ delayed()->nop(); 3838 // re-init initial vector for next block, 8-byte alignment is guaranteed 3839 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 3840 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 3841 __ mov(L0, I0); 3842 __ ret(); 3843 __ delayed()->restore(); 3844 3845 return start; 3846 } 3847
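 // Unlike CBC encryption above, which is inherently serial (block i needs
 // ciphertext i-1 as input), CBC decryption parallelizes: every input to
 // P_i = D_K(C_i) ^ C_{i-1} is ciphertext that is already available. The
 // stub below exploits this by decrypting two 16-byte blocks per loop
 // iteration ("next2_blocks"), interleaving the two independent dround
 // sequences. In C-like form:
 //
 //   p[i]   = D(c[i])   ^ c[i-1];   // the two D() computations share no
 //   p[i+1] = D(c[i+1]) ^ c[i];     // data and can run in parallel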
3848 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3849 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 3850 "the following code assumes that first element of an int array is aligned to 8 bytes"); 3851 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 3852 "the following code assumes that first element of a byte array is aligned to 8 bytes"); 3853 __ align(CodeEntryAlignment); 3854 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3855 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 3856 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 3857 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 3858 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 3859 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 3860 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 3861 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 3862 address start = __ pc(); 3863 Register from = I0; // source byte array 3864 Register to = I1; // destination byte array 3865 Register key = I2; // expanded key array 3866 Register rvec = I3; // init vector 3867 const Register len_reg = I4; // cipher length 3868 const Register original_key = I5; // original key array only required during decryption 3869 const Register keylen = L6; // reg for storing expanded key array length 3870 3871 __ save_frame(0); // args are read from I* registers since we save the frame at the beginning 3872 // save cipher len to return in the end 3873 __ mov(len_reg, L7); 3874 3875 // load original key from SunJCE expanded decryption key 3876 // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 3877 for ( int i = 0; i <= 3; i++ ) { 3878 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3879 } 3880 3881 // load initial vector, 8-byte alignment is guaranteed 3882 __ ldx(rvec,0,L0); 3883 __ ldx(rvec,8,L1); 3884 3885 // read expanded key array length 3886 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 3887 3888 // 256-bit original key size 3889 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); 3890 3891 // 192-bit original key size 3892 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); 3893 3894 // 128-bit original key size 3895 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3896 for ( int i = 0; i <= 36; i += 4 ) { 3897 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); 3898 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); 3899 } 3900 3901 // load expanded key[last-1] and key[last] elements 3902 __ movdtox(F40,L2); 3903 __ movdtox(F42,L3); 3904 3905 __ and3(len_reg, 16, L4); 3906 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 3907 __ nop(); 3908 3909 __ ba_short(L_dec_first_block_start); 3910 3911 __ BIND(L_expand192bit); 3912 // load rest of the 192-bit key 3913 __ ldf(FloatRegisterImpl::S, original_key, 16, F4); 3914 __ ldf(FloatRegisterImpl::S, original_key, 20, F5); 3915 3916 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3917 for ( int i = 0; i <= 36; i += 6 ) { 3918 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); 3919 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); 3920 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3921 } 3922 __ aes_kexpand1(F42, F46, 7, F48); 3923 __ aes_kexpand2(F44, F48, F50); 3924 3925 // load expanded key[last-1] and key[last] elements 3926 __ movdtox(F48,L2); 3927 __ movdtox(F50,L3); 3928 3929
__ and3(len_reg, 16, L4); 3930 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 3931 __ nop(); 3932 3933 __ ba_short(L_dec_first_block_start); 3934 3935 __ BIND(L_expand256bit); 3936 // load rest of the 256-bit key 3937 for ( int i = 4; i <= 7; i++ ) { 3938 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 3939 } 3940 3941 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions 3942 for ( int i = 0; i <= 40; i += 8 ) { 3943 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); 3944 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); 3945 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); 3946 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); 3947 } 3948 __ aes_kexpand1(F48, F54, 6, F56); 3949 __ aes_kexpand2(F50, F56, F58); 3950 3951 // load expanded key[last-1] and key[last] elements 3952 __ movdtox(F56,L2); 3953 __ movdtox(F58,L3); 3954 3955 __ and3(len_reg, 16, L4); 3956 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 3957 3958 __ BIND(L_dec_first_block_start); 3959 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 3960 __ andcc(from, 7, G0); 3961 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 3962 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 3963 3964 // aligned case: load input into L4 and L5 3965 __ ldx(from,0,L4); 3966 __ ldx(from,8,L5); 3967 __ ba_short(L_transform_first_block); 3968 3969 __ BIND(L_load_misaligned_input_first_block); 3970 __ alignaddr(from, G0, from); 3971 // F58, F60, F62 can be clobbered 3972 __ ldf(FloatRegisterImpl::D, from, 0, F58); 3973 __ ldf(FloatRegisterImpl::D, from, 8, F60); 3974 __ ldf(FloatRegisterImpl::D, from, 16, F62); 3975 __ faligndata(F58, F60, F58); 3976 __ faligndata(F60, F62, F60); 3977 __ movdtox(F58, L4); 3978 __ movdtox(F60, L5); 3979 __ mov(G1, from); 3980 3981 __ BIND(L_transform_first_block); 3982 __ xor3(L2,L4,G1); 3983 __ movxtod(G1,F60); 3984 __ xor3(L3,L5,G1); 3985 __ movxtod(G1,F62); 3986 3987 // 128-bit original key size 3988 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); 3989 3990 // 192-bit original key size 3991 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); 3992 3993 __ aes_dround23(F54, F60, F62, F58); 3994 __ aes_dround01(F52, F60, F62, F56); 3995 __ aes_dround23(F50, F56, F58, F62); 3996 __ aes_dround01(F48, F56, F58, F60); 3997 3998 __ BIND(L_dec_first_block192); 3999 __ aes_dround23(F46, F60, F62, F58); 4000 __ aes_dround01(F44, F60, F62, F56); 4001 __ aes_dround23(F42, F56, F58, F62); 4002 __ aes_dround01(F40, F56, F58, F60); 4003 4004 __ BIND(L_dec_first_block128); 4005 for ( int i = 38; i >= 6; i -= 8 ) { 4006 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4007 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4008 if ( i != 6) { 4009 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4010 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4011 } else { 4012 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4013 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4014 } 4015 } 4016 4017 __ movxtod(L0,F56); 4018 __ movxtod(L1,F58); 4019 __ mov(L4,L0); 4020 __ mov(L5,L1); 4021 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4022 __ 
fxor(FloatRegisterImpl::D, F58, F62, F62); 4023 4024 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4025 __ andcc(to, 7, G1); 4026 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 4027 __ delayed()->edge8n(to, G0, G2); 4028 4029 // aligned case: store output into the destination array 4030 __ stf(FloatRegisterImpl::D, F60, to, 0); 4031 __ stf(FloatRegisterImpl::D, F62, to, 8); 4032 __ ba_short(L_check_decrypt_end); 4033 4034 __ BIND(L_store_misaligned_output_first_block); 4035 __ add(to, 8, G3); 4036 __ mov(8, G4); 4037 __ sub(G4, G1, G4); 4038 __ alignaddr(G4, G0, G4); 4039 __ faligndata(F60, F60, F60); 4040 __ faligndata(F62, F62, F62); 4041 __ mov(to, G1); 4042 __ and3(to, -8, to); 4043 __ and3(G3, -8, G3); 4044 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 4045 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 4046 __ add(to, 8, to); 4047 __ add(G3, 8, G3); 4048 __ orn(G0, G2, G2); 4049 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 4050 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 4051 __ mov(G1, to); 4052 4053 __ BIND(L_check_decrypt_end); 4054 __ add(from, 16, from); 4055 __ add(to, 16, to); 4056 __ subcc(len_reg, 16, len_reg); 4057 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); 4058 __ delayed()->nop(); 4059 4060 // 256-bit original key size 4061 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); 4062 4063 // 192-bit original key size 4064 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); 4065 4066 __ align(OptoLoopAlignment); 4067 __ BIND(L_dec_next2_blocks128); 4068 __ nop(); 4069 4070 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4071 __ andcc(from, 7, G0); 4072 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128); 4073 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4074 4075 // aligned case: load input into G4, G5, L4 and L5 4076 __ ldx(from,0,G4); 4077 __ ldx(from,8,G5); 4078 __ ldx(from,16,L4); 4079 __ ldx(from,24,L5); 4080 __ ba_short(L_transform_next2_blocks128); 4081 4082 __ BIND(L_load_misaligned_next2_blocks128); 4083 __ alignaddr(from, G0, from); 4084 // F40, F42, F58, F60, F62 can be clobbered 4085 __ ldf(FloatRegisterImpl::D, from, 0, F40); 4086 __ ldf(FloatRegisterImpl::D, from, 8, F42); 4087 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4088 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4089 __ ldf(FloatRegisterImpl::D, from, 32, F58); 4090 __ faligndata(F40, F42, F40); 4091 __ faligndata(F42, F60, F42); 4092 __ faligndata(F60, F62, F60); 4093 __ faligndata(F62, F58, F62); 4094 __ movdtox(F40, G4); 4095 __ movdtox(F42, G5); 4096 __ movdtox(F60, L4); 4097 __ movdtox(F62, L5); 4098 __ mov(G1, from); 4099 4100 __ BIND(L_transform_next2_blocks128); 4101 // F40:F42 used for first 16-bytes 4102 __ xor3(L2,G4,G1); 4103 __ movxtod(G1,F40); 4104 __ xor3(L3,G5,G1); 4105 __ movxtod(G1,F42); 4106 4107 // F60:F62 used for next 16-bytes 4108 __ xor3(L2,L4,G1); 4109 __ movxtod(G1,F60); 4110 __ xor3(L3,L5,G1); 4111 __ movxtod(G1,F62); 4112 4113 for ( int i = 38; i >= 6; i -= 8 ) { 4114 __ aes_dround23(as_FloatRegister(i), F40, F42, F44); 4115 __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); 4116 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4117 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4118 if (i != 6 ) 
{ 4119 __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); 4120 __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); 4121 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4122 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4123 } else { 4124 __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); 4125 __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); 4126 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4127 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4128 } 4129 } 4130 4131 __ movxtod(L0,F46); 4132 __ movxtod(L1,F44); 4133 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 4134 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 4135 4136 __ movxtod(G4,F56); 4137 __ movxtod(G5,F58); 4138 __ mov(L4,L0); 4139 __ mov(L5,L1); 4140 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4141 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4142 4143 // For mis-aligned store of 32 bytes of result we can do: 4144 // Circular right-shift all 4 FP registers so that 'head' and 'tail' 4145 // parts that need to be stored starting at mis-aligned address are in a FP reg 4146 // the other 3 FP regs can thus be stored using regular store 4147 // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 4148 4149 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4150 __ andcc(to, 7, G1); 4151 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 4152 __ delayed()->edge8n(to, G0, G2); 4153 4154 // aligned case: store output into the destination array 4155 __ stf(FloatRegisterImpl::D, F40, to, 0); 4156 __ stf(FloatRegisterImpl::D, F42, to, 8); 4157 __ stf(FloatRegisterImpl::D, F60, to, 16); 4158 __ stf(FloatRegisterImpl::D, F62, to, 24); 4159 __ ba_short(L_check_decrypt_loop_end128); 4160 4161 __ BIND(L_store_misaligned_output_next2_blocks128); 4162 __ mov(8, G4); 4163 __ sub(G4, G1, G4); 4164 __ alignaddr(G4, G0, G4); 4165 __ faligndata(F40, F42, F56); // F56 can be clobbered 4166 __ faligndata(F42, F60, F42); 4167 __ faligndata(F60, F62, F60); 4168 __ faligndata(F62, F40, F40); 4169 __ mov(to, G1); 4170 __ and3(to, -8, to); 4171 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4172 __ stf(FloatRegisterImpl::D, F56, to, 8); 4173 __ stf(FloatRegisterImpl::D, F42, to, 16); 4174 __ stf(FloatRegisterImpl::D, F60, to, 24); 4175 __ add(to, 32, to); 4176 __ orn(G0, G2, G2); 4177 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 4178 __ mov(G1, to); 4179 4180 __ BIND(L_check_decrypt_loop_end128); 4181 __ add(from, 32, from); 4182 __ add(to, 32, to); 4183 __ subcc(len_reg, 32, len_reg); 4184 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 4185 __ delayed()->nop(); 4186 __ ba_short(L_cbcdec_end); 4187 4188 __ align(OptoLoopAlignment); 4189 __ BIND(L_dec_next2_blocks192); 4190 __ nop(); 4191 4192 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4193 __ andcc(from, 7, G0); 4194 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 4195 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4196 4197 // aligned case: load input into G4, G5, L4 and L5 4198 __ ldx(from,0,G4); 4199 __ ldx(from,8,G5); 4200 __ ldx(from,16,L4); 4201 __ ldx(from,24,L5); 4202 __ ba_short(L_transform_next2_blocks192); 4203 4204 __ BIND(L_load_misaligned_next2_blocks192); 4205 __ alignaddr(from, G0, from); 4206 // F48, F50, F52, F60, F62 can be 
clobbered 4207 __ ldf(FloatRegisterImpl::D, from, 0, F48); 4208 __ ldf(FloatRegisterImpl::D, from, 8, F50); 4209 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4210 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4211 __ ldf(FloatRegisterImpl::D, from, 32, F52); 4212 __ faligndata(F48, F50, F48); 4213 __ faligndata(F50, F60, F50); 4214 __ faligndata(F60, F62, F60); 4215 __ faligndata(F62, F52, F62); 4216 __ movdtox(F48, G4); 4217 __ movdtox(F50, G5); 4218 __ movdtox(F60, L4); 4219 __ movdtox(F62, L5); 4220 __ mov(G1, from); 4221 4222 __ BIND(L_transform_next2_blocks192); 4223 // F48:F50 used for first 16-bytes 4224 __ xor3(L2,G4,G1); 4225 __ movxtod(G1,F48); 4226 __ xor3(L3,G5,G1); 4227 __ movxtod(G1,F50); 4228 4229 // F60:F62 used for next 16-bytes 4230 __ xor3(L2,L4,G1); 4231 __ movxtod(G1,F60); 4232 __ xor3(L3,L5,G1); 4233 __ movxtod(G1,F62); 4234 4235 for ( int i = 46; i >= 6; i -= 8 ) { 4236 __ aes_dround23(as_FloatRegister(i), F48, F50, F52); 4237 __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); 4238 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4239 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4240 if (i != 6 ) { 4241 __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); 4242 __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); 4243 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4244 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4245 } else { 4246 __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); 4247 __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); 4248 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); 4249 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); 4250 } 4251 } 4252 4253 __ movxtod(L0,F54); 4254 __ movxtod(L1,F52); 4255 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 4256 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 4257 4258 __ movxtod(G4,F56); 4259 __ movxtod(G5,F58); 4260 __ mov(L4,L0); 4261 __ mov(L5,L1); 4262 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4263 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4264 4265 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4266 __ andcc(to, 7, G1); 4267 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); 4268 __ delayed()->edge8n(to, G0, G2); 4269 4270 // aligned case: store output into the destination array 4271 __ stf(FloatRegisterImpl::D, F48, to, 0); 4272 __ stf(FloatRegisterImpl::D, F50, to, 8); 4273 __ stf(FloatRegisterImpl::D, F60, to, 16); 4274 __ stf(FloatRegisterImpl::D, F62, to, 24); 4275 __ ba_short(L_check_decrypt_loop_end192); 4276 4277 __ BIND(L_store_misaligned_output_next2_blocks192); 4278 __ mov(8, G4); 4279 __ sub(G4, G1, G4); 4280 __ alignaddr(G4, G0, G4); 4281 __ faligndata(F48, F50, F56); // F56 can be clobbered 4282 __ faligndata(F50, F60, F50); 4283 __ faligndata(F60, F62, F60); 4284 __ faligndata(F62, F48, F48); 4285 __ mov(to, G1); 4286 __ and3(to, -8, to); 4287 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4288 __ stf(FloatRegisterImpl::D, F56, to, 8); 4289 __ stf(FloatRegisterImpl::D, F50, to, 16); 4290 __ stf(FloatRegisterImpl::D, F60, to, 24); 4291 __ add(to, 32, to); 4292 __ orn(G0, G2, G2); 4293 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 4294 __ mov(G1, to); 4295 4296 __ BIND(L_check_decrypt_loop_end192); 4297 __ add(from, 32, from); 4298 __ add(to, 32, to); 4299 __ subcc(len_reg, 32, len_reg); 4300 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 4301 __ delayed()->nop(); 4302 __ 
ba_short(L_cbcdec_end); 4303 4304 __ align(OptoLoopAlignment); 4305 __ BIND(L_dec_next2_blocks256); 4306 __ nop(); 4307 4308 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 4309 __ andcc(from, 7, G0); 4310 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); 4311 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 4312 4313 // aligned case: load input into G4, G5, L4 and L5 4314 __ ldx(from,0,G4); 4315 __ ldx(from,8,G5); 4316 __ ldx(from,16,L4); 4317 __ ldx(from,24,L5); 4318 __ ba_short(L_transform_next2_blocks256); 4319 4320 __ BIND(L_load_misaligned_next2_blocks256); 4321 __ alignaddr(from, G0, from); 4322 // F0, F2, F4, F60, F62 can be clobbered 4323 __ ldf(FloatRegisterImpl::D, from, 0, F0); 4324 __ ldf(FloatRegisterImpl::D, from, 8, F2); 4325 __ ldf(FloatRegisterImpl::D, from, 16, F60); 4326 __ ldf(FloatRegisterImpl::D, from, 24, F62); 4327 __ ldf(FloatRegisterImpl::D, from, 32, F4); 4328 __ faligndata(F0, F2, F0); 4329 __ faligndata(F2, F60, F2); 4330 __ faligndata(F60, F62, F60); 4331 __ faligndata(F62, F4, F62); 4332 __ movdtox(F0, G4); 4333 __ movdtox(F2, G5); 4334 __ movdtox(F60, L4); 4335 __ movdtox(F62, L5); 4336 __ mov(G1, from); 4337 4338 __ BIND(L_transform_next2_blocks256); 4339 // F0:F2 used for first 16-bytes 4340 __ xor3(L2,G4,G1); 4341 __ movxtod(G1,F0); 4342 __ xor3(L3,G5,G1); 4343 __ movxtod(G1,F2); 4344 4345 // F60:F62 used for next 16-bytes 4346 __ xor3(L2,L4,G1); 4347 __ movxtod(G1,F60); 4348 __ xor3(L3,L5,G1); 4349 __ movxtod(G1,F62); 4350 4351 __ aes_dround23(F54, F0, F2, F4); 4352 __ aes_dround01(F52, F0, F2, F6); 4353 __ aes_dround23(F54, F60, F62, F58); 4354 __ aes_dround01(F52, F60, F62, F56); 4355 __ aes_dround23(F50, F6, F4, F2); 4356 __ aes_dround01(F48, F6, F4, F0); 4357 __ aes_dround23(F50, F56, F58, F62); 4358 __ aes_dround01(F48, F56, F58, F60); 4359 // save F48:F54 in temp registers 4360 __ movdtox(F54,G2); 4361 __ movdtox(F52,G3); 4362 __ movdtox(F50,G6); 4363 __ movdtox(F48,G1); 4364 for ( int i = 46; i >= 14; i -= 8 ) { 4365 __ aes_dround23(as_FloatRegister(i), F0, F2, F4); 4366 __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6); 4367 __ aes_dround23(as_FloatRegister(i), F60, F62, F58); 4368 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); 4369 __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2); 4370 __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0); 4371 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); 4372 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); 4373 } 4374 // init F48:F54 with F0:F6 values (original key) 4375 __ ldf(FloatRegisterImpl::D, original_key, 0, F48); 4376 __ ldf(FloatRegisterImpl::D, original_key, 8, F50); 4377 __ ldf(FloatRegisterImpl::D, original_key, 16, F52); 4378 __ ldf(FloatRegisterImpl::D, original_key, 24, F54); 4379 __ aes_dround23(F54, F0, F2, F4); 4380 __ aes_dround01(F52, F0, F2, F6); 4381 __ aes_dround23(F54, F60, F62, F58); 4382 __ aes_dround01(F52, F60, F62, F56); 4383 __ aes_dround23_l(F50, F6, F4, F2); 4384 __ aes_dround01_l(F48, F6, F4, F0); 4385 __ aes_dround23_l(F50, F56, F58, F62); 4386 __ aes_dround01_l(F48, F56, F58, F60); 4387 // re-init F48:F54 with their original values 4388 __ movxtod(G2,F54); 4389 __ movxtod(G3,F52); 4390 __ movxtod(G6,F50); 4391 __ movxtod(G1,F48); 4392 4393 __ movxtod(L0,F6); 4394 __ movxtod(L1,F4); 4395 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 4396 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 4397 4398 __ movxtod(G4,F56); 4399 __ 
movxtod(G5,F58); 4400 __ mov(L4,L0); 4401 __ mov(L5,L1); 4402 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 4403 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 4404 4405 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 4406 __ andcc(to, 7, G1); 4407 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); 4408 __ delayed()->edge8n(to, G0, G2); 4409 4410 // aligned case: store output into the destination array 4411 __ stf(FloatRegisterImpl::D, F0, to, 0); 4412 __ stf(FloatRegisterImpl::D, F2, to, 8); 4413 __ stf(FloatRegisterImpl::D, F60, to, 16); 4414 __ stf(FloatRegisterImpl::D, F62, to, 24); 4415 __ ba_short(L_check_decrypt_loop_end256); 4416 4417 __ BIND(L_store_misaligned_output_next2_blocks256); 4418 __ mov(8, G4); 4419 __ sub(G4, G1, G4); 4420 __ alignaddr(G4, G0, G4); 4421 __ faligndata(F0, F2, F56); // F56 can be clobbered 4422 __ faligndata(F2, F60, F2); 4423 __ faligndata(F60, F62, F60); 4424 __ faligndata(F62, F0, F0); 4425 __ mov(to, G1); 4426 __ and3(to, -8, to); 4427 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4428 __ stf(FloatRegisterImpl::D, F56, to, 8); 4429 __ stf(FloatRegisterImpl::D, F2, to, 16); 4430 __ stf(FloatRegisterImpl::D, F60, to, 24); 4431 __ add(to, 32, to); 4432 __ orn(G0, G2, G2); 4433 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 4434 __ mov(G1, to); 4435 4436 __ BIND(L_check_decrypt_loop_end256); 4437 __ add(from, 32, from); 4438 __ add(to, 32, to); 4439 __ subcc(len_reg, 32, len_reg); 4440 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); 4441 __ delayed()->nop(); 4442 4443 __ BIND(L_cbcdec_end); 4444 // re-init initial vector for next block, 8-byte alignment is guaranteed 4445 __ stx(L0, rvec, 0); 4446 __ stx(L1, rvec, 8); 4447 __ mov(L7, I0); 4448 __ ret(); 4449 __ delayed()->restore(); 4450 4451 return start; 4452 } 4453 4454 address generate_sha1_implCompress(bool multi_block, const char *name) { 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", name); 4457 address start = __ pc(); 4458 4459 Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop; 4460 int i; 4461 4462 Register buf = O0; // byte[] source+offset 4463 Register state = O1; // int[] SHA.state 4464 Register ofs = O2; // int offset 4465 Register limit = O3; // int limit 4466 4467 // load state into F0-F4 4468 for (i = 0; i < 5; i++) { 4469 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4470 } 4471 4472 __ andcc(buf, 7, G0); 4473 __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input); 4474 __ delayed()->nop(); 4475 4476 __ BIND(L_sha1_loop); 4477 // load buf into F8-F22 4478 for (i = 0; i < 8; i++) { 4479 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4480 } 4481 __ sha1(); 4482 if (multi_block) { 4483 __ add(ofs, 64, ofs); 4484 __ add(buf, 64, buf); 4485 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop); 4486 __ mov(ofs, O0); // to be returned 4487 } 4488 4489 // store F0-F4 into state and return 4490 for (i = 0; i < 4; i++) { 4491 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4492 } 4493 __ retl(); 4494 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4495 4496 __ BIND(L_sha1_unaligned_input); 4497 __ alignaddr(buf, G0, buf); 4498 4499 __ BIND(L_sha1_unaligned_input_loop); 4500 // load buf into F8-F22 4501 for (i = 0; i < 9; i++) { 4502 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 +
8)); 4503 } 4504 for (i = 0; i < 8; i++) { 4505 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4506 } 4507 __ sha1(); 4508 if (multi_block) { 4509 __ add(ofs, 64, ofs); 4510 __ add(buf, 64, buf); 4511 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop); 4512 __ mov(ofs, O0); // to be returned 4513 } 4514 4515 // store F0-F4 into state and return 4516 for (i = 0; i < 4; i++) { 4517 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4518 } 4519 __ retl(); 4520 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10); 4521 4522 return start; 4523 } 4524 4525 address generate_sha256_implCompress(bool multi_block, const char *name) { 4526 __ align(CodeEntryAlignment); 4527 StubCodeMark mark(this, "StubRoutines", name); 4528 address start = __ pc(); 4529 4530 Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop; 4531 int i; 4532 4533 Register buf = O0; // byte[] source+offset 4534 Register state = O1; // int[] SHA2.state 4535 Register ofs = O2; // int offset 4536 Register limit = O3; // int limit 4537 4538 // load state into F0-F7 4539 for (i = 0; i < 8; i++) { 4540 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i)); 4541 } 4542 4543 __ andcc(buf, 7, G0); 4544 __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input); 4545 __ delayed()->nop(); 4546 4547 __ BIND(L_sha256_loop); 4548 // load buf into F8-F22 4549 for (i = 0; i < 8; i++) { 4550 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4551 } 4552 __ sha256(); 4553 if (multi_block) { 4554 __ add(ofs, 64, ofs); 4555 __ add(buf, 64, buf); 4556 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop); 4557 __ mov(ofs, O0); // to be returned 4558 } 4559 4560 // store F0-F7 into state and return 4561 for (i = 0; i < 7; i++) { 4562 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4563 } 4564 __ retl(); 4565 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4566 4567 __ BIND(L_sha256_unaligned_input); 4568 __ alignaddr(buf, G0, buf); 4569 4570 __ BIND(L_sha256_unaligned_input_loop); 4571 // load buf into F8-F22 4572 for (i = 0; i < 9; i++) { 4573 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8)); 4574 } 4575 for (i = 0; i < 8; i++) { 4576 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8)); 4577 } 4578 __ sha256(); 4579 if (multi_block) { 4580 __ add(ofs, 64, ofs); 4581 __ add(buf, 64, buf); 4582 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop); 4583 __ mov(ofs, O0); // to be returned 4584 } 4585 4586 // store F0-F7 into state and return 4587 for (i = 0; i < 7; i++) { 4588 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4); 4589 } 4590 __ retl(); 4591 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c); 4592 4593 return start; 4594 } 4595 4596 address generate_sha512_implCompress(bool multi_block, const char *name) { 4597 __ align(CodeEntryAlignment); 4598 StubCodeMark mark(this, "StubRoutines", name); 4599 address start = __ pc(); 4600 4601 Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop; 4602 int i; 4603 4604 Register buf = O0; // byte[] source+offset 4605 Register state = O1; // long[] SHA5.state 4606 Register ofs = O2; // int offset 4607 Register limit = O3; // int limit 4608 4609 // load state into F0-F14 4610 for (i = 0; i < 8; i++) { 4611 __ 
ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2)); 4612 } 4613 4614 __ andcc(buf, 7, G0); 4615 __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input); 4616 __ delayed()->nop(); 4617 4618 __ BIND(L_sha512_loop); 4619 // load buf into F16-F46 4620 for (i = 0; i < 16; i++) { 4621 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4622 } 4623 __ sha512(); 4624 if (multi_block) { 4625 __ add(ofs, 128, ofs); 4626 __ add(buf, 128, buf); 4627 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop); 4628 __ mov(ofs, O0); // to be returned 4629 } 4630 4631 // store F0-F14 into state and return 4632 for (i = 0; i < 7; i++) { 4633 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4634 } 4635 __ retl(); 4636 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4637 4638 __ BIND(L_sha512_unaligned_input); 4639 __ alignaddr(buf, G0, buf); 4640 4641 __ BIND(L_sha512_unaligned_input_loop); 4642 // load buf into F16-F46 4643 for (i = 0; i < 17; i++) { 4644 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16)); 4645 } 4646 for (i = 0; i < 16; i++) { 4647 __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16)); 4648 } 4649 __ sha512(); 4650 if (multi_block) { 4651 __ add(ofs, 128, ofs); 4652 __ add(buf, 128, buf); 4653 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop); 4654 __ mov(ofs, O0); // to be returned 4655 } 4656 4657 // store F0-F14 into state and return 4658 for (i = 0; i < 7; i++) { 4659 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8); 4660 } 4661 __ retl(); 4662 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38); 4663 4664 return start; 4665 } 4666 4667 /* Single and multi-block ghash operations */ 4668 address generate_ghash_processBlocks() { 4669 __ align(CodeEntryAlignment); 4670 Label L_ghash_loop, L_aligned, L_main; 4671 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4672 address start = __ pc(); 4673 4674 Register state = I0; 4675 Register subkeyH = I1; 4676 Register data = I2; 4677 Register len = I3; 4678 4679 __ save_frame(0); 4680 4681 __ ldx(state, 0, O0); 4682 __ ldx(state, 8, O1); 4683 4684 // Loop label for multiblock operations 4685 __ BIND(L_ghash_loop); 4686 4687 // Check if 'data' is unaligned 4688 __ andcc(data, 7, G1); 4689 __ br(Assembler::zero, false, Assembler::pt, L_aligned); 4690 __ delayed()->nop(); 4691 4692 Register left_shift = L1; 4693 Register right_shift = L2; 4694 Register data_ptr = L3; 4695 4696 // Get left and right shift values in bits 4697 __ sll(G1, LogBitsPerByte, left_shift); 4698 __ mov(64, right_shift); 4699 __ sub(right_shift, left_shift, right_shift); 4700 4701 // Align to read 'data' 4702 __ sub(data, G1, data_ptr); 4703 4704 // Load first 8 bytes of 'data' 4705 __ ldx(data_ptr, 0, O4); 4706 __ sllx(O4, left_shift, O4); 4707 __ ldx(data_ptr, 8, O5); 4708 __ srlx(O5, right_shift, G4); 4709 __ bset(G4, O4); 4710 4711 // Load second 8 bytes of 'data' 4712 __ sllx(O5, left_shift, O5); 4713 __ ldx(data_ptr, 16, G4); 4714 __ srlx(G4, right_shift, G4); 4715 __ ba(L_main); 4716 __ delayed()->bset(G4, O5); 4717 4718 // If 'data' is aligned, load normally 4719 __ BIND(L_aligned); 4720 __ ldx(data, 0, O4); 4721 __ ldx(data, 8, O5); 4722 4723 __ BIND(L_main); 4724 __ ldx(subkeyH, 0, O2); 4725 __ ldx(subkeyH, 8, O3); 4726 4727 __ xor3(O0, O4, O0); 4728 __ xor3(O1, O5, O1); 4729 4730 __ xmulxhi(O0, O3, G3); 4731 __ xmulx(O0, O2, 
O5); 4732 __ xmulxhi(O1, O2, G4); 4733 __ xmulxhi(O1, O3, G5); 4734 __ xmulx(O0, O3, G1); 4735 __ xmulx(O1, O3, G2); 4736 __ xmulx(O1, O2, O3); 4737 __ xmulxhi(O0, O2, O4); 4738 4739 __ mov(0xE1, O0); 4740 __ sllx(O0, 56, O0); 4741 4742 __ xor3(O5, G3, O5); 4743 __ xor3(O5, G4, O5); 4744 __ xor3(G5, G1, G1); 4745 __ xor3(G1, O3, G1); 4746 __ srlx(G2, 63, O1); 4747 __ srlx(G1, 63, G3); 4748 __ sllx(G2, 63, O3); 4749 __ sllx(G2, 58, O2); 4750 __ xor3(O3, O2, O2); 4751 4752 __ sllx(G1, 1, G1); 4753 __ or3(G1, O1, G1); 4754 4755 __ xor3(G1, O2, G1); 4756 4757 __ sllx(G2, 1, G2); 4758 4759 __ xmulxhi(G1, O0, O1); 4760 __ xmulx(G1, O0, O2); 4761 __ xmulxhi(G2, O0, O3); 4762 __ xmulx(G2, O0, G1); 4763 4764 __ xor3(O4, O1, O4); 4765 __ xor3(O5, O2, O5); 4766 __ xor3(O5, O3, O5); 4767 4768 __ sllx(O4, 1, O2); 4769 __ srlx(O5, 63, O3); 4770 4771 __ or3(O2, O3, O0); 4772 4773 __ sllx(O5, 1, O1); 4774 __ srlx(G1, 63, O2); 4775 __ or3(O1, O2, O1); 4776 __ xor3(O1, G3, O1); 4777 4778 __ deccc(len); 4779 __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop); 4780 __ delayed()->add(data, 16, data); 4781 4782 __ stx(O0, I0, 0); 4783 __ stx(O1, I0, 8); 4784 4785 __ ret(); 4786 __ delayed()->restore(); 4787 4788 return start; 4789 } 4790
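 // For reference: each iteration of L_ghash_loop above folds one 16-byte
 // block into the state (state ^= block, then state = state * subkeyH in
 // GF(2^128)). The xmulx/xmulxhi pairs build the 128x128-bit carry-less
 // product, and the 0xE1 << 56 constant reduces it modulo the GHASH
 // polynomial x^128 + x^7 + x^2 + x + 1. A compact model:
 //
 //   state ^= block;                       // the two xor3 after L_main
 //   state  = clmul_mod_ghash(state, H);   // xmulx* + shift/xor reduction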
  /**
   * Arguments:
   *
   * Inputs:
   *   O0   - int   crc
   *   O1   - byte* buf
   *   O2   - int   len
   *   O3   - int*  table
   *
   * Output:
   *   O0   - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "need CRC32C instruction");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
    address start = __ pc();

    const Register crc   = O0;  // crc
    const Register buf   = O1;  // source java byte array address
    const Register len   = O2;  // number of bytes
    const Register table = O3;  // byteTable

    __ kernel_crc32c(crc, buf, len, table);

    __ retl();
    __ delayed()->nop();

    return start;
  }

#define ADLER32_NUM_TEMPS 16

  /**
   * Arguments:
   *
   * Inputs:
   *   O0   - int   adler
   *   O1   - byte* buff
   *   O2   - int   len
   *
   * Output:
   *   O0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_cleanup_loop, L_cleanup_loop_check;
    Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
    Label L_nmax_check_done;

    // Aliases
    Register s1   = O0;
    Register s2   = O3;
    Register buff = O1;
    Register len  = O2;
    Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long NMAX = 0x15B0;

    // Zero-out the upper bits of len
    __ clruwu(len);

    // Create the mask 0xFFFF
    __ set64(0x00FFFF, O4, O5);  // O5 is the temp register

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srlx(O0, 16, O5);  // adler >> 16
    __ and3(O0, O4, s1);  // s1 = (adler & 0xFFFF)
    __ and3(O5, O4, s2);  // s2 = ((adler >> 16) & 0xFFFF)

    // The pipelined loop needs at least 16 elements for one iteration; below
    // that cutoff it is more efficient to skip straight to the cleanup loop.
    // Setup the constant for cutoff checking
    __ mov(15, O4);

    // Check if we are above the cutoff, if not go to the cleanup loop immediately
    __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);

    // Free up some registers for our use
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movxtod(temp[i], as_FloatRegister(2*i));
    }

    // Loop maintenance stuff is done at the end of the loop, so skip to there
    __ ba_short(L_main_loop_check);

    __ BIND(L_main_loop);

    // Prologue for inner loop
    __ ldub(buff, 0, L0);
    __ dec(O5);

    for (int i = 1; i < 8; i++) {
      __ ldub(buff, i, temp[i]);
    }

    __ inc(buff, 8);

    // The inner loop processes 16 elements at a time; it might never execute
    // if only 16 elements are to be processed by the outer loop
    __ ba_short(L_inner_loop_check);

    __ BIND(L_inner_loop);

    for (int i = 0; i < 8; i++) {
      __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
      __ add(s2, s1, s2);
    }

    // Original temp 0-7 used and new loads to temp 0-7 issued
    // temp 8-15 ready to be consumed
    __ add(s1, I0, s1);
    __ dec(O5);
    __ add(s2, s1, s2);
    __ add(s1, I1, s1);
    __ inc(buff, 16);
    __ add(s2, s1, s2);

    for (int i = 0; i < 6; i++) {
      __ add(s1, temp[10+i], s1);
      __ add(s2, s1, s2);
    }

    __ BIND(L_inner_loop_check);
    __ nop();
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);

    // Epilogue
    for (int i = 0; i < 4; i++) {
      __ ldub(buff, (2*i), temp[8+(2*i)]);
      __ add(s1, temp[i], s1);
      __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
      __ add(s2, s1, s2);
    }

    __ add(s1, temp[4], s1);
    __ inc(buff, 8);

    for (int i = 0; i < 11; i++) {
      __ add(s2, s1, s2);
      __ add(s1, temp[5+i], s1);
    }

    __ add(s2, s1, s2);

    // Take the mod for s1 and s2
    __ set64(0xFFF1, L0, L1);
    __ udivx(s1, L0, L1);
    __ udivx(s2, L0, L2);
    __ mulx(L0, L1, L1);
    __ mulx(L0, L2, L2);
    __ sub(s1, L1, s1);
    __ sub(s2, L2, s2);

    // Make sure there is something left to process
    __ BIND(L_main_loop_check);
    __ set64(NMAX, L0, L1);
    // k = len < NMAX ? len : NMAX
    __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
    __ andn(len, 0x0F, L0);  // only loop a multiple of 16 times
    __ BIND(L_nmax_check_done);
    __ mov(L0, O5);
    __ sub(len, L0, len);  // len -= k

    __ srlx(O5, 4, O5);    // multiples of 16
    __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);

    // Restore anything we used, take the mod one last time, combine and return
    // Restore any registers we saved
    for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
      __ movdtox(as_FloatRegister(2*i), temp[i]);
    }

    // There might be nothing left to process
    __ ba_short(L_cleanup_loop_check);

    __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4);  // load single byte from the buffer
    __ inc(buff);          // buff++
    __ add(s1, O4, s1);    // s1 += *buff++;
    __ dec(len);           // len--
    __ add(s1, s2, s2);    // s2 += s1;
    __ BIND(L_cleanup_loop_check);
    __ nop();
    __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);

    // Take the mod one last time
    __ set64(0xFFF1, O1, O2);
    __ udivx(s1, O1, O2);
    __ udivx(s2, O1, O5);
    __ mulx(O1, O2, O2);
    __ mulx(O1, O5, O5);
    __ sub(s1, O2, s1);
    __ sub(s2, O5, s2);

    // Combine lower bits and higher bits
    __ sllx(s2, 16, s2);  // s2 = s2 << 16
    __ or3(s1, s2, s1);   // adler = s2 | s1
    // Final return value is in O0
    __ retl();
    __ delayed()->nop();

    return start;
  }
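
  /* For reference, a plain C sketch of what the Adler-32 stub above computes:
   * the classic recurrence with deferred modulo (constant 65521 == 0xFFF1,
   * block bound NMAX == 5552). The function and variable names here are
   * illustrative only:
   *
   *   static uint32_t adler32(uint32_t adler, const uint8_t* buf, size_t len) {
   *     uint32_t s1 = adler & 0xFFFF, s2 = adler >> 16;
   *     while (len > 0) {
   *       size_t k = len < 5552 ? len : 5552;  // no 32-bit overflow before NMAX bytes
   *       len -= k;
   *       while (k-- > 0) { s1 += *buf++; s2 += s1; }
   *       s1 %= 65521; s2 %= 65521;            // the stub uses udivx/mulx/sub
   *     }
   *     return (s2 << 16) | s1;
   *   }
   */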
  /**
   * Arguments:
   *
   * Inputs:
   *   O0   - int   crc
   *   O1   - byte* buf
   *   O2   - int   len
   *   O3   - int*  table
   *
   * Output:
   *   O0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need VIS3 instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
    address start = __ pc();

    const Register crc   = O0;  // crc
    const Register buf   = O1;  // source java byte array address
    const Register len   = O2;  // length
    const Register table = O3;  // crc_table address (reuse register)

    __ kernel_crc32(crc, buf, len, table);

    __ retl();
    __ delayed()->nop();

    return start;
  }
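
  /* For reference, the table-driven form of the CRC-32 update that
   * kernel_crc32 accelerates (the zlib/java.util.zip.CRC32 reflected
   * polynomial). A minimal C sketch; whether the pre-/post-inversion of
   * 'crc' happens inside the kernel or in the caller is a detail of
   * kernel_crc32 - the sketch shows the usual zlib convention:
   *
   *   static uint32_t crc32_update(uint32_t crc, const uint8_t* buf,
   *                                size_t len, const uint32_t table[256]) {
   *     crc = ~crc;
   *     while (len-- > 0) {
   *       crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xFF];
   *     }
   *     return ~crc;
   *   }
   */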
  /**
   * Arguments:
   *
   * Inputs:
   *   I0   - int* x-addr
   *   I1   - int  x-len
   *   I2   - int* y-addr
   *   I3   - int  y-len
   *   I4   - int* z-addr (output vector)
   *   I5   - int  z-len
   */
  address generate_multiplyToLen() {
    assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
    address start = __ pc();

    __ save_frame(0);

    const Register xptr = I0;  // input address
    const Register xlen = I1;  // ...and length in 32b-words
    const Register yptr = I2;  //
    const Register ylen = I3;  //
    const Register zptr = I4;  // output address
    const Register zlen = I5;  // ...and length in 32b-words

    /* The minimal "limb" representation suggests that odd length vectors are as
     * likely as even length dittos. This in turn suggests that we need to cope
     * with odd/even length arrays and data not aligned properly for 64-bit read
     * and write operations. We thus use a number of different kernels:
     *
     *   if (is_even(x.len) && is_even(y.len))
     *     if (is_align64(x) && is_align64(y) && is_align64(z))
     *       if (x.len == y.len && 16 <= x.len && x.len <= 64)
     *         memv_mult_mpmul(...)
     *       else
     *         memv_mult_64x64(...)
     *     else
     *       memv_mult_64x64u(...)
     *   else
     *     memv_mult_32x32(...)
     *
     * Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
     * In case CBCOND instructions are supported, we will use 'cxbX'. If the
     * MPMUL instruction is supported, we will generate a kernel using 'mpmul'
     * (for vectors with proper characteristics).
     */
    const Register tmp0 = L0;
    const Register tmp1 = L1;

    Label L_mult_32x32;
    Label L_mult_64x64u;
    Label L_mult_64x64;
    Label L_exit;

    if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
    if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);

    if (UseMPMUL) {
      if_eq(xlen, ylen, false, L_mult_64x64);
      if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);

      // 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
      //    operating on equal length vectors of size [16..64].
      gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
    }

    // 2. Multiply naturally aligned 64-bit datums (64x64).
    __ bind(L_mult_64x64);
    gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

    // 3. Multiply unaligned 64-bit datums (64x64).
    __ bind(L_mult_64x64u);
    gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

    // 4. Multiply naturally aligned 32-bit datums (32x32).
    __ bind(L_mult_32x32);
    gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);

    __ bind(L_exit);
    __ ret();
    __ delayed()->restore();

    return start;
  }
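
  /* All four kernels generated below compute the same schoolbook product;
   * they differ only in limb width, alignment handling, and the use of
   * 'mpmul'. A rough C sketch of the common algorithm over 64-bit limbs
   * (most significant limb first, as in BigInteger); '__int128' and all
   * names here are purely for exposition:
   *
   *   typedef unsigned long long u64;
   *   typedef unsigned __int128  u128;
   *
   *   static void mult(const u64* x, int xn, const u64* y, int yn,
   *                    u64* z, int zn) {            // indices of last limbs
   *     u64 c = 0;
   *     for (int i = xn, k = zn; i >= 0; i--, k--) {    // first row: z = x * y[yn]
   *       u128 p = (u128)x[i] * y[yn] + c;
   *       z[k] = (u64)p;
   *       c    = (u64)(p >> 64);
   *     }
   *     z[zn - xn - 1] = c;
   *     for (int j = yn - 1; j >= 0; j--) {             // remaining rows accumulate
   *       c = 0;
   *       for (int i = xn, k = --zn; i >= 0; i--, k--) {
   *         u128 p = (u128)x[i] * y[j] + z[k] + c;
   *         z[k] = (u64)p;
   *         c    = (u64)(p >> 64);
   *       }
   *       z[zn - xn - 1] = c;
   *     }
   *   }
   */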
  // Additional help functions used by multiplyToLen generation.

  void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
  {
    __ or3(r1, r2, tmp);
    __ andcc(tmp, 0x1, tmp);
    __ br_icc_zero(iseven, Assembler::pn, L);
  }

  void if_all3_aligned(Register r1, Register r2, Register r3,
                       Register tmp, uint align, bool isalign, Label &L)
  {
    __ or3(r1, r2, tmp);
    __ or3(r3, tmp, tmp);
    __ andcc(tmp, (align - 1), tmp);
    __ br_icc_zero(isalign, Assembler::pn, L);
  }

  void if_eq(Register x, Register y, bool iseq, Label &L)
  {
    Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
    __ cmp_and_br_short(x, y, cf, Assembler::pt, L);
  }

  void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
  {
    assert(Assembler::is_simm13(lb), "Small ints only!");
    assert(Assembler::is_simm13(ub), "Small ints only!");
    // Compute (x - lb) * (ub - x) >= 0
    // NOTE: With the local use of this routine, we rely on small integers to
    //       guarantee that we do not overflow in the multiplication.
    __ add(G0, ub, t2);
    __ sub(x, lb, t1);
    __ sub(t2, x, t2);
    __ mulx(t1, t2, t1);
    Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
    __ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
  }
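
  /* The range check above uses the product-of-differences trick: provided
   * the multiplication cannot overflow,
   *
   *   lb <= x && x <= ub   <==>   (x - lb) * (ub - x) >= 0
   *
   * since exactly one factor is negative when x is out of range. In plain C
   * (illustrative only):
   *
   *   static bool in_range(long x, long lb, long ub) {
   *     return (x - lb) * (ub - x) >= 0;
   *   }
   */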
  void ldd_entry(Register base, Register offs, FloatRegister dest)
  {
    __ ldd(base, offs, dest);
    __ inc(offs, 8);
  }

  void ldx_entry(Register base, Register offs, Register dest)
  {
    __ ldx(base, offs, dest);
    __ inc(offs, 8);
  }

  void mpmul_entry(int m, Label &next)
  {
    __ mpmul(m);
    __ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
  }

  void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
  {
    __ bind(L);
    __ stx(r1, base, offs);
    __ inc(offs, 8);
    __ stx(r2, base, offs);
    __ inc(offs, 8);
  }

  void offs_entry(Label &Lbl0, Label &Lbl1)
  {
    assert(Lbl0.is_bound(), "must be");
    assert(Lbl1.is_bound(), "must be");

    int offset = Lbl0.loc_pos() - Lbl1.loc_pos();

    __ emit_data(offset);
  }

  /* Generate the actual multiplication kernels for BigInteger vectors:
   *
   *   1. gen_mult_mpmul(...)
   *
   *   2. gen_mult_64x64(...)
   *
   *   3. gen_mult_64x64_unaligned(...)
   *
   *   4. gen_mult_32x32(...)
   */
  void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
                      Label &L_exit)
  {
    const Register zero = G0;
    const Register gxp  = G1;  // Need to use global registers across RWs.
    const Register gyp  = G2;
    const Register gzp  = G3;
    const Register disp = G4;
    const Register offs = G5;

    __ mov(xptr, gxp);
    __ mov(yptr, gyp);
    __ mov(zptr, gzp);

    /* Compute jump vector entry:
     *
     *   1. mpmul input size (0..31) x 64b
     *   2. vector input size in 32b limbs (even number)
     *   3. branch entries in reverse order (31..0), using two
     *      instructions per entry (2 * 4 bytes).
     *
     *   displacement = byte_offset(bra_offset(len))
     *                = byte_offset((64 - len)/2)
     *                = 8 * (64 - len)/2
     *                = 4 * (64 - len)
     */
    Register temp = I5;  // Alright to use input regs. in first batch.

    __ sub(zero, len, temp);
    __ add(temp, 64, temp);
    __ sllx(temp, 2, disp);  // disp := (64 - len) << 2

    // Dispatch relative current PC, into instruction table below.
    __ rdpc(temp);
    __ add(temp, 16, temp);
    __ jmp(temp, disp);
    __ delayed()->clr(offs);

    ldd_entry(gxp, offs, F22);
    ldd_entry(gxp, offs, F20);
    ldd_entry(gxp, offs, F18);
    ldd_entry(gxp, offs, F16);
    ldd_entry(gxp, offs, F14);
    ldd_entry(gxp, offs, F12);
    ldd_entry(gxp, offs, F10);
    ldd_entry(gxp, offs, F8);
    ldd_entry(gxp, offs, F6);
    ldd_entry(gxp, offs, F4);
    ldx_entry(gxp, offs, I5);
    ldx_entry(gxp, offs, I4);
    ldx_entry(gxp, offs, I3);
    ldx_entry(gxp, offs, I2);
    ldx_entry(gxp, offs, I1);
    ldx_entry(gxp, offs, I0);
    ldx_entry(gxp, offs, L7);
    ldx_entry(gxp, offs, L6);
    ldx_entry(gxp, offs, L5);
    ldx_entry(gxp, offs, L4);
    ldx_entry(gxp, offs, L3);
    ldx_entry(gxp, offs, L2);
    ldx_entry(gxp, offs, L1);
    ldx_entry(gxp, offs, L0);
    ldd_entry(gxp, offs, F2);
    ldd_entry(gxp, offs, F0);
    ldx_entry(gxp, offs, O5);
    ldx_entry(gxp, offs, O4);
    ldx_entry(gxp, offs, O3);
    ldx_entry(gxp, offs, O2);
    ldx_entry(gxp, offs, O1);
    ldx_entry(gxp, offs, O0);

    __ save(SP, -176, SP);

    const Register addr = gxp;  // Alright to reuse 'gxp'.

    // Dispatch relative current PC, into instruction table below.
    __ rdpc(addr);
    __ add(addr, 16, addr);
    __ jmp(addr, disp);
    __ delayed()->clr(offs);

    ldd_entry(gyp, offs, F58);
    ldd_entry(gyp, offs, F56);
    ldd_entry(gyp, offs, F54);
    ldd_entry(gyp, offs, F52);
    ldd_entry(gyp, offs, F50);
    ldd_entry(gyp, offs, F48);
    ldd_entry(gyp, offs, F46);
    ldd_entry(gyp, offs, F44);
    ldd_entry(gyp, offs, F42);
    ldd_entry(gyp, offs, F40);
    ldd_entry(gyp, offs, F38);
    ldd_entry(gyp, offs, F36);
    ldd_entry(gyp, offs, F34);
    ldd_entry(gyp, offs, F32);
    ldd_entry(gyp, offs, F30);
    ldd_entry(gyp, offs, F28);
    ldd_entry(gyp, offs, F26);
    ldd_entry(gyp, offs, F24);
    ldx_entry(gyp, offs, O5);
    ldx_entry(gyp, offs, O4);
    ldx_entry(gyp, offs, O3);
    ldx_entry(gyp, offs, O2);
    ldx_entry(gyp, offs, O1);
    ldx_entry(gyp, offs, O0);
    ldx_entry(gyp, offs, L7);
    ldx_entry(gyp, offs, L6);
    ldx_entry(gyp, offs, L5);
    ldx_entry(gyp, offs, L4);
    ldx_entry(gyp, offs, L3);
    ldx_entry(gyp, offs, L2);
    ldx_entry(gyp, offs, L1);
    ldx_entry(gyp, offs, L0);

    __ save(SP, -176, SP);
    __ save(SP, -176, SP);
    __ save(SP, -176, SP);
    __ save(SP, -176, SP);
    __ save(SP, -176, SP);

    Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
    Label L_mpmul_restore_1, L_mpmul_restore_0;

    // Dispatch relative current PC, into instruction table below.
    __ rdpc(addr);
    __ add(addr, 16, addr);
    __ jmp(addr, disp);
    __ delayed()->clr(offs);
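
    // A worked example of the jump-table arithmetic used by the three
    // dispatch sequences above (this restates the displacement comment; the
    // numbers are easy to check by hand): each table entry is two
    // instructions, i.e. 8 bytes, and 'disp' == 4 * (64 - len). For len == 64
    // the displacement is 0 and all 32 entries execute; for len == 16,
    // disp == 4 * 48 == 192 bytes == 24 entries skipped, so only the last 8
    // entries run. The 'rdpc' + 'add(addr, 16, addr)' pair makes the base
    // address point just past the 4-instruction dispatch sequence itself.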
    mpmul_entry(31, L_mpmul_restore_0);
    mpmul_entry(30, L_mpmul_restore_0);
    mpmul_entry(29, L_mpmul_restore_0);
    mpmul_entry(28, L_mpmul_restore_0);
    mpmul_entry(27, L_mpmul_restore_1);
    mpmul_entry(26, L_mpmul_restore_1);
    mpmul_entry(25, L_mpmul_restore_1);
    mpmul_entry(24, L_mpmul_restore_1);
    mpmul_entry(23, L_mpmul_restore_1);
    mpmul_entry(22, L_mpmul_restore_1);
    mpmul_entry(21, L_mpmul_restore_1);
    mpmul_entry(20, L_mpmul_restore_2);
    mpmul_entry(19, L_mpmul_restore_2);
    mpmul_entry(18, L_mpmul_restore_2);
    mpmul_entry(17, L_mpmul_restore_2);
    mpmul_entry(16, L_mpmul_restore_2);
    mpmul_entry(15, L_mpmul_restore_2);
    mpmul_entry(14, L_mpmul_restore_2);
    mpmul_entry(13, L_mpmul_restore_3);
    mpmul_entry(12, L_mpmul_restore_3);
    mpmul_entry(11, L_mpmul_restore_3);
    mpmul_entry(10, L_mpmul_restore_3);
    mpmul_entry( 9, L_mpmul_restore_3);
    mpmul_entry( 8, L_mpmul_restore_3);
    mpmul_entry( 7, L_mpmul_restore_3);
    mpmul_entry( 6, L_mpmul_restore_4);
    mpmul_entry( 5, L_mpmul_restore_4);
    mpmul_entry( 4, L_mpmul_restore_4);
    mpmul_entry( 3, L_mpmul_restore_4);
    mpmul_entry( 2, L_mpmul_restore_4);
    mpmul_entry( 1, L_mpmul_restore_4);
    mpmul_entry( 0, L_mpmul_restore_4);

    Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
    Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
    Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
    Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;

    Label L_zst_base;  // Store sequence base address.
    __ bind(L_zst_base);

    stx_entry(L_z31, L7, L6, gzp, offs);
    stx_entry(L_z30, L5, L4, gzp, offs);
    stx_entry(L_z29, L3, L2, gzp, offs);
    stx_entry(L_z28, L1, L0, gzp, offs);
    __ restore();
    stx_entry(L_z27, O5, O4, gzp, offs);
    stx_entry(L_z26, O3, O2, gzp, offs);
    stx_entry(L_z25, O1, O0, gzp, offs);
    stx_entry(L_z24, L7, L6, gzp, offs);
    stx_entry(L_z23, L5, L4, gzp, offs);
    stx_entry(L_z22, L3, L2, gzp, offs);
    stx_entry(L_z21, L1, L0, gzp, offs);
    __ restore();
    stx_entry(L_z20, O5, O4, gzp, offs);
    stx_entry(L_z19, O3, O2, gzp, offs);
    stx_entry(L_z18, O1, O0, gzp, offs);
    stx_entry(L_z17, L7, L6, gzp, offs);
    stx_entry(L_z16, L5, L4, gzp, offs);
    stx_entry(L_z15, L3, L2, gzp, offs);
    stx_entry(L_z14, L1, L0, gzp, offs);
    __ restore();
    stx_entry(L_z13, O5, O4, gzp, offs);
    stx_entry(L_z12, O3, O2, gzp, offs);
    stx_entry(L_z11, O1, O0, gzp, offs);
    stx_entry(L_z10, L7, L6, gzp, offs);
    stx_entry(L_z09, L5, L4, gzp, offs);
    stx_entry(L_z08, L3, L2, gzp, offs);
    stx_entry(L_z07, L1, L0, gzp, offs);
    __ restore();
    stx_entry(L_z06, O5, O4, gzp, offs);
    stx_entry(L_z05, O3, O2, gzp, offs);
    stx_entry(L_z04, O1, O0, gzp, offs);
    stx_entry(L_z03, L7, L6, gzp, offs);
    stx_entry(L_z02, L5, L4, gzp, offs);
    stx_entry(L_z01, L3, L2, gzp, offs);
    stx_entry(L_z00, L1, L0, gzp, offs);

    __ restore();
    __ restore();
    // Exit out of 'mpmul' routine, back to multiplyToLen.
    __ ba_short(L_exit);

    Label L_zst_offs;
    __ bind(L_zst_offs);

    offs_entry(L_z31, L_zst_base);  // index 31: 2048x2048
    offs_entry(L_z30, L_zst_base);
    offs_entry(L_z29, L_zst_base);
    offs_entry(L_z28, L_zst_base);
    offs_entry(L_z27, L_zst_base);
    offs_entry(L_z26, L_zst_base);
    offs_entry(L_z25, L_zst_base);
    offs_entry(L_z24, L_zst_base);
    offs_entry(L_z23, L_zst_base);
    offs_entry(L_z22, L_zst_base);
    offs_entry(L_z21, L_zst_base);
    offs_entry(L_z20, L_zst_base);
    offs_entry(L_z19, L_zst_base);
    offs_entry(L_z18, L_zst_base);
    offs_entry(L_z17, L_zst_base);
    offs_entry(L_z16, L_zst_base);
    offs_entry(L_z15, L_zst_base);
    offs_entry(L_z14, L_zst_base);
    offs_entry(L_z13, L_zst_base);
    offs_entry(L_z12, L_zst_base);
    offs_entry(L_z11, L_zst_base);
    offs_entry(L_z10, L_zst_base);
    offs_entry(L_z09, L_zst_base);
    offs_entry(L_z08, L_zst_base);
    offs_entry(L_z07, L_zst_base);
    offs_entry(L_z06, L_zst_base);
    offs_entry(L_z05, L_zst_base);
    offs_entry(L_z04, L_zst_base);
    offs_entry(L_z03, L_zst_base);
    offs_entry(L_z02, L_zst_base);
    offs_entry(L_z01, L_zst_base);
    offs_entry(L_z00, L_zst_base);  // index 0: 64x64

    __ bind(L_mpmul_restore_4);
    __ restore();
    __ bind(L_mpmul_restore_3);
    __ restore();
    __ bind(L_mpmul_restore_2);
    __ restore();
    __ bind(L_mpmul_restore_1);
    __ restore();
    __ bind(L_mpmul_restore_0);

    // Dispatch via offset vector entry, into z-store sequence.
    Label L_zst_rdpc;
    __ bind(L_zst_rdpc);

    assert(L_zst_base.is_bound(), "must be");
    assert(L_zst_offs.is_bound(), "must be");
    assert(L_zst_rdpc.is_bound(), "must be");

    int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
    int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();

    temp = gyp;  // Alright to reuse 'gyp'.
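
    // The final dispatch below is indirect: the 4-byte entries emitted at
    // L_zst_offs hold the distance from L_zst_base to each L_zNN store
    // entry. Code-table entries are 8 bytes but offset entries are only 4,
    // so the same 'disp' is halved ('srlx(disp, 1, disp)') before indexing
    // the offset vector: disp/2 == 2 * (64 - len) bytes.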
    __ rdpc(addr);
    __ sub(addr, doffs, temp);
    __ srlx(disp, 1, disp);
    __ lduw(temp, disp, offs);
    __ sub(addr, dbase, temp);
    __ jmp(temp, offs);
    __ delayed()->clr(offs);
  }

  void gen_mult_64x64(Register xp, Register xn,
                      Register yp, Register yn,
                      Register zp, Register zn, Label &L_exit)
  {
    // Assuming that a stack frame has already been created, i.e. local and
    // output registers are available for immediate use.

    const Register ri = L0;  // Outer loop index, xv[i]
    const Register rj = L1;  // Inner loop index, yv[j]
    const Register rk = L2;  // Output loop index, zv[k]
    const Register rx = L4;  // x-vector datum [i]
    const Register ry = L5;  // y-vector datum [j]
    const Register rz = L6;  // z-vector datum [k]
    const Register rc = L7;  // carry over (to z-vector datum [k-1])

    const Register lop = O0;  // lo-64b product
    const Register hip = O1;  // hi-64b product

    const Register zero = G0;

    Label L_loop_i, L_exit_loop_i;
    Label L_loop_j;
    Label L_loop_i2, L_exit_loop_i2;

    __ srlx(xn, 1, xn);  // index for u32 to u64 ditto
    __ srlx(yn, 1, yn);  // index for u32 to u64 ditto
    __ srlx(zn, 1, zn);  // index for u32 to u64 ditto
    __ dec(xn);          // Adjust [0..(N/2)-1]
    __ dec(yn);
    __ dec(zn);
    __ clr(rc);          // u64 c = 0
    __ sllx(xn, 3, ri);  // int i = xn (byte offset i = 8*xn)
    __ sllx(yn, 3, rj);  // int j = yn (byte offset j = 8*yn)
    __ sllx(zn, 3, rk);  // int k = zn (byte offset k = 8*zn)
    __ ldx(yp, rj, ry);  // u64 y = yp[yn]

    // for (int i = xn; i >= 0; i--)
    __ bind(L_loop_i);

    __ cmp_and_br_short(ri, 0,  // i >= 0
                        Assembler::less, Assembler::pn, L_exit_loop_i);
    __ ldx(xp, ri, rx);       // x = xp[i]
    __ mulx(rx, ry, lop);     // lo-64b-part of result 64x64
    __ umulxhi(rx, ry, hip);  // hi-64b-part of result 64x64
    __ addcc(rc, lop, lop);   // Accumulate lower order bits (producing carry)
    __ addxc(hip, zero, rc);  // carry over to next datum [k-1]
    __ stx(lop, zp, rk);      // z[k] = lop
    __ dec(rk, 8);            // k--
    __ dec(ri, 8);            // i--
    __ ba_short(L_loop_i);

    __ bind(L_exit_loop_i);
    __ stx(rc, zp, rk);       // z[k] = c

    // for (int j = yn - 1; j >= 0; j--)
    __ sllx(yn, 3, rj);  // int j = yn - 1 (byte offset j = 8*yn)
    __ dec(rj, 8);

    __ bind(L_loop_j);

    __ cmp_and_br_short(rj, 0,  // j >= 0
                        Assembler::less, Assembler::pn, L_exit);
    __ clr(rc);          // u64 c = 0
    __ ldx(yp, rj, ry);  // u64 y = yp[j]

    // for (int i = xn, k = --zn; i >= 0; i--)
    __ dec(zn);          // --zn
    __ sllx(xn, 3, ri);  // int i = xn (byte offset i = 8*xn)
    __ sllx(zn, 3, rk);  // int k = zn (byte offset k = 8*zn)

    __ bind(L_loop_i2);

    __ cmp_and_br_short(ri, 0,  // i >= 0
                        Assembler::less, Assembler::pn, L_exit_loop_i2);
    __ ldx(xp, ri, rx);       // x = xp[i]
    __ ldx(zp, rk, rz);       // z = zp[k], accumulator
    __ mulx(rx, ry, lop);     // lo-64b-part of result 64x64
    __ umulxhi(rx, ry, hip);  // hi-64b-part of result 64x64
    __ addcc(rz, rc, rz);     // Accumulate lower order bits,
    __ addxc(hip, zero, rc);  // Accumulate higher order bits to carry
    __ addcc(rz, lop, rz);    // z += lo(p) + c
    __ addxc(rc, zero, rc);
    __ stx(rz, zp, rk);       // zp[k] = z
    __ dec(rk, 8);            // k--
    __ dec(ri, 8);            // i--
    __ ba_short(L_loop_i2);

    __ bind(L_exit_loop_i2);
    __ stx(rc, zp, rk);       // z[k] = c

    __ dec(rj, 8);            // j--
    __ ba_short(L_loop_j);
  }
  void gen_mult_64x64_unaligned(Register xp, Register xn,
                                Register yp, Register yn,
                                Register zp, Register zn, Label &L_exit)
  {
    // Assuming that a stack frame has already been created, i.e. local and
    // output registers are available for use.

    const Register xpc = L0;  // Outer loop cursor, xp[i]
    const Register ypc = L1;  // Inner loop cursor, yp[j]
    const Register zpc = L2;  // Output loop cursor, zp[k]
    const Register rx  = L4;  // x-vector datum [i]
    const Register ry  = L5;  // y-vector datum [j]
    const Register rz  = L6;  // z-vector datum [k]
    const Register rc  = L7;  // carry over (to z-vector datum [k-1])
    const Register rt  = O2;

    const Register lop = O0;  // lo-64b product
    const Register hip = O1;  // hi-64b product

    const Register zero = G0;

    Label L_loop_i, L_exit_loop_i;
    Label L_loop_j;
    Label L_loop_i2, L_exit_loop_i2;

    __ srlx(xn, 1, xn);   // index for u32 to u64 ditto
    __ srlx(yn, 1, yn);   // index for u32 to u64 ditto
    __ srlx(zn, 1, zn);   // index for u32 to u64 ditto
    __ dec(xn);           // Adjust [0..(N/2)-1]
    __ dec(yn);
    __ dec(zn);
    __ clr(rc);           // u64 c = 0
    __ sllx(xn, 3, xpc);  // u32* xpc = &xp[xn] (byte offset 8*xn)
    __ add(xp, xpc, xpc);
    __ sllx(yn, 3, ypc);  // u32* ypc = &yp[yn] (byte offset 8*yn)
    __ add(yp, ypc, ypc);
    __ sllx(zn, 3, zpc);  // u32* zpc = &zp[zn] (byte offset 8*zn)
    __ add(zp, zpc, zpc);
    __ lduw(ypc, 0, rt);  // u64 y = yp[yn]
    __ lduw(ypc, 4, ry);  // ...
    __ sllx(rt, 32, rt);
    __ or3(rt, ry, ry);

    // for (int i = xn; i >= 0; i--)
    __ bind(L_loop_i);

    __ cmp_and_brx_short(xpc, xp,  // i >= 0
                         Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i);
    __ lduw(xpc, 0, rt);      // u64 x = xp[i]
    __ lduw(xpc, 4, rx);      // ...
    __ sllx(rt, 32, rt);
    __ or3(rt, rx, rx);
    __ mulx(rx, ry, lop);     // lo-64b-part of result 64x64
    __ umulxhi(rx, ry, hip);  // hi-64b-part of result 64x64
    __ addcc(rc, lop, lop);   // Accumulate lower order bits (producing carry)
    __ addxc(hip, zero, rc);  // carry over to next datum [k-1]
    __ srlx(lop, 32, rt);
    __ stw(rt, zpc, 0);       // z[k] = lop
    __ stw(lop, zpc, 4);      // ...
    __ dec(zpc, 8);           // k-- (zpc--)
    __ dec(xpc, 8);           // i-- (xpc--)
    __ ba_short(L_loop_i);

    __ bind(L_exit_loop_i);
    __ srlx(rc, 32, rt);
    __ stw(rt, zpc, 0);       // z[k] = c
    __ stw(rc, zpc, 4);

    // for (int j = yn - 1; j >= 0; j--)
    __ sllx(yn, 3, ypc);  // u32* ypc = &yp[yn] (byte offset 8*yn)
    __ add(yp, ypc, ypc);
    __ dec(ypc, 8);       // yn - 1 (ypc--)

    __ bind(L_loop_j);

    __ cmp_and_brx_short(ypc, yp,  // j >= 0
                         Assembler::lessUnsigned, Assembler::pn, L_exit);
    __ clr(rc);           // u64 c = 0
    __ lduw(ypc, 0, rt);  // u64 y = yp[j] (= *ypc)
    __ lduw(ypc, 4, ry);  // ...
    __ sllx(rt, 32, rt);
    __ or3(rt, ry, ry);

    // for (int i = xn, k = --zn; i >= 0; i--)
    __ sllx(xn, 3, xpc);  // u32* xpc = &xp[xn] (byte offset 8*xn)
    __ add(xp, xpc, xpc);
    __ dec(zn);           // --zn
    __ sllx(zn, 3, zpc);  // u32* zpc = &zp[zn] (byte offset 8*zn)
    __ add(zp, zpc, zpc);

    __ bind(L_loop_i2);

    __ cmp_and_brx_short(xpc, xp,  // i >= 0
                         Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i2);
    __ lduw(xpc, 0, rt);  // u64 x = xp[i] (= *xpc)
    __ lduw(xpc, 4, rx);  // ...
    __ sllx(rt, 32, rt);
    __ or3(rt, rx, rx);

    __ lduw(zpc, 0, rt);  // u64 z = zp[k] (= *zpc)
    __ lduw(zpc, 4, rz);  // ...
    __ sllx(rt, 32, rt);
    __ or3(rt, rz, rz);

    __ mulx(rx, ry, lop);      // lo-64b-part of result 64x64
    __ umulxhi(rx, ry, hip);   // hi-64b-part of result 64x64
    __ addcc(rz, rc, rz);      // Accumulate lower order bits...
    __ addxc(hip, zero, rc);   // Accumulate higher order bits to carry
    __ addcc(rz, lop, rz);     // ... z += lo(p) + c
    __ addxccc(rc, zero, rc);
    __ srlx(rz, 32, rt);
    __ stw(rt, zpc, 0);        // zp[k] = z (*zpc = z)
    __ stw(rz, zpc, 4);
    __ dec(zpc, 8);            // k-- (zpc--)
    __ dec(xpc, 8);            // i-- (xpc--)
    __ ba_short(L_loop_i2);

    __ bind(L_exit_loop_i2);
    __ srlx(rc, 32, rt);
    __ stw(rt, zpc, 0);        // z[k] = c
    __ stw(rc, zpc, 4);
    __ dec(ypc, 8);            // j-- (ypc--)
    __ ba_short(L_loop_j);
  }
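
  /* The unaligned kernel above never issues 64-bit memory operations; each
   * u64 limb is assembled from (and split back into) two 32-bit accesses.
   * In C terms (illustrative only):
   *
   *   u64 v = ((u64)p[0] << 32) | p[1];   // lduw, lduw, sllx, or3
   *   p[0] = (uint32_t)(v >> 32);         // srlx, stw
   *   p[1] = (uint32_t)v;                 // stw
   *
   * where 'p' is a uint32_t* that is only guaranteed 32-bit alignment.
   */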
  void gen_mult_32x32(Register xp, Register xn,
                      Register yp, Register yn,
                      Register zp, Register zn, Label &L_exit)
  {
    // Assuming that a stack frame has already been created, i.e. local and
    // output registers are available for use.

    const Register ri = L0;  // Outer loop index, xv[i]
    const Register rj = L1;  // Inner loop index, yv[j]
    const Register rk = L2;  // Output loop index, zv[k]
    const Register rx = L4;  // x-vector datum [i]
    const Register ry = L5;  // y-vector datum [j]
    const Register rz = L6;  // z-vector datum [k]
    const Register rc = L7;  // carry over (to z-vector datum [k-1])

    const Register p64 = O0;  // 64b product
    const Register z65 = O1;  // carry+64b accumulator
    const Register c65 = O2;  // carry at bit 65
    const Register c33 = O2;  // carry at bit 33 (after shift)

    const Register zero = G0;

    Label L_loop_i, L_exit_loop_i;
    Label L_loop_j;
    Label L_loop_i2, L_exit_loop_i2;

    __ dec(xn);          // Adjust [0..N-1]
    __ dec(yn);
    __ dec(zn);
    __ clr(rc);          // u32 c = 0
    __ sllx(xn, 2, ri);  // int i = xn (byte offset i = 4*xn)
    __ sllx(yn, 2, rj);  // int j = yn (byte offset j = 4*yn)
    __ sllx(zn, 2, rk);  // int k = zn (byte offset k = 4*zn)
    __ lduw(yp, rj, ry); // u32 y = yp[yn]

    // for (int i = xn; i >= 0; i--)
    __ bind(L_loop_i);

    __ cmp_and_br_short(ri, 0,  // i >= 0
                        Assembler::less, Assembler::pn, L_exit_loop_i);
    __ lduw(xp, ri, rx);        // x = xp[i]
    __ mulx(rx, ry, p64);       // 64b result of 32x32
    __ addcc(rc, p64, z65);     // Accumulate to 65 bits (producing carry)
    __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
    __ sllx(c65, 32, c33);      // and shift into bit 33
    __ srlx(z65, 32, rc);       // rc = hi(z65)
    __ add(c33, rc, rc);        // carry = c33 + hi(z65), over to next datum [k-1]
    __ stw(z65, zp, rk);        // z[k] = lo(z65)
    __ dec(rk, 4);              // k--
    __ dec(ri, 4);              // i--
    __ ba_short(L_loop_i);

    __ bind(L_exit_loop_i);
    __ stw(rc, zp, rk);         // z[k] = c

    // for (int j = yn - 1; j >= 0; j--)
    __ sllx(yn, 2, rj);  // int j = yn - 1 (byte offset j = 4*yn)
    __ dec(rj, 4);

    __ bind(L_loop_j);

    __ cmp_and_br_short(rj, 0,  // j >= 0
                        Assembler::less, Assembler::pn, L_exit);
    __ clr(rc);           // u32 c = 0
    __ lduw(yp, rj, ry);  // u32 y = yp[j]

    // for (int i = xn, k = --zn; i >= 0; i--)
    __ dec(zn);          // --zn
    __ sllx(xn, 2, ri);  // int i = xn (byte offset i = 4*xn)
    __ sllx(zn, 2, rk);  // int k = zn (byte offset k = 4*zn)

    __ bind(L_loop_i2);

    __ cmp_and_br_short(ri, 0,  // i >= 0
                        Assembler::less, Assembler::pn, L_exit_loop_i2);
    __ lduw(xp, ri, rx);        // x = xp[i]
    __ lduw(zp, rk, rz);        // z = zp[k], accumulator
    __ mulx(rx, ry, p64);       // 64b result of 32x32
    __ add(rz, rc, rz);         // Accumulate lower order bits,
    __ addcc(rz, p64, z65);     // z += lo(p64) + c
    __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
    __ sllx(c65, 32, c33);      // and shift into bit 33
    __ srlx(z65, 32, rc);       // rc = hi(z65)
    __ add(c33, rc, rc);        // carry = c33 + hi(z65), over to next datum [k-1]
    __ stw(z65, zp, rk);        // zp[k] = lo(z65)
    __ dec(rk, 4);              // k--
    __ dec(ri, 4);              // i--
    __ ba_short(L_loop_i2);

    __ bind(L_exit_loop_i2);
    __ stw(rc, zp, rk);         // z[k] = c
    __ dec(rj, 4);              // j--
    __ ba_short(L_loop_j);
  }
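
  /* Carry handling in the 32x32 kernel above, restated in C (illustrative
   * only): the running sum is held in a 64-bit register, so a carry out of
   * bit 64 must be materialised and folded back into the next 32-bit carry:
   *
   *   u64 z65 = c + p64;                      // addcc, may carry out
   *   u64 c65 = (z65 < p64) ? 1 : 0;          // addxc(zero, zero, c65)
   *   u64 carry = (c65 << 32) + (z65 >> 32);  // sllx, srlx, add
   *   // z[k] = (uint32_t)z65;                // stw stores the low word
   */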

  void generate_initial() {
    // Generates the initial stubs and initializes their entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry  = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));

    if (UseCRC32Intrinsics) {
      // set table address before generating the stub, which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      // set table address before generating the stub, which uses it
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }
  }

  void generate_all() {
    // Generates all stubs and initializes the entry points

    // Generate partial_subtype_check first here since its code depends on
    // UseZeroBaseCompressedOops which is defined after heap initialization.
    StubRoutines::Sparc::_partial_subtype_check             = generate_partial_subtype_check();
    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    StubRoutines::_throw_AbstractMethodError_entry          = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry = generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry = generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry              = generate_verify_oop_subroutine();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Don't initialize the platform math functions since sparc
    // doesn't have intrinsics for these operations.

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

    // generate AES intrinsics code
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // generate SHA1/SHA256/SHA512 intrinsics code
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#ifdef COMPILER2
    // Intrinsics supported by C2 only:
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
#endif // COMPILER2
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
  }

 private:
  int _stub_count;
  void stub_prolog(StubCodeDesc* cdesc) {
# ifdef ASSERT
    // put extra information in the stub code, to make it more readable
    // Write the high part of the address
    // [RGV] Check if there is a dependency on the size of this prolog
    __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
    __ emit_data((intptr_t)cdesc,       relocInfo::none);
    __ emit_data(++_stub_count, relocInfo::none);
# endif
    align(true);
  }

  void align(bool at_header = false) {
    // %%%%% move this constant somewhere else
    // UltraSPARC cache line size is 8 instructions:
    const unsigned int icache_line_size = 32;
    const unsigned int icache_half_line_size = 16;

    if (at_header) {
      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
        __ emit_data(0, relocInfo::none);
      }
    } else {
      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
        __ nop();
      }
    }
  }

}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}