1 /* 2 * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "interpreter/interpreter.hpp" 31 #include "nativeInst_x86.hpp" 32 #include "oops/instanceOop.hpp" 33 #include "oops/method.hpp" 34 #include "oops/objArrayKlass.hpp" 35 #include "oops/oop.inline.hpp" 36 #include "prims/methodHandles.hpp" 37 #include "runtime/frame.inline.hpp" 38 #include "runtime/handles.inline.hpp" 39 #include "runtime/sharedRuntime.hpp" 40 #include "runtime/stubCodeGenerator.hpp" 41 #include "runtime/stubRoutines.hpp" 42 #include "runtime/thread.inline.hpp" 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 // Declaration and definition of StubGenerator (no .hpp file). 
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

// Shorthand used throughout this file: "__ foo(...)" emits through the
// current MacroAssembler (_masm); "a__" does the same after casting _masm
// to the plain Assembler type.
#define __ _masm->
#define a__ ((Assembler*)_masm)->

// BLOCK_COMMENT expands to nothing in PRODUCT builds; otherwise it forwards
// the string to block_comment() on the assembler.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Binds a label and, via BLOCK_COMMENT, records the label's *source name*
// (stringified with #label) followed by ':'. Renaming a label that is bound
// with BIND therefore changes the emitted block comment text.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
const int FPU_CNTRL_WRD_MASK = 0xFFFF;  // mask applied to the saved FPU control word before comparing

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

  // inc_counter_np(counter): in PRODUCT builds this is a no-op expression.
  // Otherwise it emits a block comment naming the counter (the argument is
  // stringified) followed by a 32-bit increment of the counter's memory
  // location. Note that the non-PRODUCT expansion already ends in ';'.
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emits "incrementl" on the absolute address of 'counter'.
  void inc_counter_np_(int& counter) {
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif //PRODUCT

  // Emits an increment of the SharedRuntime array-copy counter that matches
  // element type 't'. The whole body is compiled out in PRODUCT builds, so
  // nothing is emitted there. In other builds any type not listed in the
  // switch falls through to ShouldNotReachHere().
  void inc_copy_counter_np(BasicType t) {
#ifndef PRODUCT
    switch (t) {
    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
    }
    ShouldNotReachHere();
#endif //PRODUCT
  }

  //------------------------------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C
  //
  //    [ return_from_Java     ] <--- rsp
  //    [ argument word n      ]
  //      ...
// -N [ argument word 1      ]
  // -7 [ Possible padding for stack alignment ]
  // -6 [ Possible padding for stack alignment ]
  // -5 [ Possible padding for stack alignment ]
  // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx,           ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp,           ] <--- rbp,
  //  1 [ return address       ]
  //  2 [ ptr. to call wrapper ]
  //  3 [ result               ]
  //  4 [ result_type          ]
  //  5 [ method               ]
  //  6 [ entry_point          ]
  //  7 [ parameters           ]
  //  8 [ parameter_size       ]
  //  9 [ thread               ]


  // Emits the call stub described by the frame diagram above.
  //
  // Parameters:
  //   return_address - out parameter; set to the code position immediately
  //                    following the emitted "call rax" instruction.
  // Returns:
  //   the address of the first emitted instruction of the stub.
  //
  // Emitted code, in order: build the frame and reserve space for the
  // parameters plus four local words, save rdi/rsi/rbx, normalize MXCSR (when
  // UseSSE > 0) and the FPU control word, copy the parameters onto the stack,
  // call the entry point, store the result according to result_type, restore
  // the saved state and return.
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // stub code parameters / addresses
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    // Note: rsp_after_call and mxcsr_save name the same slot (rbp - 4 words).
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    sse_save =  UseSSE > 0;

    // stub code
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

    // save rdi, rsi, & rbx, according to C calling conventions
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);

    // provide initial value for required masks
    // (rbx was saved just above, so it is free to use as a scratch register)
    if (UseAVX > 2) {
      __ movl(rbx, 0xffff);
      __ kmovwl(k1, rbx);
    }

    // save and initialize %mxcsr
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      // reload MXCSR only when the masked value differs from the standard one
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }

    // make sure the control word is correct.
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));

#ifdef ASSERT
    // make sure we have no pending exceptions
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);

    // parameter passing loop

    Label loop;
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is rdx[rcx: N-1..0]
    // dest   is rsp[rbx: 0..N-1]

    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);

    __ BIND(loop);

    // get parameter
    // (the -wordSize displacement makes index rcx == 1 address the first element)
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);

    BLOCK_COMMENT("call_stub_return_address:");
    // report the position right after the call to the caller of this generator
    return_address = __ pc();

#ifdef COMPILER2
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        // free FPU stack registers 1..7; register 0 is handled below
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }

        // UseSSE <= 1 so double result should be left on TOS
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          // UseSSE == 0 so float result should be left on TOS
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        // no floating point result is expected on TOS: free register 0 as well
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(rdi, 0), rax);
    __ BIND(exit);

    // check that FPU stack is empty
    __ verify_FPU(0, "generate_call_stub");

    // pop parameters
    __ lea(rsp, rsp_after_call);

    // restore %mxcsr
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }

    // restore rdi, rsi and rbx,
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    // drop the four local words (same amount as locals_count_in_bytes)
    __ addptr(rsp, 4*wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    // (these out-of-line tails each jump back to 'exit' above)
    __ BIND(is_long);
    // low word comes from rax, high word from rdx
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);

    __ BIND(is_float);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));
    }
    __ jmp(exit);

    __ BIND(is_double);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));
    }
    __ jmp(exit);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java
code. 320 // The exception is caught and transformed into a pending exception stored in 321 // JavaThread that can be tested from within the VM. 322 // 323 // Note: Usually the parameters are removed by the callee. In case of an exception 324 // crossing an activation frame boundary, that is not the case if the callee 325 // is compiled code => need to setup the rsp. 326 // 327 // rax,: exception oop 328 329 address generate_catch_exception() { 330 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 331 const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()! 332 const Address thread (rbp, 9 * wordSize); // same as in generate_call_stub()! 333 address start = __ pc(); 334 335 // get thread directly 336 __ movptr(rcx, thread); 337 #ifdef ASSERT 338 // verify that threads correspond 339 { Label L; 340 __ get_thread(rbx); 341 __ cmpptr(rbx, rcx); 342 __ jcc(Assembler::equal, L); 343 __ stop("StubRoutines::catch_exception: threads must correspond"); 344 __ bind(L); 345 } 346 #endif 347 // set pending exception 348 __ verify_oop(rax); 349 __ movptr(Address(rcx, Thread::pending_exception_offset()), rax ); 350 __ lea(Address(rcx, Thread::exception_file_offset ()), 351 ExternalAddress((address)__FILE__)); 352 __ movl(Address(rcx, Thread::exception_line_offset ()), __LINE__ ); 353 // complete return to VM 354 assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before"); 355 __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address)); 356 357 return start; 358 } 359 360 361 //------------------------------------------------------------------------------------------------------------------------ 362 // Continuation point for runtime calls returning with a pending exception. 363 // The pending exception check happened in the runtime or native call stub. 364 // The pending exception in Thread is converted into a Java-level exception. 
365 // 366 // Contract with Java-level exception handlers: 367 // rax: exception 368 // rdx: throwing pc 369 // 370 // NOTE: At entry of this stub, exception-pc must be on stack !! 371 372 address generate_forward_exception() { 373 StubCodeMark mark(this, "StubRoutines", "forward exception"); 374 address start = __ pc(); 375 const Register thread = rcx; 376 377 // other registers used in this stub 378 const Register exception_oop = rax; 379 const Register handler_addr = rbx; 380 const Register exception_pc = rdx; 381 382 // Upon entry, the sp points to the return address returning into Java 383 // (interpreted or compiled) code; i.e., the return address becomes the 384 // throwing pc. 385 // 386 // Arguments pushed before the runtime call are still on the stack but 387 // the exception handler will reset the stack pointer -> ignore them. 388 // A potential result in registers can be ignored as well. 389 390 #ifdef ASSERT 391 // make sure this code is only executed if there is a pending exception 392 { Label L; 393 __ get_thread(thread); 394 __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 395 __ jcc(Assembler::notEqual, L); 396 __ stop("StubRoutines::forward exception: no pending exception (1)"); 397 __ bind(L); 398 } 399 #endif 400 401 // compute exception handler into rbx, 402 __ get_thread(thread); 403 __ movptr(exception_pc, Address(rsp, 0)); 404 BLOCK_COMMENT("call exception_handler_for_return_address"); 405 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc); 406 __ mov(handler_addr, rax); 407 408 // setup rax & rdx, remove return address & clear pending exception 409 __ get_thread(thread); 410 __ pop(exception_pc); 411 __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset())); 412 __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD); 413 414 #ifdef ASSERT 415 // make sure exception is set 416 { Label L; 417 __ 
testptr(exception_oop, exception_oop); 418 __ jcc(Assembler::notEqual, L); 419 __ stop("StubRoutines::forward exception: no pending exception (2)"); 420 __ bind(L); 421 } 422 #endif 423 424 // Verify that there is really a valid exception in RAX. 425 __ verify_oop(exception_oop); 426 427 // continue at exception handler (return address removed) 428 // rax: exception 429 // rbx: exception handler 430 // rdx: throwing pc 431 __ jmp(handler_addr); 432 433 return start; 434 } 435 436 437 //---------------------------------------------------------------------------------------------------- 438 // Support for int32_t Atomic::xchg(int32_t exchange_value, volatile int32_t* dest) 439 // 440 // xchg exists as far back as 8086, lock needed for MP only 441 // Stack layout immediately after call: 442 // 443 // 0 [ret addr ] <--- rsp 444 // 1 [ ex ] 445 // 2 [ dest ] 446 // 447 // Result: *dest <- ex, return (old *dest) 448 // 449 // Note: win32 does not currently use this code 450 451 address generate_atomic_xchg() { 452 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 453 address start = __ pc(); 454 455 __ push(rdx); 456 Address exchange(rsp, 2 * wordSize); 457 Address dest_addr(rsp, 3 * wordSize); 458 __ movl(rax, exchange); 459 __ movptr(rdx, dest_addr); 460 __ xchgl(rax, Address(rdx, 0)); 461 __ pop(rdx); 462 __ ret(0); 463 464 return start; 465 } 466 467 //---------------------------------------------------------------------------------------------------- 468 // Support for void verify_mxcsr() 469 // 470 // This routine is used with -Xcheck:jni to verify that native 471 // JNI code does not return to Java code without restoring the 472 // MXCSR register to our expected state. 
473 474 475 address generate_verify_mxcsr() { 476 StubCodeMark mark(this, "StubRoutines", "verify_mxcsr"); 477 address start = __ pc(); 478 479 const Address mxcsr_save(rsp, 0); 480 481 if (CheckJNICalls && UseSSE > 0 ) { 482 Label ok_ret; 483 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std()); 484 __ push(rax); 485 __ subptr(rsp, wordSize); // allocate a temp location 486 __ stmxcsr(mxcsr_save); 487 __ movl(rax, mxcsr_save); 488 __ andl(rax, MXCSR_MASK); 489 __ cmp32(rax, mxcsr_std); 490 __ jcc(Assembler::equal, ok_ret); 491 492 __ warn("MXCSR changed by native JNI code."); 493 494 __ ldmxcsr(mxcsr_std); 495 496 __ bind(ok_ret); 497 __ addptr(rsp, wordSize); 498 __ pop(rax); 499 } 500 501 __ ret(0); 502 503 return start; 504 } 505 506 507 //--------------------------------------------------------------------------- 508 // Support for void verify_fpu_cntrl_wrd() 509 // 510 // This routine is used with -Xcheck:jni to verify that native 511 // JNI code does not return to Java code without restoring the 512 // FP control word to our expected state. 
513 514 address generate_verify_fpu_cntrl_wrd() { 515 StubCodeMark mark(this, "StubRoutines", "verify_spcw"); 516 address start = __ pc(); 517 518 const Address fpu_cntrl_wrd_save(rsp, 0); 519 520 if (CheckJNICalls) { 521 Label ok_ret; 522 __ push(rax); 523 __ subptr(rsp, wordSize); // allocate a temp location 524 __ fnstcw(fpu_cntrl_wrd_save); 525 __ movl(rax, fpu_cntrl_wrd_save); 526 __ andl(rax, FPU_CNTRL_WRD_MASK); 527 ExternalAddress fpu_std(StubRoutines::addr_fpu_cntrl_wrd_std()); 528 __ cmp32(rax, fpu_std); 529 __ jcc(Assembler::equal, ok_ret); 530 531 __ warn("Floating point control word changed by native JNI code."); 532 533 __ fldcw(fpu_std); 534 535 __ bind(ok_ret); 536 __ addptr(rsp, wordSize); 537 __ pop(rax); 538 } 539 540 __ ret(0); 541 542 return start; 543 } 544 545 //--------------------------------------------------------------------------- 546 // Wrapper for slow-case handling of double-to-integer conversion 547 // d2i or f2i fast case failed either because it is nan or because 548 // of under/overflow. 
// Input:  FPU TOS: float value
  // Output: rax, (rdx): integer (long) result
  //
  // Parameters:
  //   t   - T_INT or T_LONG; only selects which block comment is emitted.
  //   fcn - address of the C function that is called with the double on the
  //         stack.
  // Returns the address of the first emitted instruction.

  address generate_d2i_wrapper(BasicType t, address fcn) {
    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
    address start = __ pc();

  // Capture info about frame layout
  // (word offsets from rsp once push_FPU_state() below has run; only
  //  saved_argument_off is referenced by the code in this function)
  enum layout { FPUState_off         = 0,
                rbp_off              = FPUStateSizeInWords,
                rdi_off,
                rsi_off,
                rcx_off,
                rbx_off,
                saved_argument_off,
                saved_argument_off2, // 2nd half of double
                framesize
  };

  assert(FPUStateSizeInWords == 27, "update stack layout");

    // Save outgoing argument to stack across push_FPU_state()
    __ subptr(rsp, wordSize * 2);
    __ fstp_d(Address(rsp, 0));

    // Save CPU & FPU state
    // (push order must mirror the enum above: rbx is pushed first, rbp last)
    __ push(rbx);
    __ push(rcx);
    __ push(rsi);
    __ push(rdi);
    __ push(rbp);
    __ push_FPU_state();

    // push_FPU_state() resets the FP top of stack
    // Load original double into FP top of stack
    __ fld_d(Address(rsp, saved_argument_off * wordSize));
    // Store double into stack as outgoing argument
    __ subptr(rsp, wordSize*2);
    __ fst_d(Address(rsp, 0));

    // Prepare FPU for doing math in C-land
    __ empty_FPU_stack();
    // Call the C code to massage the double.  Result in EAX
    // NOTE(review): the header above lists rdx as well for long results - confirm.
    if (t == T_INT)
      { BLOCK_COMMENT("SharedRuntime::d2i"); }
    else if (t == T_LONG)
      { BLOCK_COMMENT("SharedRuntime::d2l"); }
    __ call_VM_leaf( fcn, 2 );

    // Restore CPU & FPU state
    __ pop_FPU_state();
    __ pop(rbp);
    __ pop(rdi);
    __ pop(rsi);
    __ pop(rcx);
    __ pop(rbx);
    // release the two words that held the saved argument
    __ addptr(rsp, wordSize * 2);

    __ ret(0);

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Non-destructive plausibility checks for oops
  //
  // Emits the verify_oop stub and returns the address of its first
  // instruction. A null object is accepted; otherwise the object address is
  // checked against Universe::verify_oop_mask()/verify_oop_bits() and its
  // klass word must be non-zero. On failure MacroAssembler::debug32 is called.

  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    // Incoming arguments on stack after saving rax,:
    //
    // [tos    ]: saved rdx
    // [tos + 1]: saved EFLAGS
    // [tos + 2]: return address
    // [tos + 3]: char* error message
    // [tos + 4]: oop   object to verify
    // [tos + 5]: saved rax, - saved by caller and bashed

    Label exit, error;
    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(rdx);                                // save rdx
    // make sure object is 'reasonable'
    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok

    // Check if the oop is in the right area of memory
    const int oop_mask = Universe::verify_oop_mask();
    const int oop_bits = Universe::verify_oop_bits();
    __ mov(rdx, rax);
    __ andptr(rdx, oop_mask);
    __ cmpptr(rdx, oop_bits);
    // taken when the masked address bits differ from oop_bits
    __ jcc(Assembler::notZero, error);

    // make sure klass is 'reasonable', which is not zero.
    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // restore rdx
    __ popf();                                   // restore EFLAGS
    __ ret(3 * wordSize);                        // pop arguments

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // get saved rdx back
    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
    __ popa();
    __ ret(3 * wordSize);                        // pop arguments
    return start;
  }


  // Copy 64 bytes chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-bytes element count, negative
  //
  // NOTE(review): the loops below count qword_count *down* (subl 8 /
  // greaterEqual, then decrement / greater), and the visible caller passes
  // count >> (shift+1), so the value looks non-negative here - the word
  // "negative" above appears stale; confirm.
  //
  // The emitted code advances 'from' past the copied data and modifies
  // qword_count; 'to_from' is only read. Destinations are always addressed
  // as from + to_from.
  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( UseSSE >= 2, "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    if (UseAVX > 2) {
      // load 0xffff into k1, preserving rbx around its use as scratch
      __ push(rbx);
      __ movl(rbx, 0xffff);
      __ kmovwl(k1, rbx);
      __ pop(rbx);
    }
    // Copy 64-byte chunks
    // (enter at the loop test so that fewer than 8 qwords skip the loop body)
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores) {
      if (UseAVX > 2) {
        // one 64-byte move per iteration
        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        // two 32-byte moves per iteration
        __ vmovdqu(xmm0, Address(from,  0));
        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
        __ vmovdqu(xmm1, Address(from, 32));
        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
      } else {
        // four 16-byte moves per iteration
        __ movdqu(xmm0, Address(from, 0));
        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
        __ movdqu(xmm1, Address(from, 16));
        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
        __ movdqu(xmm2, Address(from, 32));
        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
        __ movdqu(xmm3, Address(from, 48));
        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
      }
    } else {
      // eight 8-byte moves per iteration
      __ movq(xmm0, Address(from, 0));
      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
      __ movq(xmm1, Address(from, 8));
      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
      __ movq(xmm2, Address(from, 16));
      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
      __ movq(xmm3, Address(from, 24));
      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
      __ movq(xmm4, Address(from, 32));
      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
      __ movq(xmm5, Address(from, 40));
      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
      __ movq(xmm6, Address(from, 48));
      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
      __ movq(xmm7, Address(from, 56));
      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
    }

    __ addl(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores && (UseAVX == 2)) {
      // clean upper bits of YMM registers
      __ vpxor(xmm0, xmm0);
      __ vpxor(xmm1, xmm1);
    }
    // undo the last subtraction; zero means no qwords remain
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(xmm0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), xmm0);
    __ addl(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
  }

  // Copy 64 bytes chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-bytes element count, negative
  //
  // NOTE(review): as in xmm_copy_forward, the loops count qword_count down
  // towards zero, so "negative" above appears stale; confirm.
  //
  // Same loop structure as xmm_copy_forward, using the eight MMX registers;
  // an emms instruction is emitted at the end.
  void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( VM_Version::supports_mmx(), "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
    // Copy 64-byte chunks
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);
    // loads and stores are interleaved; each store uses a register loaded earlier
    __ movq(mmx0, Address(from, 0));
    __ movq(mmx1, Address(from, 8));
    __ movq(mmx2, Address(from, 16));
    __ movq(Address(from, to_from, Address::times_1, 0), mmx0);
    __ movq(mmx3, Address(from, 24));
    __ movq(Address(from, to_from, Address::times_1, 8), mmx1);
    __ movq(mmx4, Address(from, 32));
    __ movq(Address(from, to_from, Address::times_1, 16), mmx2);
    __ movq(mmx5, Address(from, 40));
    __ movq(Address(from, to_from, Address::times_1, 24), mmx3);
    __ movq(mmx6, Address(from, 48));
    __ movq(Address(from, to_from, Address::times_1, 32), mmx4);
    __ movq(mmx7, Address(from, 56));
    __ movq(Address(from, to_from, Address::times_1, 40), mmx5);
    __ movq(Address(from, to_from, Address::times_1, 48), mmx6);
    __ movq(Address(from, to_from, Address::times_1, 56), mmx7);
    __ addptr(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
    // undo the last subtraction; zero means no qwords remain
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(mmx0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), mmx0);
    __ addptr(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
    __ emms();
  }

  // Emits a forward (disjoint) array copy stub and returns its start address.
  //
  // Parameters:
  //   t                  - element type; selects the alignment/tail handling
  //                        and the counter incremented by inc_copy_counter_np.
  //   aligned            - adds ARRAYCOPY_ALIGNED to the decorators and skips
  //                        the source alignment prologue.
  //   sf                 - element scale factor; 'shift' is derived from it.
  //   entry              - if non-NULL, receives the code position just after
  //                        the arguments have been loaded into registers.
  //   name               - stub name passed to StubCodeMark.
  //   dest_uninitialized - adds AS_DEST_NOT_INITIALIZED to the decorators.
  //
  // The stub reads its three arguments (from, to, count) from the stack and
  // returns 0 in rax.
  address generate_disjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;

    // The code below treats (1 << shift) elements as one 4-byte dword
    // (see the "copy tailing dword" test) - presumably Address::times_ptr
    // is the 4-byte scale on this 32-bit platform; confirm.
    int shift = Address::times_ptr - sf;

    const Register from     = rsi;  // source array address
    const Register to       = rdi;  // destination array address
    const Register count    = rcx;  // elements count
    const Register to_from  = to;   // (to - from)
    const Register saved_to = rdx;  // saved destination array address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    // Stack arguments: the leading 12 presumably covers the three words
    // pushed above (rbp via enter(), rsi, rdi) with the return address
    // next - confirm against the frame layout.
    __ movptr(from , Address(rsp, 12+ 4));
    __ movptr(to   , Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    if (t == T_OBJECT) {
      // object arrays with zero elements bypass the barrier prologue/epilogue
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
    }

    DecoratorSet decorators = ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = Universe::heap()->barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);

    __ subptr(to, from); // to --> to_from
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
      // align source address at 4 bytes address boundary
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays
        __ testl(from, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
        __ increment(from);
        __ decrement(count);
      __ BIND(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays
      __ testl(from, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      __ addptr(from, 2);
      __ subl(count, 1<<(shift-1));
    __ BIND(L_skip_align2);
    }
    if (!VM_Version::supports_mmx()) {
      // no MMX: copy with rep_mov(), keeping the element count in rax meanwhile
      __ mov(rax, count);      // save 'count'
      __ shrl(count, shift);   // bytes count
      __ addptr(to_from, from);// restore 'to'
      __ rep_mov();
      __ subptr(to_from, from);// restore 'to_from'
      __ mov(count, rax);      // restore 'count'
      __ jmpb(L_copy_2_bytes); // all dwords were copied
    } else {
      if (!UseUnalignedLoadStores) {
        // align to 8 bytes, we know we are 4 byte aligned to start
        __ testptr(from, 4);
        __ jccb(Assembler::zero, L_copy_64_bytes);
        __ movl(rax, Address(from, 0));
        __ movl(Address(from, to_from, Address::times_1, 0), rax);
        __ addptr(from, 4);
        __ subl(count, 1<<shift);
      }
    __ BIND(L_copy_64_bytes);
      __ mov(rax, count);
      __ shrl(rax, shift+1);  // 8 bytes chunk count
      //
      // Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
      //
      // (the XMM variant is used instead when UseXMMForArrayCopy is set)
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, rax);
      } else {
        mmx_copy_forward(from, to_from, rax);
      }
    }
    // copy tailing dword
  __ BIND(L_copy_4_bytes);
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rax, Address(from, 0));
    __ movl(Address(from, to_from, Address::times_1, 0), rax);
    if (t == T_BYTE || t == T_SHORT) {
      __ addptr(from, 4);
    __ BIND(L_copy_2_bytes);
      // copy tailing word
      __ testl(count, 1<<(shift-1));
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(from, 0));
      __ movw(Address(from, to_from, Address::times_1, 0), rax);
      if (t == T_BYTE) {
        __ addptr(from, 2);
      __ BIND(L_copy_byte);
        // copy tailing byte
        __ testl(count, 1);
        __ jccb(Assembler::zero, L_exit);
        __ movb(rax, Address(from, 0));
        __ movb(Address(from, to_from, Address::times_1, 0), rax);
      __ BIND(L_exit);
      } else {
        // no byte tail for this type: bind the label where it is declared unused
      __ BIND(L_copy_byte);
      }
    } else {
      // no word/byte tail for this type
    __ BIND(L_copy_2_bytes);
    }

    __ movl(count, Address(rsp, 12+12)); // reread 'count'
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);

    if (t == T_OBJECT) {
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  // Emits an array fill stub and returns its start address. The stub loads
  // its three stack arguments into to/value/count and delegates the actual
  // fill loop to MacroAssembler::generate_fill, with rax and xmm0 passed as
  // the remaining register arguments.
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    // NOTE(review): 'to' is labelled "source array address" below, but it is
    // passed as the array to be filled - the label looks stale; confirm.
    const Register to       = rdi;  // source array address
    const Register value    = rdx;  // value
    const Register count    = rsi;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(to   , Address(rsp, 12+ 4));
    __ movl(value, Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_conjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address nooverlap_target,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;

    int shift = Address::times_ptr - sf;

    const Register src   = rax;  // source array address
    const Register dst   = rdx;  // destination array address
    const Register from  = rsi;  // source array address
    const Register to    = rdi;  // destination array address
    const Register count = rcx;  // elements count
    const Register end   = rax;  // array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(src  , Address(rsp, 12+ 4));   // from
    __ movptr(dst  , Address(rsp, 12+ 8));   // to
    __ movl2ptr(count, Address(rsp, 12+12)); // count

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    // nooverlap_target expects arguments in rsi and rdi.
    __ mov(from, src);
    __ mov(to  , dst);

    // arrays overlap test: dispatch to disjoint stub if necessary.
    // The lea between the compare and the first jump does not change the
    // flags, so the first jump still tests (dst <= src).
    RuntimeAddress nooverlap(nooverlap_target);
    __ cmpptr(dst, src);
    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ cmpptr(dst, end);
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
    }

    DecoratorSet decorators = 0;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = Universe::heap()->barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);

    // copy from high to low
    __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
    __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
    if (t == T_BYTE || t == T_SHORT) {
      // Align the end of destination array at 4 bytes address boundary
      __ lea(end, Address(dst, count, sf, 0));
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays
        __ testl(end, 1);
        __ jccb(Assembler::zero, L_skip_align1);
        __ decrement(count);
        __ movb(rdx, Address(from, count, sf, 0));
        __ movb(Address(to, count, sf, 0), rdx);
      __ BIND(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays
      __ testl(end, 2);
      __ jccb(Assembler::zero, L_skip_align2);
      __ subptr(count, 1<<(shift-1));
      __ movw(rdx, Address(from, count, sf, 0));
      __ movw(Address(to, count, sf, 0), rdx);
    __ BIND(L_skip_align2);
      // The alignment steps may have consumed elements; re-check the size.
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes);
    }

    if (!VM_Version::supports_mmx()) {
      // No MMX: copy dwords with rep_mov between std() and cld(), starting
      // at the last dword of each array (displacement -4 from the end).
      __ std();
      __ mov(rax, count); // Save 'count'
      __ mov(rdx, to);    // Save 'to'
      __ lea(rsi, Address(from, count, sf, -4));
      __ lea(rdi, Address(to  , count, sf, -4));
      __ shrptr(count, shift); // dword count (count / elements-per-dword)
      __ rep_mov();
      __ cld();
      __ mov(count, rax); // restore 'count'
      __ andl(count, (1<<shift)-1);      // mask the number of rest elements
      __ movptr(from, Address(rsp, 12+4)); // reread 'from'
      __ mov(to, rdx);   // restore 'to'
      __ jmpb(L_copy_2_bytes); // all dword were copied
    } else {
      // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
      // NOTE(review): 'end' is only recomputed from 'dst' in the
      // T_BYTE/T_SHORT branch above; for other types it still holds
      // src + count * elem_size from the overlap test -- confirm this is
      // the intended address to test.
      __ testptr(end, 4);
      __ jccb(Assembler::zero, L_copy_8_bytes);
      __ subl(count, 1<<shift);
      __ movl(rdx, Address(from, count, sf, 0));
      __ movl(Address(to, count, sf, 0), rdx);
      __ jmpb(L_copy_8_bytes);

      __ align(OptoLoopAlignment);
      // Move 8 bytes
    __ BIND(L_copy_8_bytes_loop);
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), mmx0);
      }
    __ BIND(L_copy_8_bytes);
      // Loop entry: step count down by one 8-byte chunk; the loop body runs
      // while the result is still non-negative.
      __ subl(count, 2<<shift);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
      __ addl(count, 2<<shift); // undo the final subtraction
      if (!UseXMMForArrayCopy) {
        __ emms();
      }
    }
  __ BIND(L_copy_4_bytes);
    // copy prefix dword (4 bytes)
    __ testl(count, 1<<shift);
    __ jccb(Assembler::zero, L_copy_2_bytes);
    __ movl(rdx, Address(from, count, sf, -4));
    __ movl(Address(to, count, sf, -4), rdx);

    if (t == T_BYTE || t == T_SHORT) {
        __ subl(count, (1<<shift));
      __ BIND(L_copy_2_bytes);
        // copy prefix word (2 bytes)
        __ testl(count, 1<<(shift-1));
        __ jccb(Assembler::zero, L_copy_byte);
        __ movw(rdx, Address(from, count, sf, -2));
        __ movw(Address(to, count, sf, -2), rdx);
        if (t == T_BYTE) {
          __ subl(count, 1<<(shift-1));
        __ BIND(L_copy_byte);
          // copy prefix byte
          __ testl(count, 1);
          __ jccb(Assembler::zero, L_exit);
          __ movb(rdx, Address(from, 0));
          __ movb(Address(to, 0), rdx);
        __ BIND(L_exit);
        } else {
        __ BIND(L_copy_byte);
        }
    } else {
    __ BIND(L_copy_2_bytes);
    }

    __ movl2ptr(count, Address(rsp, 12+12)); // reread count
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);

    if (t == T_OBJECT) {
      // Zero-length oop copies jump here, bypassing the barrier epilogue.
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  // Generate a low-to-high copy stub for 8-byte (T_LONG) elements.
  //
  // Unlike the generators above, this stub does not push rsi/rdi: it only
  // executes enter() and reads its arguments at 8(rsp), 12(rsp), 16(rsp).
  //
  // Parameters:
  //   entry - receives the pc just after the arguments have been loaded
  //   name  - stub name recorded by the StubCodeMark
  //
  // Returns the start address of the generated code.
  address generate_disjoint_long_copy(address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register to_from    = rdx;  // (to - from)

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    // NOTE(review): 'entry' is dereferenced without a NULL check here, whereas
    // generate_conjoint_copy guards with (entry != NULL); callers must pass a
    // valid pointer.
    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
    BLOCK_COMMENT("Entry:");

    __ subptr(to, from); // to --> to_from
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);
      } else {
        mmx_copy_forward(from, to_from, count);
      }
    } else {
      // No MMX: move one 8-byte element per iteration with fild_d/fistp_d.
      // The loop is entered at its bottom test (L_copy_8_bytes).
      __ jmpb(L_copy_8_bytes);
      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      __ fild_d(Address(from, 0));
      __ fistp_d(Address(from, to_from, Address::times_1));
      __ addptr(from, 8);
    __ BIND(L_copy_8_bytes);
      __ decrement(count);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }

  // Generate a copy stub for 8-byte (T_LONG) elements that may overlap.
  //
  // The stub jumps to nooverlap_target when 'to' <= 'from' or
  // 'to' >= 'from' + count * 8; otherwise it copies elements from the
  // highest index down to index 0.
  //
  // Parameters:
  //   nooverlap_target - code jumped to when the ranges do not overlap
  //   entry            - receives the pc just after the arguments have been
  //                      loaded
  //   name             - stub name recorded by the StubCodeMark
  //
  // Returns the start address of the generated code.
  address generate_conjoint_long_copy(address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register end_from   = rax;  // source array end address (aliases 'from')

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    // NOTE(review): 'entry' is dereferenced without a NULL check here, whereas
    // generate_conjoint_copy guards with (entry != NULL); callers must pass a
    // valid pointer.
    *entry = __ pc(); // Entry point from generic arraycopy stub.
    BLOCK_COMMENT("Entry:");

    // arrays overlap test
    __ cmpptr(to, from);
    RuntimeAddress nooverlap(nooverlap_target);
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ lea(end_from, Address(from, count, Address::times_8, 0));
    __ cmpptr(to, end_from);
    // The lea above overwrote 'from' (same register as end_from), so it is
    // reloaded from the stack between the compare and the jump that uses
    // the compare's flags.
    __ movptr(from, Address(rsp, 8));  // from
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    // Enter the loop at its bottom test.
    __ jmpb(L_copy_8_bytes);

    __ align(OptoLoopAlignment);
  __ BIND(L_copy_8_bytes_loop);
    if (VM_Version::supports_mmx()) {
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), xmm0);
      } else {
        __ movq(mmx0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), mmx0);
      }
    } else {
      __ fild_d(Address(from, count, Address::times_8));
      __ fistp_d(Address(to, count, Address::times_8));
    }
  __ BIND(L_copy_8_bytes);
    // count is decremented before each copy, so indices run count-1 .. 0.
    __ decrement(count);
    __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

    // emms is only emitted when the mmx0 variant of the loop was generated.
    if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
      __ emms();
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  // Helper for generating a dynamic type check.
  // The sub_klass must be one of {rbx, rdx, rsi}.
  // The temp is killed.
  //
  // Parameters:
  //   sub_klass               - register holding the klass being tested
  //   super_check_offset_addr - memory operand holding the super check offset
  //   super_klass_addr        - memory operand holding the super klass
  //   temp                    - scratch register; must differ from sub_klass
  //   L_success, L_failure    - branch targets for the two outcomes; a NULL
  //                             pointer means "fall through" for that outcome
  void generate_type_check(Register sub_klass,
                           Address& super_check_offset_addr,
                           Address& super_klass_addr,
                           Register temp,
                           Label* L_success, Label* L_failure) {
    BLOCK_COMMENT("type_check:");

    Label L_fallthrough;
    // Conditional jump to *label_ptr, or to L_fallthrough when label_ptr is NULL.
#define LOCAL_JCC(assembler_con, label_ptr)                             \
    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/

    // The following is a strange variation of the fast path which requires
    // one less register, because needed values are on the argument stack.
    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
    //                                  L_success, L_failure, NULL);
    assert_different_registers(sub_klass, temp);

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // if the pointers are equal, we are done (e.g., String[] elements)
    __ cmpptr(sub_klass, super_klass_addr);
    LOCAL_JCC(Assembler::equal, L_success);

    // check the supertype display:
    __ movl2ptr(temp, super_check_offset_addr);
    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
    __ movptr(temp, super_check_addr); // load displayed supertype
    __ cmpptr(temp, super_klass_addr); // test the super type
    LOCAL_JCC(Assembler::equal, L_success);

    // if it was a primary super, we can just fail immediately
    __ cmpl(super_check_offset_addr, sc_offset);
    LOCAL_JCC(Assembler::notEqual, L_failure);

    // The repne_scan instruction uses fixed registers, which will get spilled.
    // We happen to know this works best when super_klass is in rax.
    Register super_klass = temp;
    __ movptr(super_klass, super_klass_addr);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
                                     L_success, L_failure);

    __ bind(L_fallthrough);

    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }

#undef LOCAL_JCC
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    4(rsp)   - source array address
  //    8(rsp)   - destination array address
  //   12(rsp)   - element count, can be zero
  //   16(rsp)   - size_t ckoff (super_check_offset)
  //   20(rsp)   - oop ckval (super_klass)
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1^K - failure, where K is partial transfer count
  //
  //  Parameters:
  //    name               - stub name recorded by the StubCodeMark
  //    entry              - if not NULL, receives the pc just after the
  //                         arguments have been loaded
  //    dest_uninitialized - adds AS_DEST_NOT_INITIALIZED to the decorators
  //
  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_load_element, L_store_element, L_do_card_marks, L_done;

    // register use:
    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
    //  rdi, rsi      -- element access (oop, klass)
    //  rbx,           -- temp
    const Register from       = rax;    // source array address
    const Register to         = rdx;    // destination array address
    const Register length     = rcx;    // elements count
    const Register elem       = rdi;    // each oop copied
    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
    const Register temp       = rbx;    // lone remaining temp

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ push(rsi);
    __ push(rdi);
    __ push(rbx);

    // Stack arguments, addressed relative to rsp after the three pushes above.
    Address   from_arg(rsp, 16+ 4);     // from
    Address     to_arg(rsp, 16+ 8);     // to
    Address length_arg(rsp, 16+12);     // elements count
    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
    Address  ckval_arg(rsp, 16+20);     // super_klass

    // Load up:
    __ movptr(from,     from_arg);
    __ movptr(to,         to_arg);
    __ movl2ptr(length, length_arg);

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    // Loop-invariant addresses.  They are exclusive end pointers.
    Address end_from_addr(from, length, Address::times_ptr, 0);
    Address   end_to_addr(to,   length, Address::times_ptr, 0);

    Register end_from = from;           // re-use
    Register end_to   = to;             // re-use
    Register count    = length;         // re-use

    // Loop-variant addresses.  They assume post-incremented count < 0.
    Address from_element_addr(end_from, count, Address::times_ptr, 0);
    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());

    DecoratorSet decorators = ARRAYCOPY_CHECKCAST;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }

    BasicType type = T_OBJECT;
    BarrierSetAssembler *bs = Universe::heap()->barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    // Copy from low to high addresses, indexed from the end of each array.
    __ lea(end_from, end_from_addr);
    __ lea(end_to,   end_to_addr);
    assert(length == count, "");        // else fix next line:
    __ negptr(count);                   // negate and test the length
    __ jccb(Assembler::notZero, L_load_element);

    // Empty array:  Nothing to do.
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (count = -count; count != 0; count++)
    // Base pointers (end_from, end_to) are biased by the original count
    // scaled with Address::times_ptr, i.e. they are the exclusive end
    // pointers set up above; the negative count indexes back from them.
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ movptr(to_element_addr, elem);     // store the oop
    __ increment(count);                // increment the count toward zero
    __ jccb(Assembler::zero, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ movptr(elem, from_element_addr);   // load the oop
    __ testptr(elem, elem);
    __ jccb(Assembler::zero, L_store_element); // null elements are stored without a type check

    // (Could do a trick here:  Remember last successful non-null
    // element stored and make a quick oop equality check on it.)

    __ movptr(elem_klass, elem_klass_addr); // query the object klass
    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
                        &L_store_element, NULL);
    // (On fall-through, we have failed the element type check.)
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
    // Emit GC store barriers for the oops we have copied (length_arg + count),
    // and report their number to the caller.
    assert_different_registers(to, count, rax);
    Label L_post_barrier;
    __ addl(count, length_arg);         // transfers = (length - remaining)
    __ movl2ptr(rax, count);            // save the value
    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
    // The jump below still tests the flags produced by the addl above.
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

    // Come here on success only.
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax);                // return 0 on success
    __ movl2ptr(count, length_arg);

    __ BIND(L_post_barrier);
    __ movptr(to, to_arg);                // reload
    // NOTE(review): 'from' is rax, which holds the return value on both paths
    // reaching this point; presumably the epilogue does not rely on the
    // source address -- confirm against the BarrierSetAssembler in use.
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);

    // Common exit point (success or failure).
    __ BIND(L_done);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    4(rsp)   - source array address
  //    8(rsp)   - destination array address
  //   12(rsp)   - byte count, can be zero
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1  -  need to call System.arraycopy
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {

    Label L_long_aligned, L_int_aligned, L_short_aligned;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // byte count on entry; scaled to an element
                                      // count before dispatching (except for bytes)

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    Address  from_arg(rsp, 12+ 4);      // from
    Address    to_arg(rsp, 12+ 8);      // to
    Address count_arg(rsp, 12+12);      // byte count

    // Load up:
    __ movptr(from ,  from_arg);
    __ movptr(to   ,    to_arg);
    __ movl2ptr(count, count_arg);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR both addresses and the byte count together so that a single test
    // of the low bits checks the alignment of all three at once.
    const Register bits = rsi;
    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, count);

    __ testl(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);

    __ testl(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);

    __ testl(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

    // Reached by fall-through only; no jump in this function targets this label.
    __ BIND(L_short_aligned);
    __ shrptr(count, LogBytesPerShort); // size => short_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_int_aligned);
    __ shrptr(count, LogBytesPerInt); // size => int_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_long_aligned);
    __ shrptr(count, LogBytesPerLong); // size => qword_count
    __ movl(count_arg, count);          // update 'count'
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(long_copy_entry));

    return start;
  }


  // Perform range checks on the proposed arraycopy.
  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
  //
  // Parameters:
  //   src, dst         - registers holding the source / destination array oops
  //   src_pos, dst_pos - registers holding the start positions; on return they
  //                      hold the respective end positions (pos + length)
  //   length           - memory operand holding the element count
  //   L_failed         - target jumped to when either end position is above
  //                      (unsigned compare) the corresponding array length
  void arraycopy_range_checks(Register src,
                              Register src_pos,
                              Register dst,
                              Register dst_pos,
                              Address& length,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");
    const Register src_end = src_pos;   // source array end position
    const Register dst_end = dst_pos;   // destination array end position
    __ addl(src_end, length); // src_pos + length
    __ addl(dst_end, length); // dst_pos + length

    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }


  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //     4(rsp)    -  src oop
  //     8(rsp)    -  src_pos
  //    12(rsp)    -  dst oop
  //    16(rsp)    -  dst_pos
  //    20(rsp)    -  element count
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_failed_0, L_objArray;

    // Pad with nops so that the 5-byte jmp emitted at L_failed_0 below ends
    // exactly on a CodeEntryAlignment boundary, which is where the stub's
    // entry point ('start') is then placed.
    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
    StubCodeMark mark(this, "StubRoutines", name);

    // Short-hop target to L_failed.  Makes for denser prologue code.
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    // Input values
    Address SRC     (rsp, 12+ 4);
    Address SRC_POS (rsp, 12+ 8);
    Address DST     (rsp, 12+12);
    Address DST_POS (rsp, 12+16);
    Address LENGTH  (rsp, 12+20);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    const Register src       = rax;       // source array oop
    const Register src_pos   = rsi;
    const Register dst       = rdx;       // destination array oop
    const Register dst_pos   = rdi;
    const Register length    = rcx;       // transfer count

    //  if (src == NULL) return -1;
    __ movptr(src, SRC);      // src oop
    __ testptr(src, src);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (src_pos < 0) return -1;
    __ movl2ptr(src_pos, SRC_POS);  // src_pos
    __ testl(src_pos, src_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (dst == NULL) return -1;
    __ movptr(dst, DST);      // dst oop
    __ testptr(dst, dst);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (dst_pos < 0) return -1;
    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
    __ testl(dst_pos, dst_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (length < 0) return -1;
    __ movl2ptr(length, LENGTH);   // length
    __ testl(length, length);
    __ jccb(Assembler::negative, L_failed_0);

    // Load src->klass().  No "return -1" is generated for a NULL klass: the
    // klasses are only checked for NULL by the ASSERT-only code below.
    // From here on rcx holds the klass, not the length.
    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
    const Register rcx_src_klass = rcx;    // array klass
    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));

#ifdef ASSERT
    //  assert(src->klass() != NULL);
    BLOCK_COMMENT("assert klasses not null");
    { Label L1, L2;
      __ testptr(rcx_src_klass, rcx_src_klass);
      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
      __ jccb(Assembler::equal, L1);      // this would be broken also
      BLOCK_COMMENT("assert done");
    }
#endif //ASSERT

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    int lh_offset = in_bytes(Klass::layout_helper_offset());
    Address src_klass_lh_addr(rcx_src_klass, lh_offset);

    // Handle objArrays completely differently...
    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(src_klass_lh_addr, objArray_lh);
    __ jcc(Assembler::equal, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ cmpptr(rcx_src_klass, dst_klass_addr);
    __ jccb(Assembler::notEqual, L_failed_0);

    // The klass pointer in rcx is overwritten by its layout helper here.
    const Register rcx_lh = rcx;  // layout helper
    assert(rcx_lh == rcx_src_klass, "known alias");
    __ movl(rcx_lh, src_klass_lh_addr);

    //  if (!src->is_Array()) return -1;
    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    { Label L;
      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L); // signed cmp
      __ stop("must be a primitive array");
      __ bind(L);
    }
#endif

    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //
    const Register rsi_offset = rsi; // array offset
    const Register src_array  = src; // src array offset
    const Register dst_array  = dst; // dst array offset
    const Register rdi_elsize = rdi; // log2 element size

    __ mov(rsi_offset, rcx_lh);
    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
    __ addptr(src_array, rsi_offset);  // src array offset
    __ addptr(dst_array, rsi_offset);  // dst array offset
    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize

    // next registers should be set before the jump to corresponding stub
    const Register from       = src; // source array address
    const Register to         = dst; // destination array address
    const Register count      = rcx; // elements count
    // some of them should be duplicated on stack
    // NOTE(review): these three macros are not #undef'ed anywhere in the
    // remainder of this function.
#define FROM   Address(rsp, 12+ 4)
#define TO     Address(rsp, 12+ 8)   // Only for oop arraycopy (stored at L_plain_copy)
#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy

    BLOCK_COMMENT("scale indexes to element size");
    // The positions are reloaded from the stack because the range checks
    // above smashed src_pos/dst_pos.  rcx still holds log2 elsize here.
    __ movl2ptr(rsi, SRC_POS);  // src_pos
    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
    assert(src_array == from, "");
    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
    __ movl2ptr(rdi, DST_POS);  // dst_pos
    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
    assert(dst_array == to, "");
    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
    __ movptr(FROM, from);      // src_addr
    __ mov(rdi_elsize, rcx_lh); // log2 elsize
    __ movl2ptr(count, LENGTH); // elements count

    BLOCK_COMMENT("choose copy loop based on element size");
    __ cmpl(rdi_elsize, 0);

    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerShort);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerInt);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
    // Anything else is treated as a long copy; only ASSERT builds verify that.
#ifdef ASSERT
    __ cmpl(rdi_elsize, LogBytesPerLong);
    __ jccb(Assembler::notEqual, L_failed);
#endif
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1787 __ pop(rsi); 1788 __ jump(RuntimeAddress(entry_jlong_arraycopy)); 1789 1790 __ BIND(L_failed); 1791 __ xorptr(rax, rax); 1792 __ notptr(rax); // return -1 1793 __ pop(rdi); 1794 __ pop(rsi); 1795 __ leave(); // required for proper stackwalking of RuntimeStub frame 1796 __ ret(0); 1797 1798 // ObjArrayKlass 1799 __ BIND(L_objArray); 1800 // live at this point: rcx_src_klass, src[_pos], dst[_pos] 1801 1802 Label L_plain_copy, L_checkcast_copy; 1803 // test array classes for subtyping 1804 __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality 1805 __ jccb(Assembler::notEqual, L_checkcast_copy); 1806 1807 // Identically typed arrays can be copied without element-wise checks. 1808 assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass); 1809 arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed); 1810 1811 __ BIND(L_plain_copy); 1812 __ movl2ptr(count, LENGTH); // elements count 1813 __ movl2ptr(src_pos, SRC_POS); // reload src_pos 1814 __ lea(from, Address(src, src_pos, Address::times_ptr, 1815 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 1816 __ movl2ptr(dst_pos, DST_POS); // reload dst_pos 1817 __ lea(to, Address(dst, dst_pos, Address::times_ptr, 1818 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 1819 __ movptr(FROM, from); // src_addr 1820 __ movptr(TO, to); // dst_addr 1821 __ movl(COUNT, count); // count 1822 __ jump(RuntimeAddress(entry_oop_arraycopy)); 1823 1824 __ BIND(L_checkcast_copy); 1825 // live at this point: rcx_src_klass, dst[_pos], src[_pos] 1826 { 1827 // Handy offsets: 1828 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 1829 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1830 1831 Register rsi_dst_klass = rsi; 1832 Register rdi_temp = rdi; 1833 assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos"); 1834 assert(rdi_temp == dst_pos, "expected alias w/ dst_pos"); 1835 Address dst_klass_lh_addr(rsi_dst_klass, lh_offset); 1836 1837 
// Before looking at dst.length, make sure dst is also an objArray. 1838 __ movptr(rsi_dst_klass, dst_klass_addr); 1839 __ cmpl(dst_klass_lh_addr, objArray_lh); 1840 __ jccb(Assembler::notEqual, L_failed); 1841 1842 // It is safe to examine both src.length and dst.length. 1843 __ movl2ptr(src_pos, SRC_POS); // reload rsi 1844 arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed); 1845 // (Now src_pos and dst_pos are killed, but not src and dst.) 1846 1847 // We'll need this temp (don't forget to pop it after the type check). 1848 __ push(rbx); 1849 Register rbx_src_klass = rbx; 1850 1851 __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx 1852 __ movptr(rsi_dst_klass, dst_klass_addr); 1853 Address super_check_offset_addr(rsi_dst_klass, sco_offset); 1854 Label L_fail_array_check; 1855 generate_type_check(rbx_src_klass, 1856 super_check_offset_addr, dst_klass_addr, 1857 rdi_temp, NULL, &L_fail_array_check); 1858 // (On fall-through, we have passed the array type check.) 1859 __ pop(rbx); 1860 __ jmp(L_plain_copy); 1861 1862 __ BIND(L_fail_array_check); 1863 // Reshuffle arguments so we can call checkcast_arraycopy: 1864 1865 // match initial saves for checkcast_arraycopy 1866 // push(rsi); // already done; see above 1867 // push(rdi); // already done; see above 1868 // push(rbx); // already done; see above 1869 1870 // Marshal outgoing arguments now, freeing registers. 1871 Address from_arg(rsp, 16+ 4); // from 1872 Address to_arg(rsp, 16+ 8); // to 1873 Address length_arg(rsp, 16+12); // elements count 1874 Address ckoff_arg(rsp, 16+16); // super_check_offset 1875 Address ckval_arg(rsp, 16+20); // super_klass 1876 1877 Address SRC_POS_arg(rsp, 16+ 8); 1878 Address DST_POS_arg(rsp, 16+16); 1879 Address LENGTH_arg(rsp, 16+20); 1880 // push rbx, changed the incoming offsets (why not just use rbp,??) 
// assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");

      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
      __ movl2ptr(length, LENGTH_arg);   // reload elements count
      __ movl2ptr(src_pos, SRC_POS_arg); // reload src_pos
      __ movl2ptr(dst_pos, DST_POS_arg); // reload dst_pos

      __ movptr(ckval_arg, rbx);          // destination element type
      __ movl(rbx, Address(rbx, sco_offset));
      __ movl(ckoff_arg, rbx);          // corresponding class check offset

      __ movl(length_arg, length);      // outgoing length argument

      __ lea(from, Address(src, src_pos, Address::times_ptr,
                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(from_arg, from);

      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(to_arg, to);
      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
    }

    return start;
  }

  // Generates the arraycopy and fill stubs and stores each returned address
  // in the corresponding StubRoutines field.
  //
  // Every generate_disjoint_*copy call is handed &entry, and the conjoint
  // generator that follows it is handed that `entry` value.  The conjoint
  // byte/short/int/oop/long generators additionally fill in an
  // entry_*_arraycopy address.  Those addresses, plus the checkcast entry,
  // are consumed by generate_unsafe_copy and generate_generic_copy, which is
  // why these two stubs are generated last.
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    // byte copies: "arrayof" (second argument true) and plain (false) variants.
    // NULL is passed where the other calls pass &entry_*_arraycopy: the inner
    // entry of the arrayof variant is not used further down.
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE, true, Address::times_1, &entry,
                               "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE, true, Address::times_1, entry,
                               NULL, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
                               "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE, false, Address::times_1, entry,
                               &entry_jbyte_arraycopy, "jbyte_arraycopy");

    // short copies: same four variants as for bytes, scaled by 2.
    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT, true, Address::times_2, &entry,
                               "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT, true, Address::times_2, entry,
                               NULL, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
                               "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT, false, Address::times_2, entry,
                               &entry_jshort_arraycopy, "jshort_arraycopy");

    // Next arrays are always aligned on 4 bytes at least.
    // (Hence only one variant each is generated, with the second argument
    // true; the arrayof_ fields are aliased to these stubs further down.)
    StubRoutines::_jint_disjoint_arraycopy =
        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
                               "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy =
        generate_conjoint_copy(T_INT, true, Address::times_4, entry,
                               &entry_jint_arraycopy, "jint_arraycopy");

    StubRoutines::_oop_disjoint_arraycopy =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
                               &entry_oop_arraycopy, "oop_arraycopy");

    // Same oop copies again, generated with dest_uninitialized == true.
    StubRoutines::_oop_disjoint_arraycopy_uninit =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy_uninit",
                               /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
                               NULL, "oop_arraycopy_uninit",
                               /*dest_uninitialized*/true);

    // long copies use their own dedicated generators.
    StubRoutines::_jlong_disjoint_arraycopy =
        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy =
        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
                                    "jlong_arraycopy");

    // fill stubs: plain (false) and arrayof (true) variants per element type.
    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

    // The arrayof_ int/oop/long entries simply alias the stubs generated above.
    StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;

    StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;

    StubRoutines::_checkcast_arraycopy =
        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit =
        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);

    // These two dispatching stubs take the inner entry addresses collected above.
    StubRoutines::_unsafe_arraycopy =
        generate_unsafe_copy("unsafe_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy =
        generate_generic_copy("generic_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_oop_arraycopy,
                               entry_jlong_arraycopy,
                               entry_checkcast_arraycopy);
  }

  // AES intrinsic stubs
  // AESBlockSize is used below as the per-block byte stride of the source
  // and destination positions.
  enum {AESBlockSize = 16};

  // Emits a 16-byte-aligned constant made of four data words and returns its
  // address.  The stubs below load such a constant through
  // StubRoutines::x86::key_shuffle_mask_addr() and use it as a pshufb control
  // (presumably the caller publishes the address returned here under that
  // accessor - confirm at the call site).
  // With each word stored little-endian, the control bytes read
  // 3,2,1,0, 7,6,5,4, ... i.e. the byte order is reversed within every
  // 32-bit lane.
  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data(0x00010203, relocInfo::none, 0 );
    __ emit_data(0x04050607, relocInfo::none, 0 );
    __ emit_data(0x08090a0b, relocInfo::none, 0 );
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
    return start;
  }

  // Emits a second 16-byte-aligned shuffle constant and returns its address.
  // It holds the same four words as the key shuffle mask but in the opposite
  // order, so as a pshufb control it reads 15,14,...,1,0: a reversal of all
  // 16 bytes.  No user of this mask is visible in this part of the file.
  address generate_counter_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
    address start = __ pc();
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
    __ emit_data(0x08090a0b, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    return start;
  }

  // Utility routine for loading a 128-bit key word in little endian format
  // can optionally specify that the shuffle mask is already in an xmmregister
  //
  //   xmmdst        - receives the 16 bytes at [key + offset], byte-shuffled
  //   key           - base register of the expanded key
  //   offset        - byte offset of the round key to load
  //   xmm_shuf_mask - register already holding the key shuffle mask; when
  //                   omitted (NULL) the mask is used directly from memory
  //
  // An unaligned load (movdqu) is used for the key itself.
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }

  // aesenc using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  // xmmtmp is clobbered: it receives the loaded round key.
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }

  // aesdec using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  // xmmtmp is clobbered: it receives the loaded round key.
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }

  // Utility routine for increase 128bit counter (iv in CTR mode)
  //  XMM_128bit,  D3, D2, D1, D0
  //
  //   reg        - general purpose scratch register (clobbered)
  //   xmmdst     - the 128-bit counter, updated in place
  //   inc_delta  - amount added to the lowest dword D0
  //   next_block - label bound at the end of the emitted sequence; since it
  //                is bound here, every call needs its own unbound label
  //
  // The carry out of each 32-bit add is propagated into the next higher
  // dword.  Each jcc tests the carry produced by the preceding addl even
  // though a pinsrd sits in between; this relies on pinsrd not modifying
  // the flags.
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrd(reg, xmmdst, 0x0);
    __ addl(reg, inc_delta);
    __ pinsrd(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x01);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x02);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x03);

    __ BIND(next_block);          // next instruction
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Generates the stub that encrypts one 16-byte block.  In this stub the
  // three inputs listed above are read from the stack (from_param,
  // to_param and key_param, addressed off rbp after enter()).
  // The generated code returns 0 in rax.
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    // from and to deliberately share rdx: the input block is read into
    // xmm_result before `to` is loaded over `from`.
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter();   // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      // rdx is only a scratch register here; it is loaded with `from` below.
      __ movl(rdx, 0xffff);
      __ kmovdl(k1, rdx);
    }

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    // `key` addresses the first int element, so the array length field is
    // read at a negative offset from it.
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
    __ movptr(to, to_param);                  // rdx now holds the destination

    // For encryption, the java expanded key ordering is just what we need

    // initial round-key xor with the key at offset 0x00
    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    // rounds with the keys at 0x10..0x40, loaded four at a time
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    // rounds with the keys at 0x50..0x80
    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    // At each size check below, xmm_temp1/xmm_temp2 hold the last two round
    // keys for that key size; L_doLast applies them.
    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Generates the stub that decrypts one 16-byte block.  In this stub the
  // three inputs listed above are read from the stack (from_param,
  // to_param and key_param, addressed off rbp after enter()).
  // The generated code returns 0 in rax.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    // from and to deliberately share rdx: the input block is read into
    // xmm_result before `to` is loaded over `from`.
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      // rdx is only a scratch register here; it is loaded with `from` below.
      __ movl(rdx, 0xffff);
      __ kmovdl(k1, rdx);
    }

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    // `key` addresses the first int element, so the array length field is
    // read at a negative offset from it.
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
    __ movptr(to, to_param);                  // rdx now holds the destination

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    // the key at 0x10 is only xor-ed in; the rounds start with the key at 0x20
    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    // xmm_temp3 keeps the key at 0x00 for the final aesdeclast on every path;
    // xmm_temp1/xmm_temp2 hold the two keys applied at L_doLast.
    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // for decryption the aesdeclast operation is always on key+0x00
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Emits code that saves (saving == true) or restores (saving == false)
  // rbx, rsi and rdi in the three words just below rbp.
  //
  // Saving first lowers rsp by four words; only three of them are written
  // by this helper.  Restoring does not touch rsp: the callers visible in
  // this file follow it with leave().  Must be used after enter(), since
  // the slots are addressed relative to rbp.
  void handleSOERegisters(bool saving) {
    const int saveFrameSizeInBytes = 4 * wordSize;
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);

    if (saving) {
      __ subptr(rsp, saveFrameSizeInBytes);
      __ movptr(saved_rsi, rsi);
      __ movptr(saved_rdi, rdi);
      __ movptr(saved_rbx, rbx);
    } else {
      // restoring
      __ movptr(rsi, saved_rsi);
      __ movptr(rdi, saved_rdi);
      __ movptr(rbx, saved_rbx);
    }
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  // Generates the CBC encryption stub.  In this stub the five inputs listed
  // above are read from the stack (the *_param addresses off rbp).
  // One of three loops is selected by the key array length (44, 52,
  // otherwise the 256-bit loop); each processes one 16-byte block per
  // iteration.  The final chaining value is written back to the r vector.
  //
  // NOTE(review): the loops test the remaining length only at the bottom
  // (subptr / jcc notEqual), so the generated code assumes the length is a
  // non-zero multiple of 16 - confirm that callers guarantee this.
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // first 6 keys preloaded into xmm2-xmm7
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 7;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      // rdx is only a scratch register here; it is loaded with `to` below.
      __ movl(rdx, 0xffff);
      __ kmovdl(k1, rdx);
    }

    // load registers from incoming parameters
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 7 with keys 0-5
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));         // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movl(pos, 0);        // pos shares rax with the key length tested above
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    // xmm_temp is overwritten here, so the shuffle mask loaded into it above
    // is gone; the key loads inside the loop therefore omit the mask register
    // and shuffle with the mask from memory.
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));     // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    // one aesenc per preloaded key register after xmm_key0 (unrolled while
    // the stub is generated)
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // the remaining round keys are reloaded from memory on every iteration
    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xa0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    // common exit, also reached by jmp from the 192- and 256-bit loops
    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));     // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));     // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }


  // CBC AES Decryption.
  // In this 32-bit stub up to 4 blocks are decrypted per iteration of the
  // multi-block loop; any remaining blocks are handled one at a time.
//
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  // Notes on the generator below:
  //   - In this stub the five inputs are read from the stack (the *_param
  //     addresses off rbp after enter()).
  //   - For each key size (10, 12 or 14 rounds) it emits a multi-block loop
  //     that decrypts PARALLEL_FACTOR (4) blocks per iteration, followed by
  //     a single-block loop for whatever is left.
  //   - All C++ for-loops here run while the stub is generated; the emitted
  //     code is fully unrolled.
  //   - The final chaining value is written back to the r vector at L_exit.
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    const int PARALLEL_FACTOR = 4;
    const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256

    Label L_exit;
    Label L_singleBlock_loopTop[3]; //128, 192, 256
    Label L_multiBlock_loopTop[3]; //128, 192, 256

    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
    const XMMRegister xmm_key_shuf_mask = xmm1;

    const XMMRegister xmm_key_tmp0 = xmm2;
    const XMMRegister xmm_key_tmp1 = xmm3;

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm4;
    const XMMRegister xmm_result1 = xmm5;
    const XMMRegister xmm_result2 = xmm6;
    const XMMRegister xmm_result3 = xmm7;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
      // rdx is only a scratch register here; it is loaded with `to` below.
      __ movl(rdx, 0xffff);
      __ kmovdl(k1, rdx);
    }

    // load registers from incoming parameters
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);

    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec

    __ xorptr(pos, pos);

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    // rvec is reused
    // (as a scratch register for the key length; it is reloaded from
    // rvec_param at L_exit)
    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rvec, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rvec, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
    // any other key length falls through into the k == 0 (10 round) code,
    // which the loop below emits first

    // Applies the instruction `opc` with source operand src_reg to each of
    // the four result registers.
#define DoFour(opc, src_reg)           \
    __ opc(xmm_result0, src_reg);      \
    __ opc(xmm_result1, src_reg);      \
    __ opc(xmm_result2, src_reg);      \
    __ opc(xmm_result3, src_reg);      \

    for (int k = 0; k < 3; ++k) {
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);

      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));

      // the java expanded key ordering is rotated one position from what we want
      // so we start from 0x10 here and hit 0x00 last
      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
      DoFour(pxor, xmm_key_tmp0); //xor with first key
      // do the aes dec rounds
      // Two rounds are emitted per pass, so rnum advances by two; the
      // increments are inside the body.  In the last pass the second key
      // offset wraps to 0x00 through the modulo and is applied with
      // aesdeclast.
      for (int rnum = 1; rnum <= ROUNDS[k];) {
        //load two keys at a time
        //k1->0x20, ..., k9->0xa0, k10->0x00
        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
        DoFour(aesdec, xmm_key_tmp1);
        rnum++;
        if (rnum != ROUNDS[k]) {
          DoFour(aesdec, xmm_key_tmp0);
        }
        else {
          DoFour(aesdeclast, xmm_key_tmp0);
        }
        rnum++;
      }

      // for each result, xor with the r vector of previous cipher block
      // (the ciphertext blocks are re-read from the source, because the
      // result registers have been decrypted in place)
      __ pxor(xmm_result0, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result1, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ pxor(xmm_result2, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ pxor(xmm_result3, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks

      // store 4 results into the next 64 bytes of output
      // (only after all four source blocks have been re-read above)
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

      // advance by the four blocks just processed (4 == PARALLEL_FACTOR)
      __ addptr(pos, 4 * AESBlockSize);
      __ subptr(len_reg, 4 * AESBlockSize);
      __ jmp(L_multiBlock_loopTop[k]);

      //singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlock_loopTop[k]);
      __ cmpptr(len_reg, 0); // any blocks left?
      __ jcc(Assembler::equal, L_exit);
      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
      __ movdqa(xmm_result1, xmm_result0);   // keep an undecrypted copy; it becomes the next r vector

      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
      __ pxor(xmm_result0, xmm_key_tmp0);
      // do the aes dec rounds
      for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
        // the java expanded key ordering is rotated one position from what we want
        load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
        __ aesdec(xmm_result0, xmm_key_tmp0);
      }
      // the last round always uses the key at offset 0x00
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
      __ aesdeclast(xmm_result0, xmm_key_tmp0);
      __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block

      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jmp(L_singleBlock_loopTop[k]);
    }//for 128/192/256

    __ BIND(L_exit);
    __ movptr(rvec, rvec_param); // restore this since reused earlier
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
2665 2666 // CTR AES crypt. 2667 // In 32-bit stub, parallelize 4 blocks at a time 2668 // Arguments: 2669 // 2670 // Inputs: 2671 // c_rarg0 - source byte array address 2672 // c_rarg1 - destination byte array address 2673 // c_rarg2 - K (key) in little endian int array 2674 // c_rarg3 - counter vector byte array address 2675 // c_rarg4 - input length 2676 // 2677 // Output: 2678 // rax - input length 2679 // 2680 address generate_counterMode_AESCrypt_Parallel() { 2681 assert(UseAES, "need AES instructions and misaligned SSE support"); 2682 __ align(CodeEntryAlignment); 2683 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 2684 address start = __ pc(); 2685 const Register from = rsi; // source array address 2686 const Register to = rdx; // destination array address 2687 const Register key = rcx; // key array address 2688 const Register counter = rdi; // counter byte array initialized from initvector array address 2689 // and updated with the incremented counter in the end 2690 const Register len_reg = rbx; 2691 const Register pos = rax; 2692 2693 __ enter(); // required for proper stackwalking of RuntimeStub frame 2694 handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi 2695 2696 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 2697 // context for the registers used, where all instructions below are using 128-bit mode 2698 // On EVEX without VL and BW, these instructions will all be AVX. 
2699 if (VM_Version::supports_avx512vlbw()) { 2700 __ movl(rdx, 0xffff); 2701 __ kmovdl(k1, rdx); 2702 } 2703 2704 // load registers from incoming parameters 2705 const Address from_param(rbp, 8+0); 2706 const Address to_param (rbp, 8+4); 2707 const Address key_param (rbp, 8+8); 2708 const Address rvec_param (rbp, 8+12); 2709 const Address len_param (rbp, 8+16); 2710 const Address saved_counter_param(rbp, 8 + 20); 2711 const Address used_addr_param(rbp, 8 + 24); 2712 2713 __ movptr(from , from_param); 2714 __ movptr(to , to_param); 2715 __ movptr(len_reg , len_param); 2716 2717 // Use the partially used encrpyted counter from last invocation 2718 Label L_exit_preLoop, L_preLoop_start; 2719 2720 // Use the registers 'counter' and 'key' here in this preloop 2721 // to hold of last 2 params 'used' and 'saved_encCounter_start' 2722 Register used = counter; 2723 Register saved_encCounter_start = key; 2724 Register used_addr = saved_encCounter_start; 2725 2726 __ movptr(used_addr, used_addr_param); 2727 __ movptr(used, Address(used_addr, 0)); 2728 __ movptr(saved_encCounter_start, saved_counter_param); 2729 2730 __ BIND(L_preLoop_start); 2731 __ cmpptr(used, 16); 2732 __ jcc(Assembler::aboveEqual, L_exit_preLoop); 2733 __ cmpptr(len_reg, 0); 2734 __ jcc(Assembler::lessEqual, L_exit_preLoop); 2735 __ movb(rax, Address(saved_encCounter_start, used)); 2736 __ xorb(rax, Address(from, 0)); 2737 __ movb(Address(to, 0), rax); 2738 __ addptr(from, 1); 2739 __ addptr(to, 1); 2740 __ addptr(used, 1); 2741 __ subptr(len_reg, 1); 2742 2743 __ jmp(L_preLoop_start); 2744 2745 __ BIND(L_exit_preLoop); 2746 __ movptr(used_addr, used_addr_param); 2747 __ movptr(used_addr, used_addr_param); 2748 __ movl(Address(used_addr, 0), used); 2749 2750 // load the parameters 'key' and 'counter' 2751 __ movptr(key, key_param); 2752 __ movptr(counter, rvec_param); 2753 2754 // xmm register assignments for the loops below 2755 const XMMRegister xmm_curr_counter = xmm0; 2756 const XMMRegister 
xmm_counter_shuf_mask = xmm1; // need to be reloaded 2757 const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded 2758 const XMMRegister xmm_key = xmm3; 2759 const XMMRegister xmm_result0 = xmm4; 2760 const XMMRegister xmm_result1 = xmm5; 2761 const XMMRegister xmm_result2 = xmm6; 2762 const XMMRegister xmm_result3 = xmm7; 2763 const XMMRegister xmm_from0 = xmm1; //reuse XMM register 2764 const XMMRegister xmm_from1 = xmm2; 2765 const XMMRegister xmm_from2 = xmm3; 2766 const XMMRegister xmm_from3 = xmm4; 2767 2768 //for key_128, key_192, key_256 2769 const int rounds[3] = {10, 12, 14}; 2770 Label L_singleBlockLoopTop[3]; 2771 Label L_multiBlock_loopTop[3]; 2772 Label L_key192_top, L_key256_top; 2773 Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time 2774 Label L_incCounter_single[3]; //for single block, key128, key192, key256 2775 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; 2776 Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; 2777 2778 Label L_exit; 2779 const int PARALLEL_FACTOR = 4; //because of the limited register number 2780 2781 // initialize counter with initial counter 2782 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); 2783 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); 2784 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase 2785 2786 // key length could be only {11, 13, 15} * 4 = {44, 52, 60} 2787 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2788 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2789 __ cmpl(rax, 52); 2790 __ jcc(Assembler::equal, L_key192_top); 2791 __ cmpl(rax, 60); 2792 __ jcc(Assembler::equal, L_key256_top); 2793 2794 //key128 
begins here 2795 __ movptr(pos, 0); // init pos before L_multiBlock_loopTop 2796 2797 #define CTR_DoFour(opc, src_reg) \ 2798 __ opc(xmm_result0, src_reg); \ 2799 __ opc(xmm_result1, src_reg); \ 2800 __ opc(xmm_result2, src_reg); \ 2801 __ opc(xmm_result3, src_reg); 2802 2803 // k == 0 : generate code for key_128 2804 // k == 1 : generate code for key_192 2805 // k == 2 : generate code for key_256 2806 for (int k = 0; k < 3; ++k) { 2807 //multi blocks starts here 2808 __ align(OptoLoopAlignment); 2809 __ BIND(L_multiBlock_loopTop[k]); 2810 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left 2811 __ jcc(Assembler::less, L_singleBlockLoopTop[k]); 2812 2813 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2814 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); 2815 2816 //load, then increase counters 2817 CTR_DoFour(movdqa, xmm_curr_counter); 2818 __ push(rbx); 2819 inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]); 2820 inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]); 2821 inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]); 2822 inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]); 2823 __ pop (rbx); 2824 2825 load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. 
interleaving for better performance 2826 2827 CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR 2828 CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key 2829 2830 for (int i = 1; i < rounds[k]; ++i) { 2831 load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); 2832 CTR_DoFour(aesenc, xmm_key); 2833 } 2834 load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); 2835 CTR_DoFour(aesenclast, xmm_key); 2836 2837 // get next PARALLEL_FACTOR blocks into xmm_from registers 2838 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 2839 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 2840 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 2841 2842 // PXOR with input text 2843 __ pxor(xmm_result0, xmm_from0); //result0 is xmm4 2844 __ pxor(xmm_result1, xmm_from1); 2845 __ pxor(xmm_result2, xmm_from2); 2846 2847 // store PARALLEL_FACTOR results into the next 64 bytes of output 2848 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 2849 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 2850 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 2851 2852 // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0. 
2853 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 2854 __ pxor(xmm_result3, xmm_from3); 2855 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 2856 2857 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text 2858 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length 2859 __ jmp(L_multiBlock_loopTop[k]); 2860 2861 // singleBlock starts here 2862 __ align(OptoLoopAlignment); 2863 __ BIND(L_singleBlockLoopTop[k]); 2864 __ cmpptr(len_reg, 0); 2865 __ jcc(Assembler::equal, L_exit); 2866 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2867 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); 2868 __ movdqa(xmm_result0, xmm_curr_counter); 2869 load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); 2870 __ push(rbx);//rbx is used for increasing counter 2871 inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]); 2872 __ pop (rbx); 2873 __ pshufb(xmm_result0, xmm_counter_shuf_mask); 2874 __ pxor(xmm_result0, xmm_key); 2875 for (int i = 1; i < rounds[k]; i++) { 2876 load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); 2877 __ aesenc(xmm_result0, xmm_key); 2878 } 2879 load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); 2880 __ aesenclast(xmm_result0, xmm_key); 2881 __ cmpptr(len_reg, AESBlockSize); 2882 __ jcc(Assembler::less, L_processTail_insr[k]); 2883 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 2884 __ pxor(xmm_result0, xmm_from0); 2885 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 2886 __ addptr(pos, AESBlockSize); 2887 __ subptr(len_reg, AESBlockSize); 2888 __ jmp(L_singleBlockLoopTop[k]); 2889 2890 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array 2891 __ addptr(pos, len_reg); // 1. 
Insert bytes from src array into xmm_from0 register 2892 __ testptr(len_reg, 8); 2893 __ jcc(Assembler::zero, L_processTail_4_insr[k]); 2894 __ subptr(pos,8); 2895 __ pinsrd(xmm_from0, Address(from, pos), 0); 2896 __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1); 2897 __ BIND(L_processTail_4_insr[k]); 2898 __ testptr(len_reg, 4); 2899 __ jcc(Assembler::zero, L_processTail_2_insr[k]); 2900 __ subptr(pos,4); 2901 __ pslldq(xmm_from0, 4); 2902 __ pinsrd(xmm_from0, Address(from, pos), 0); 2903 __ BIND(L_processTail_2_insr[k]); 2904 __ testptr(len_reg, 2); 2905 __ jcc(Assembler::zero, L_processTail_1_insr[k]); 2906 __ subptr(pos, 2); 2907 __ pslldq(xmm_from0, 2); 2908 __ pinsrw(xmm_from0, Address(from, pos), 0); 2909 __ BIND(L_processTail_1_insr[k]); 2910 __ testptr(len_reg, 1); 2911 __ jcc(Assembler::zero, L_processTail_exit_insr[k]); 2912 __ subptr(pos, 1); 2913 __ pslldq(xmm_from0, 1); 2914 __ pinsrb(xmm_from0, Address(from, pos), 0); 2915 __ BIND(L_processTail_exit_insr[k]); 2916 2917 __ movptr(saved_encCounter_start, saved_counter_param); 2918 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes. 2919 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation. 2920 2921 __ testptr(len_reg, 8); 2922 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. 
array 2923 __ pextrd(Address(to, pos), xmm_result0, 0); 2924 __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1); 2925 __ psrldq(xmm_result0, 8); 2926 __ addptr(pos, 8); 2927 __ BIND(L_processTail_4_extr[k]); 2928 __ testptr(len_reg, 4); 2929 __ jcc(Assembler::zero, L_processTail_2_extr[k]); 2930 __ pextrd(Address(to, pos), xmm_result0, 0); 2931 __ psrldq(xmm_result0, 4); 2932 __ addptr(pos, 4); 2933 __ BIND(L_processTail_2_extr[k]); 2934 __ testptr(len_reg, 2); 2935 __ jcc(Assembler::zero, L_processTail_1_extr[k]); 2936 __ pextrb(Address(to, pos), xmm_result0, 0); 2937 __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1); 2938 __ psrldq(xmm_result0, 2); 2939 __ addptr(pos, 2); 2940 __ BIND(L_processTail_1_extr[k]); 2941 __ testptr(len_reg, 1); 2942 __ jcc(Assembler::zero, L_processTail_exit_extr[k]); 2943 __ pextrb(Address(to, pos), xmm_result0, 0); 2944 2945 __ BIND(L_processTail_exit_extr[k]); 2946 __ movptr(used_addr, used_addr_param); 2947 __ movl(Address(used_addr, 0), len_reg); 2948 __ jmp(L_exit); 2949 } 2950 2951 __ BIND(L_exit); 2952 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); 2953 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 
2954 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back 2955 handleSOERegisters(false /*restoring*/); 2956 __ movptr(rax, len_param); // return length 2957 __ leave(); // required for proper stackwalking of RuntimeStub frame 2958 __ ret(0); 2959 2960 __ BIND (L_key192_top); 2961 __ movptr(pos, 0); // init pos before L_multiBlock_loopTop 2962 __ jmp(L_multiBlock_loopTop[1]); //key192 2963 2964 __ BIND (L_key256_top); 2965 __ movptr(pos, 0); // init pos before L_multiBlock_loopTop 2966 __ jmp(L_multiBlock_loopTop[2]); //key192 2967 2968 return start; 2969 } 2970 2971 address generate_upper_word_mask() { 2972 __ align(64); 2973 StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); 2974 address start = __ pc(); 2975 __ emit_data(0x00000000, relocInfo::none, 0); 2976 __ emit_data(0x00000000, relocInfo::none, 0); 2977 __ emit_data(0x00000000, relocInfo::none, 0); 2978 __ emit_data(0xFFFFFFFF, relocInfo::none, 0); 2979 return start; 2980 } 2981 2982 address generate_shuffle_byte_flip_mask() { 2983 __ align(64); 2984 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); 2985 address start = __ pc(); 2986 __ emit_data(0x0c0d0e0f, relocInfo::none, 0); 2987 __ emit_data(0x08090a0b, relocInfo::none, 0); 2988 __ emit_data(0x04050607, relocInfo::none, 0); 2989 __ emit_data(0x00010203, relocInfo::none, 0); 2990 return start; 2991 } 2992 2993 // ofs and limit are use for multi-block byte array. 
2994 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 2995 address generate_sha1_implCompress(bool multi_block, const char *name) { 2996 __ align(CodeEntryAlignment); 2997 StubCodeMark mark(this, "StubRoutines", name); 2998 address start = __ pc(); 2999 3000 Register buf = rax; 3001 Register state = rdx; 3002 Register ofs = rcx; 3003 Register limit = rdi; 3004 3005 const Address buf_param(rbp, 8 + 0); 3006 const Address state_param(rbp, 8 + 4); 3007 const Address ofs_param(rbp, 8 + 8); 3008 const Address limit_param(rbp, 8 + 12); 3009 3010 const XMMRegister abcd = xmm0; 3011 const XMMRegister e0 = xmm1; 3012 const XMMRegister e1 = xmm2; 3013 const XMMRegister msg0 = xmm3; 3014 3015 const XMMRegister msg1 = xmm4; 3016 const XMMRegister msg2 = xmm5; 3017 const XMMRegister msg3 = xmm6; 3018 const XMMRegister shuf_mask = xmm7; 3019 3020 __ enter(); 3021 __ subptr(rsp, 8 * wordSize); 3022 if (multi_block) { 3023 __ push(limit); 3024 } 3025 __ movptr(buf, buf_param); 3026 __ movptr(state, state_param); 3027 if (multi_block) { 3028 __ movptr(ofs, ofs_param); 3029 __ movptr(limit, limit_param); 3030 } 3031 3032 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, 3033 buf, state, ofs, limit, rsp, multi_block); 3034 3035 if (multi_block) { 3036 __ pop(limit); 3037 } 3038 __ addptr(rsp, 8 * wordSize); 3039 __ leave(); 3040 __ ret(0); 3041 return start; 3042 } 3043 3044 address generate_pshuffle_byte_flip_mask() { 3045 __ align(64); 3046 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); 3047 address start = __ pc(); 3048 __ emit_data(0x00010203, relocInfo::none, 0); 3049 __ emit_data(0x04050607, relocInfo::none, 0); 3050 __ emit_data(0x08090a0b, relocInfo::none, 0); 3051 __ emit_data(0x0c0d0e0f, relocInfo::none, 0); 3052 return start; 3053 } 3054 3055 // ofs and limit are use for multi-block byte array. 
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
//
// Emits the SHA-256 compress stub named 'name' and returns its entry address.
// buf and state are always loaded from the stack arguments; ofs and limit are
// loaded only when multi_block is set. The work itself is emitted by fast_sha256,
// which is handed rsp as the base of an 8-word scratch area.
address generate_sha256_implCompress(bool multi_block, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  Register buf   = rbx;
  Register state = rsi;
  Register ofs   = rdx;
  Register limit = rcx;

  // incoming arguments, addressed relative to rbp once enter() has run
  const Address  buf_param(rbp, 8 + 0);
  const Address  state_param(rbp, 8 + 4);
  const Address  ofs_param(rbp, 8 + 8);
  const Address  limit_param(rbp, 8 + 12);

  const XMMRegister msg = xmm0;
  const XMMRegister state0 = xmm1;
  const XMMRegister state1 = xmm2;
  const XMMRegister msgtmp0 = xmm3;

  const XMMRegister msgtmp1 = xmm4;
  const XMMRegister msgtmp2 = xmm5;
  const XMMRegister msgtmp3 = xmm6;
  const XMMRegister msgtmp4 = xmm7;

  __ enter();
  __ subptr(rsp, 8 * wordSize);   // reserve the scratch area passed to fast_sha256 via rsp
  handleSOERegisters(true /*saving*/);
  __ movptr(buf, buf_param);
  __ movptr(state, state_param);
  if (multi_block) {
    __ movptr(ofs, ofs_param);
    __ movptr(limit, limit_param);
  }

  __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
    buf, state, ofs, limit, rsp, multi_block);

  handleSOERegisters(false);      // restoring
  __ addptr(rsp, 8 * wordSize);
  __ leave();
  __ ret(0);
  return start;
}

// byte swap x86 long
// Emits the 16-byte shuffle constant used by generate_ghash_processBlocks for the
// state and subkeyH operands. Returns the address of the constant.
address generate_ghash_long_swap_mask() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
  address start = __ pc();
  __ emit_data(0x0b0a0908, relocInfo::none, 0);
  __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
  __ emit_data(0x03020100, relocInfo::none, 0);
  __ emit_data(0x07060504, relocInfo::none, 0);

  return start;
}

// byte swap x86 byte array
// Emits the 16-byte shuffle constant used by generate_ghash_processBlocks for each
// 16-byte data block. Returns the address of the constant.
address generate_ghash_byte_swap_mask() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
  address start = __ pc();
  __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
  __ emit_data(0x08090a0b, relocInfo::none, 0);
  __ emit_data(0x04050607, relocInfo::none, 0);
  __ emit_data(0x00010203, relocInfo::none, 0);
  return start;
}

/* Single and multi-block ghash operations.
 *
 * Stack arguments (relative to rbp after enter()):
 *   rbp +  8 - state   (16 bytes, read at entry and overwritten with the result)
 *   rbp + 12 - subkeyH (16 bytes, read only)
 *   rbp + 16 - data    (consumed 16 bytes per iteration)
 *   rbp + 20 - blocks  (iteration count; the loop body runs before the count is
 *                       tested, so at least one block is always processed)
 */
address generate_ghash_processBlocks() {
  assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
  __ align(CodeEntryAlignment);
  Label L_ghash_loop, L_exit;
  StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
  address start = __ pc();

  const Register state        = rdi;
  const Register subkeyH      = rsi;
  const Register data         = rdx;
  const Register blocks       = rcx;

  const Address  state_param(rbp, 8+0);
  const Address  subkeyH_param(rbp, 8+4);
  const Address  data_param(rbp, 8+8);
  const Address  blocks_param(rbp, 8+12);

  const XMMRegister xmm_temp0 = xmm0;
  const XMMRegister xmm_temp1 = xmm1;
  const XMMRegister xmm_temp2 = xmm2;
  const XMMRegister xmm_temp3 = xmm3;
  const XMMRegister xmm_temp4 = xmm4;
  const XMMRegister xmm_temp5 = xmm5;
  const XMMRegister xmm_temp6 = xmm6;
  const XMMRegister xmm_temp7 = xmm7;

  __ enter();
  handleSOERegisters(true);  // Save registers

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  if (VM_Version::supports_avx512vlbw()) {
    __ movl(rdx, 0xffff);    // rdx ('data') is only a scratch here; it is loaded from data_param below
    __ kmovdl(k1, rdx);
  }

  __ movptr(state, state_param);
  __ movptr(subkeyH, subkeyH_param);
  __ movptr(data, data_param);
  __ movptr(blocks, blocks_param);

  // xmm0 = current state, shuffled with the long-swap mask
  __ movdqu(xmm_temp0, Address(state, 0));
  __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

  // xmm1 = hash subkey, shuffled with the long-swap mask (loop invariant)
  __ movdqu(xmm_temp1, Address(subkeyH, 0));
  __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

  __ BIND(L_ghash_loop);
  // xmm2 = next 16-byte data block, shuffled with the byte-swap mask
  __ movdqu(xmm_temp2, Address(data, 0));
  __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

  __ pxor(xmm_temp0, xmm_temp2);       // fold the data block into the state

  //
  // Multiply with the hash key
  //
  __ movdqu(xmm_temp3, xmm_temp0);
  __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
  __ movdqu(xmm_temp4, xmm_temp0);
  __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1

  __ movdqu(xmm_temp5, xmm_temp0);
  __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
  __ movdqu(xmm_temp6, xmm_temp0);
  __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

  __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0

  __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
  __ psrldq(xmm_temp4, 8);    // shift xmm4 by 64 bits to the right
  __ pslldq(xmm_temp5, 8);    // shift xmm5 by 64 bits to the left
  __ pxor(xmm_temp3, xmm_temp5);
  __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
                                      // of the carry-less multiplication of
                                      // xmm0 by xmm1.

  // We shift the result of the multiplication by one bit position
  // to the left to cope for the fact that the bits are reversed.
  // pslld/psrld work on each 32-bit lane separately, so the bit shifted out of
  // each lane is recovered with psrld 31 and moved into the next lane with the
  // byte shifts below (xmm5 carries the top bit of xmm3 into xmm6).
  __ movdqu(xmm_temp7, xmm_temp3);
  __ movdqu(xmm_temp4, xmm_temp6);
  __ pslld (xmm_temp3, 1);
  __ pslld(xmm_temp6, 1);
  __ psrld(xmm_temp7, 31);
  __ psrld(xmm_temp4, 31);
  __ movdqu(xmm_temp5, xmm_temp7);
  __ pslldq(xmm_temp4, 4);
  __ pslldq(xmm_temp7, 4);
  __ psrldq(xmm_temp5, 12);
  __ por(xmm_temp3, xmm_temp7);
  __ por(xmm_temp6, xmm_temp4);
  __ por(xmm_temp6, xmm_temp5);

  //
  // First phase of the reduction
  //
  // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
  // independently.
  __ movdqu(xmm_temp7, xmm_temp3);
  __ movdqu(xmm_temp4, xmm_temp3);
  __ movdqu(xmm_temp5, xmm_temp3);
  __ pslld(xmm_temp7, 31);    // packed left shift of each dword: << 31
  __ pslld(xmm_temp4, 30);    // packed left shift of each dword: << 30
  __ pslld(xmm_temp5, 25);    // packed left shift of each dword: << 25
  __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
  __ pxor(xmm_temp7, xmm_temp5);
  __ movdqu(xmm_temp4, xmm_temp7);
  __ pslldq(xmm_temp7, 12);
  __ psrldq(xmm_temp4, 4);            // xmm4 is kept for the second phase
  __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete

  //
  // Second phase of the reduction
  //
  // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
  // shift operations.
  __ movdqu(xmm_temp2, xmm_temp3);
  __ movdqu(xmm_temp7, xmm_temp3);
  __ movdqu(xmm_temp5, xmm_temp3);
  __ psrld(xmm_temp2, 1);     // packed right shift of each dword: >> 1
  __ psrld(xmm_temp7, 2);     // packed right shift of each dword: >> 2
  __ psrld(xmm_temp5, 7);     // packed right shift of each dword: >> 7
  __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
  __ pxor(xmm_temp2, xmm_temp5);
  __ pxor(xmm_temp2, xmm_temp4);
  __ pxor(xmm_temp3, xmm_temp2);
  __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6

  __ decrement(blocks);
  __ jcc(Assembler::zero, L_exit);
  __ movdqu(xmm_temp0, xmm_temp6);    // result becomes the state for the next block
  __ addptr(data, 16);
  __ jmp(L_ghash_loop);

  __ BIND(L_exit);
  // Byte swap 16-byte result
  __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  __ movdqu(Address(state, 0), xmm_temp6);   // store the result

  handleSOERegisters(false);  // restore registers
  __ leave();
  __ ret(0);
  return start;
}

/**
 * Arguments:
 *
 * Inputs:
 *   rsp(4)   - int crc
 *   rsp(8)   - byte* buf
 *   rsp(12)  - int length
 *
 * Output:
 *       rax   - int crc result
 */
address generate_updateBytesCRC32() {
  assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

  address start = __ pc();

  const Register crc   = rdx;  // crc
  const Register buf   = rsi;  // source java byte array address
  const Register len   = rcx;  // length
  const Register table = rdi;  // crc_table address (reuse register)
  const Register tmp   = rbx;
  assert_different_registers(crc, buf, len, table, tmp, rax);

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  // rsi, rdi and rbx are used as buf/table/tmp below, so preserve the caller's values
  __ push(rsi);
  __ push(rdi);
  __ push(rbx);

  // after enter(), the arguments are addressed relative to rbp
  Address crc_arg(rbp, 8 + 0);
  Address buf_arg(rbp, 8 + 4);
  Address len_arg(rbp, 8 + 8);

  // Load up:
  __ movl(crc,   crc_arg);
  __ movptr(buf, buf_arg);
  __ movl(len,   len_arg);

  __ kernel_crc32(crc, buf, len, table, tmp);

  __ movl(rax, crc);   // return value
  __ pop(rbx);
  __ pop(rdi);
  __ pop(rsi);
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

/**
 * Arguments:
 *
 * Inputs:
 *   rsp(4)   - int crc
 *   rsp(8)   - byte* buf
 *   rsp(12)  - int length
 *   rsp(16)  - table_start - optional (present only when doing a library_call,
 *              not used by x86 algorithm)
 *
 * Output:
 *       rax  - int crc result
 */
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
  assert(UseCRC32CIntrinsics, "need SSE4_2");
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
  address start = __ pc();
  const Register crc = rax;  // crc
  const Register buf = rcx;  // source java byte array address
  const Register len = rdx;  // length
  const Register d = rbx;
  const Register g = rsi;
  const Register h = rdi;
  const Register empty = 0; // will never be used, in order not
                            // to change a signature for crc32c_IPL_Alg2_Alt2
                            // between 64/32 I'm just keeping it here
  assert_different_registers(crc, buf, len, d, g, h);

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
                                   // we need to add additional 4 because __ enter
                                   // have just pushed ebp on a stack
  Address buf_arg(rsp, 4 + 4 + 4);
  Address len_arg(rsp, 4 + 4 + 8);
    // Load up:
    // (the rsp-relative offsets above are only valid here, before the pushes below move rsp)
    __ movl(crc, crc_arg);
    __ movl(buf, buf_arg);
    __ movl(len, len_arg);
    __ push(d);
    __ push(g);
    __ push(h);
    __ crc32c_ipl_alg2_alt2(crc, buf, len,
                            d, g, h,
                            empty, empty, empty,
                            xmm0, xmm1, xmm2,
                            is_pclmulqdq_supported);
    __ pop(h);
    __ pop(g);
    __ pop(d);
  __ vzeroupper();
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}

// Emits the "libmExp" stub: the macro assembler's fast_exp code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmExp() {
  StubCodeMark mark(this, "StubRoutines", "libmExp");

  address start = __ pc();

  const XMMRegister x0  = xmm0;
  const XMMRegister x1  = xmm1;
  const XMMRegister x2  = xmm2;
  const XMMRegister x3  = xmm3;

  const XMMRegister x4  = xmm4;
  const XMMRegister x5  = xmm5;
  const XMMRegister x6  = xmm6;
  const XMMRegister x7  = xmm7;

  const Register tmp   = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libmLog" stub: the macro assembler's fast_log code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmLog() {
  StubCodeMark mark(this, "StubRoutines", "libmLog");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  const Register tmp = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libmLog10" stub: the macro assembler's fast_log10 code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmLog10() {
  StubCodeMark mark(this, "StubRoutines", "libmLog10");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  const Register tmp = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libmPow" stub: the macro assembler's fast_pow code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmPow() {
  StubCodeMark mark(this, "StubRoutines", "libmPow");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  const Register tmp = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libm_reduce_pi04l" stub. Unlike the libm stubs above, no
// enter()/leave()/ret is emitted here; the whole body comes from
// libm_reduce_pi04l (NOTE(review): presumably it emits its own frame handling
// and return - confirm in the macro assembler).
address generate_libm_reduce_pi04l() {
  StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");

  address start = __ pc();

  BLOCK_COMMENT("Entry:");
  __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

  return start;

}

// Emits the "libm_sin_cos_huge" stub. No enter()/leave()/ret is emitted here;
// the whole body comes from libm_sincos_huge (NOTE(review): presumably it emits
// its own frame handling and return - confirm in the macro assembler).
address generate_libm_sin_cos_huge() {
  StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;

  BLOCK_COMMENT("Entry:");
  __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

  return start;

}

// Emits the "libmSin" stub: the macro assembler's fast_sin code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
// Note that fast_sin takes only three general purpose registers (rax, rbx, rdx),
// unlike the other fast_* calls in this file.
address generate_libmSin() {
  StubCodeMark mark(this, "StubRoutines", "libmSin");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libmCos" stub: the macro assembler's fast_cos code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmCos() {
  StubCodeMark mark(this, "StubRoutines", "libmCos");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  const Register tmp = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Emits the "libm_tan_cot_huge" stub. No enter()/leave()/ret is emitted here;
// the whole body comes from libm_tancot_huge (NOTE(review): presumably it emits
// its own frame handling and return - confirm in the macro assembler).
address generate_libm_tan_cot_huge() {
  StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;

  BLOCK_COMMENT("Entry:");
  __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

  return start;

}

// Emits the "libmTan" stub: the macro assembler's fast_tan code wrapped in an
// enter()/leave() frame. Returns the stub's entry address.
address generate_libmTan() {
  StubCodeMark mark(this, "StubRoutines", "libmTan");

  address start = __ pc();

  const XMMRegister x0 = xmm0;
  const XMMRegister x1 = xmm1;
  const XMMRegister x2 = xmm2;
  const XMMRegister x3 = xmm3;

  const XMMRegister x4 = xmm4;
  const XMMRegister x5 = xmm5;
  const XMMRegister x6 = xmm6;
  const XMMRegister x7 = xmm7;

  const Register tmp = rbx;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;

}

// Safefetch stubs.
// Emits a stub named 'name' and reports three code addresses through the out
// parameters: *entry (start of the stub), *fault_pc (the load that may fault) and
// *continuation_pc (the return instruction that follows the load).
// Only size == 4 is implemented; size == 8 hits Unimplemented() and any other
// size hits ShouldNotReachHere().
void generate_safefetch(const char* name, int size, address* entry,
                        address* fault_pc, address* continuation_pc) {
  // safefetch signatures:
  //   int      SafeFetch32(int*      adr, int      errValue);
  //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);

  StubCodeMark mark(this, "StubRoutines", name);

  // Entry point, pc or function descriptor.
  *entry = __ pc();

  // No frame is set up, so the arguments are addressed relative to rsp:
  // rsp+4 is the first argument (adr), rsp+8 the second (errValue).
  __ movl(rax, Address(rsp, 0x8));   // rax = errValue; stays in rax if the load below is skipped
  __ movl(rcx, Address(rsp, 0x4));   // rcx = adr
  // Load *adr into eax, may fault.
  *fault_pc = __ pc();
  switch (size) {
    case 4:
      // int32_t
      __ movl(rax, Address(rcx, 0));
      break;
    case 8:
      // int64_t
      Unimplemented();
      break;
    default:
      ShouldNotReachHere();
  }

  // Return errValue or *adr.
  *continuation_pc = __ pc();
  __ ret(0);
}

public:
// Information about frame layout at time of blocking runtime call.
// Note that we only have to preserve callee-saved registers since
// the compilers are responsible for supplying a continuation point
// if they expect all registers to be preserved.
// Slot indices (in machine words) of the frame built by
// generate_throw_exception, which scales them by wordSize to address
// slots relative to rsp. `framesize` is the total slot count and is what
// gets passed to OopMap and RuntimeStub::new_runtime_stub.
enum layout {
  thread_off,    // last_java_sp
  arg1_off,
  arg2_off,
  rbp_off,       // callee saved register
  ret_pc,
  framesize
};

private:

// From here on `__` expands to the local variable `masm` instead of the
// `_masm` member used by the generators above.
#undef __
#define __ masm->

//------------------------------------------------------------------------------------------------------------------------
// Continuation point for throwing of implicit exceptions that are not handled in
// the current activation. Fabricates an exception oop and initiates normal
// exception dispatching in this frame.
//
// Previously the compiler (c2) allowed for callee save registers on Java calls.
// This is no longer true after adapter frames were removed but could possibly
// be brought back in the future if the interpreter code was reworked and it
// was deemed worthwhile. The comment below was left to describe what must
// happen here if callee saves were resurrected. As it stands now this stub
// could actually be a vanilla BufferBlob and have no oopMap at all.
// Since it doesn't make much difference we've chosen to leave it the
// way it was in the callee save days and keep the comment.

// If we need to preserve callee-saved values we need a callee-saved oop map and
// therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
// If the compiler needs all registers to be preserved between the fault
// point and the exception handler then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other implicit
// exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
// either at call sites or otherwise assume that stack unwinding will be initiated,
// so caller saved registers were assumed volatile in the compiler.
// Builds a stand-alone RuntimeStub (in its own CodeBuffer, not the generator's
// shared one) that calls `runtime_entry` and then jumps to the
// forward-exception stub.
//
// Parameters:
//   name          - label given to the CodeBuffer and to the RuntimeStub
//   runtime_entry - address the stub calls once the last Java frame is set
//   arg1, arg2    - optional registers whose values are stored into the
//                   arg1_off / arg2_off frame slots before the call; noreg
//                   means "not supplied", and arg2 may only be given
//                   together with arg1 (asserted below)
// Returns the entry point of the newly created RuntimeStub.
address generate_throw_exception(const char* name, address runtime_entry,
                                 Register arg1 = noreg, Register arg2 = noreg) {

  int insts_size = 256;
  int locs_size  = 32;

  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps  = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM
  Register java_thread = rbx;
  __ get_thread(java_thread);

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // pc and rbp, already pushed
  // (hence only framesize-2 further slots are reserved here)
  __ subptr(rsp, (framesize-2) * wordSize); // prolog

  // Frame is now completed as far as size and linkage.

  int frame_complete = __ pc() - start;

  // push java thread (becomes first argument of C function)
  __ movptr(Address(rsp, thread_off * wordSize), java_thread);
  if (arg1 != noreg) {
    __ movptr(Address(rsp, arg1_off * wordSize), arg1);
  }
  if (arg2 != noreg) {
    assert(arg1 != noreg, "missing reg arg");
    __ movptr(Address(rsp, arg2_off * wordSize), arg2);
  }

  // Set up last_Java_sp and last_Java_fp
  __ set_last_Java_frame(java_thread, rsp, rbp, NULL);

  // Call runtime
  BLOCK_COMMENT("call runtime_entry");
  __ call(RuntimeAddress(runtime_entry));
  // Generate oop map
  // (an empty map of framesize slots, registered at the return pc of the call)
  OopMap* map = new OopMap(framesize, 0);
  oop_maps->add_gc_map(__ pc() - start, map);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  __ get_thread(java_thread);

  __ reset_last_Java_frame(java_thread, true);

  __ leave(); // required for proper stackwalking of RuntimeStub frame

  // check for pending exceptions
  // (debug builds only: trap if the runtime call returned without leaving
  // a pending exception on the thread)
#ifdef ASSERT
  Label L;
  __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  __ jcc(Assembler::notEqual, L);
  __ should_not_reach_here();
  __ bind(L);
#endif /* ASSERT */
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
  return stub->entry_point();
}


// Fills in the x87 control-word constants, the standard MXCSR value and the
// two 80-bit subnormal bias constants held in StubRoutines. Emits no code.
void create_control_words() {
  // Round to nearest, 53-bit mode, exceptions masked
  StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
  // Round to zero, 53-bit mode, exceptions masked
  // NOTE(review): 0x0D7F has precision-control bits 9:8 = 01b, which the
  // Intel SDM lists as reserved; 53-bit precision is 10b (that would be
  // 0x0E7F). Confirm whether the value or the "53-bit" wording is intended.
  StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
  // Round to nearest, 24-bit mode, exceptions masked
  StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
  // Round to nearest, 64-bit mode, exceptions masked
  StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
  // MXCSR (SSE control/status): round to nearest, all exceptions masked
  StubRoutines::_mxcsr_std           = 0x1F80;
  // Note: the following two constants are 80-bit values
  //       layout is critical for correct loading by FPU.
  // Bias for strict fp multiply/divide
  StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
  StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
  StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
  // Un-Bias for strict fp multiply/divide
  StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
  StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
  StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
}

//---------------------------------------------------------------------------
// Initialization

// Generation pass selected by the constructor when `all` is false.
// Each stub's entry address is stored into StubRoutines as it is generated.
void generate_initial() {
  // Generates all stubs and initializes the entry points

  //------------------------------------------------------------------------------------------------------------------------
  // entry points that exist in all platforms
  // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
  //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);
  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // These are currently used by Solaris/Intel
  StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();

  // platform dependent
  create_control_words();

  StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
  StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
  // Both conversions share one generator; only the result type and the
  // runtime fallback address differ.
  StubRoutines::_d2i_wrapper = generate_d2i_wrapper(T_INT,
                                                    CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
  StubRoutines::_d2l_wrapper = generate_d2i_wrapper(T_LONG,
                                                    CAST_FROM_FN_PTR(address, SharedRuntime::d2l));

  // Build this early so it's available for the interpreter
  StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception",
                                                                           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  StubRoutines::_throw_delayed_StackOverflowError_entry = generate_throw_exception("delayed StackOverflowError throw_exception",
                                                                                   CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));

  if (UseCRC32Intrinsics) {
    // set table address before stub generation which use it
    StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }

  if (UseCRC32CIntrinsics) {
    // The same CLMUL capability flag drives both the table contents and the
    // generated stub.
    bool supports_clmul = VM_Version::supports_clmul();
    StubRoutines::x86::generate_CRC32C_table(supports_clmul);
    StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
  }
  // libm stubs: generated only with SSE2 and both flags set, and then only
  // for the intrinsics reported as available.
  if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
    // Constant-table addresses are published when any of sin/cos/tan is
    // available.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
      StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
      StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
      StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
      StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
      StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
      StubRoutines::_dexp = generate_libmExp();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
      StubRoutines::_dlog = generate_libmLog();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
      StubRoutines::_dlog10 = generate_libmLog10();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
      StubRoutines::_dpow = generate_libmPow();
    }
    // The helper stubs below are generated ahead of the sin/cos/tan stubs.
    // NOTE(review): presumably because those stubs reference the helpers'
    // addresses -- confirm before reordering.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
      StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_libmSin();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_libmCos();
    }
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
      StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
      StubRoutines::_dtan = generate_libmTan();
    }
  }
}

// Generation pass selected by the constructor when `all` is true.
// Each stub's entry address is stored into StubRoutines as it is generated.
void generate_all() {
  // Generates all stubs and initializes the entry points

  // These entry points require SharedInfo::stack0 to be set up in non-core builds
  // and need to be relocatable, so they each fabricate a RuntimeStub internally.
  StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
  StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
  StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

  //------------------------------------------------------------------------------------------------------------------------
  // entry points that are platform specific

  // support for verify_oop (must happen after universe_init)
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  // don't bother generating these AES intrinsic stubs unless global flag is set
  if (UseAESIntrinsics) {
    StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // might be needed by the others

    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
  }

  if (UseAESCTRIntrinsics) {
    StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
    StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
    // The boolean selects between the two variants named by the string
    // (implCompress vs. implCompressMB).
    StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
  }
  if (UseSHA256Intrinsics) {
    StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
    StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
    StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
  }

  // Generate GHASH intrinsics code
  if (UseGHASHIntrinsics) {
    StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
    StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  // Safefetch stubs.
  // Only the 32-bit stub is generated; the SafeFetchN entry, fault pc and
  // continuation pc are aliased to the SafeFetch32 ones.
  generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                 &StubRoutines::_safefetch32_fault_pc,
                                                 &StubRoutines::_safefetch32_continuation_pc);
  StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry;
  StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc;
  StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
}


public:
// Runs one generation pass into `code`: generate_all() when `all` is true,
// generate_initial() otherwise.
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  if (all) {
    generate_all();
  } else {
    generate_initial();
  }
}
}; // end class declaration


// Free-function entry point: constructs a StubGenerator on the stack, whose
// constructor performs the requested generation pass.
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}