1 /* 2 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "interpreter/interpreter.hpp" 29 #include "nativeInst_x86.hpp" 30 #include "oops/instanceOop.hpp" 31 #include "oops/method.hpp" 32 #include "oops/objArrayKlass.hpp" 33 #include "oops/oop.inline.hpp" 34 #include "prims/methodHandles.hpp" 35 #include "runtime/frame.inline.hpp" 36 #include "runtime/handles.inline.hpp" 37 #include "runtime/sharedRuntime.hpp" 38 #include "runtime/stubCodeGenerator.hpp" 39 #include "runtime/stubRoutines.hpp" 40 #include "runtime/thread.inline.hpp" 41 #ifdef COMPILER2 42 #include "opto/runtime.hpp" 43 #endif 44 #if INCLUDE_ALL_GCS 45 #include "gc/z/zBarrier.inline.hpp" 46 #include "gc/z/zGlobals.hpp" 47 #endif 48 49 // Declaration and definition of StubGenerator (no .hpp file). 50 // For a more detailed description of the stub routine structure 51 // see the comment in stubRoutines.hpp 52 53 #define __ _masm-> 54 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8) 55 #define a__ ((Assembler*)_masm)-> 56 57 #ifdef PRODUCT 58 #define BLOCK_COMMENT(str) /* nothing */ 59 #else 60 #define BLOCK_COMMENT(str) __ block_comment(str) 61 #endif 62 63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 64 const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions 65 66 // Stub Code definitions 67 68 class StubGenerator: public StubCodeGenerator { 69 private: 70 71 #ifdef PRODUCT 72 #define inc_counter_np(counter) ((void)0) 73 #else 74 void inc_counter_np_(int& counter) { 75 // This can destroy rscratch1 if counter is far from the code cache 76 __ incrementl(ExternalAddress((address)&counter)); 77 } 78 #define inc_counter_np(counter) \ 79 BLOCK_COMMENT("inc_counter " #counter); \ 80 inc_counter_np_(counter); 81 #endif 82 83 // Call stubs are used to call Java from C 84 // 85 // Linux Arguments: 86 // c_rarg0: call wrapper address address 87 // c_rarg1: result address 88 // c_rarg2: result type BasicType 89 // c_rarg3: method Method* 90 // c_rarg4: (interpreter) entry point address 91 // c_rarg5: parameters intptr_t* 92 // 16(rbp): parameter size (in words) int 93 // 24(rbp): thread Thread* 94 // 95 // [ return_from_Java ] <--- rsp 96 // [ argument word n ] 97 // ... 98 // -12 [ argument word 1 ] 99 // -11 [ saved r15 ] <--- rsp_after_call 100 // -10 [ saved r14 ] 101 // -9 [ saved r13 ] 102 // -8 [ saved r12 ] 103 // -7 [ saved rbx ] 104 // -6 [ call wrapper ] 105 // -5 [ result ] 106 // -4 [ result type ] 107 // -3 [ method ] 108 // -2 [ entry point ] 109 // -1 [ parameters ] 110 // 0 [ saved rbp ] <--- rbp 111 // 1 [ return address ] 112 // 2 [ parameter size ] 113 // 3 [ thread ] 114 // 115 // Windows Arguments: 116 // c_rarg0: call wrapper address address 117 // c_rarg1: result address 118 // c_rarg2: result type BasicType 119 // c_rarg3: method Method* 120 // 48(rbp): (interpreter) entry point address 121 // 56(rbp): parameters intptr_t* 122 // 64(rbp): parameter size (in words) int 123 // 72(rbp): thread Thread* 124 // 125 // [ return_from_Java ] <--- rsp 126 // [ argument word n ] 127 // ... 128 // -60 [ argument word 1 ] 129 // -59 [ saved xmm31 ] <--- rsp after_call 130 // [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank) 131 // -27 [ saved xmm15 ] 132 // [ saved xmm7-xmm14 ] 133 // -9 [ saved xmm6 ] (each xmm register takes 2 slots) 134 // -7 [ saved r15 ] 135 // -6 [ saved r14 ] 136 // -5 [ saved r13 ] 137 // -4 [ saved r12 ] 138 // -3 [ saved rdi ] 139 // -2 [ saved rsi ] 140 // -1 [ saved rbx ] 141 // 0 [ saved rbp ] <--- rbp 142 // 1 [ return address ] 143 // 2 [ call wrapper ] 144 // 3 [ result ] 145 // 4 [ result type ] 146 // 5 [ method ] 147 // 6 [ entry point ] 148 // 7 [ parameters ] 149 // 8 [ parameter size ] 150 // 9 [ thread ] 151 // 152 // Windows reserves the callers stack space for arguments 1-4. 153 // We spill c_rarg0-c_rarg3 to this space. 154 155 // Call stub stack layout word offsets from rbp 156 enum call_stub_layout { 157 #ifdef _WIN64 158 xmm_save_first = 6, // save from xmm6 159 xmm_save_last = 31, // to xmm31 160 xmm_save_base = -9, 161 rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27 162 r15_off = -7, 163 r14_off = -6, 164 r13_off = -5, 165 r12_off = -4, 166 rdi_off = -3, 167 rsi_off = -2, 168 rbx_off = -1, 169 rbp_off = 0, 170 retaddr_off = 1, 171 call_wrapper_off = 2, 172 result_off = 3, 173 result_type_off = 4, 174 method_off = 5, 175 entry_point_off = 6, 176 parameters_off = 7, 177 parameter_size_off = 8, 178 thread_off = 9 179 #else 180 rsp_after_call_off = -12, 181 mxcsr_off = rsp_after_call_off, 182 r15_off = -11, 183 r14_off = -10, 184 r13_off = -9, 185 r12_off = -8, 186 rbx_off = -7, 187 call_wrapper_off = -6, 188 result_off = -5, 189 result_type_off = -4, 190 method_off = -3, 191 entry_point_off = -2, 192 parameters_off = -1, 193 rbp_off = 0, 194 retaddr_off = 1, 195 parameter_size_off = 2, 196 thread_off = 3 197 #endif 198 }; 199 200 #ifdef _WIN64 201 Address xmm_save(int reg) { 202 assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range"); 203 return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize); 204 } 205 #endif 206 207 address generate_call_stub(address& return_address) { 208 assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 && 209 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 210 "adjust this code"); 211 StubCodeMark mark(this, "StubRoutines", "call_stub"); 212 address start = __ pc(); 213 214 // same as in generate_catch_exception()! 215 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize); 216 217 const Address call_wrapper (rbp, call_wrapper_off * wordSize); 218 const Address result (rbp, result_off * wordSize); 219 const Address result_type (rbp, result_type_off * wordSize); 220 const Address method (rbp, method_off * wordSize); 221 const Address entry_point (rbp, entry_point_off * wordSize); 222 const Address parameters (rbp, parameters_off * wordSize); 223 const Address parameter_size(rbp, parameter_size_off * wordSize); 224 225 // same as in generate_catch_exception()! 226 const Address thread (rbp, thread_off * wordSize); 227 228 const Address r15_save(rbp, r15_off * wordSize); 229 const Address r14_save(rbp, r14_off * wordSize); 230 const Address r13_save(rbp, r13_off * wordSize); 231 const Address r12_save(rbp, r12_off * wordSize); 232 const Address rbx_save(rbp, rbx_off * wordSize); 233 234 // stub code 235 __ enter(); 236 __ subptr(rsp, -rsp_after_call_off * wordSize); 237 238 // save register parameters 239 #ifndef _WIN64 240 __ movptr(parameters, c_rarg5); // parameters 241 __ movptr(entry_point, c_rarg4); // entry_point 242 #endif 243 244 __ movptr(method, c_rarg3); // method 245 __ movl(result_type, c_rarg2); // result type 246 __ movptr(result, c_rarg1); // result 247 __ movptr(call_wrapper, c_rarg0); // call wrapper 248 249 // save regs belonging to calling function 250 __ movptr(rbx_save, rbx); 251 __ movptr(r12_save, r12); 252 __ movptr(r13_save, r13); 253 __ movptr(r14_save, r14); 254 __ movptr(r15_save, r15); 255 if (UseAVX > 2) { 256 __ movl(rbx, 0xffff); 257 __ kmovwl(k1, rbx); 258 } 259 #ifdef _WIN64 260 int last_reg = 15; 261 if (UseAVX > 2) { 262 last_reg = 31; 263 } 264 if (VM_Version::supports_evex()) { 265 for (int i = xmm_save_first; i <= last_reg; i++) { 266 __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0); 267 } 268 } else { 269 for (int i = xmm_save_first; i <= last_reg; i++) { 270 __ movdqu(xmm_save(i), as_XMMRegister(i)); 271 } 272 } 273 274 const Address rdi_save(rbp, rdi_off * wordSize); 275 const Address rsi_save(rbp, rsi_off * wordSize); 276 277 __ movptr(rsi_save, rsi); 278 __ movptr(rdi_save, rdi); 279 #else 280 const Address mxcsr_save(rbp, mxcsr_off * wordSize); 281 { 282 Label skip_ldmx; 283 __ stmxcsr(mxcsr_save); 284 __ movl(rax, mxcsr_save); 285 __ andl(rax, MXCSR_MASK); // Only check control and mask bits 286 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std()); 287 __ cmp32(rax, mxcsr_std); 288 __ jcc(Assembler::equal, skip_ldmx); 289 __ ldmxcsr(mxcsr_std); 290 __ bind(skip_ldmx); 291 } 292 #endif 293 294 // Load up thread register 295 __ movptr(r15_thread, thread); 296 __ reinit_heapbase(); 297 298 #ifdef ASSERT 299 // make sure we have no pending exceptions 300 { 301 Label L; 302 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 303 __ jcc(Assembler::equal, L); 304 __ stop("StubRoutines::call_stub: entered with pending exception"); 305 __ bind(L); 306 } 307 #endif 308 309 // pass parameters if any 310 BLOCK_COMMENT("pass parameters if any"); 311 Label parameters_done; 312 __ movl(c_rarg3, parameter_size); 313 __ testl(c_rarg3, c_rarg3); 314 __ jcc(Assembler::zero, parameters_done); 315 316 Label loop; 317 __ movptr(c_rarg2, parameters); // parameter pointer 318 __ movl(c_rarg1, c_rarg3); // parameter counter is in c_rarg1 319 __ BIND(loop); 320 __ movptr(rax, Address(c_rarg2, 0));// get parameter 321 __ addptr(c_rarg2, wordSize); // advance to next parameter 322 __ decrementl(c_rarg1); // decrement counter 323 __ push(rax); // pass parameter 324 __ jcc(Assembler::notZero, loop); 325 326 // call Java function 327 __ BIND(parameters_done); 328 __ movptr(rbx, method); // get Method* 329 __ movptr(c_rarg1, entry_point); // get entry_point 330 __ mov(r13, rsp); // set sender sp 331 BLOCK_COMMENT("call Java function"); 332 __ call(c_rarg1); 333 334 BLOCK_COMMENT("call_stub_return_address:"); 335 return_address = __ pc(); 336 337 // store result depending on type (everything that is not 338 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 339 __ movptr(c_rarg0, result); 340 Label is_long, is_float, is_double, exit; 341 __ movl(c_rarg1, result_type); 342 __ cmpl(c_rarg1, T_OBJECT); 343 __ jcc(Assembler::equal, is_long); 344 __ cmpl(c_rarg1, T_LONG); 345 __ jcc(Assembler::equal, is_long); 346 __ cmpl(c_rarg1, T_FLOAT); 347 __ jcc(Assembler::equal, is_float); 348 __ cmpl(c_rarg1, T_DOUBLE); 349 __ jcc(Assembler::equal, is_double); 350 351 // handle T_INT case 352 __ movl(Address(c_rarg0, 0), rax); 353 354 __ BIND(exit); 355 356 // pop parameters 357 __ lea(rsp, rsp_after_call); 358 359 #ifdef ASSERT 360 // verify that threads correspond 361 { 362 Label L1, L2, L3; 363 __ cmpptr(r15_thread, thread); 364 __ jcc(Assembler::equal, L1); 365 __ stop("StubRoutines::call_stub: r15_thread is corrupted"); 366 __ bind(L1); 367 __ get_thread(rbx); 368 __ cmpptr(r15_thread, thread); 369 __ jcc(Assembler::equal, L2); 370 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 371 __ bind(L2); 372 __ cmpptr(r15_thread, rbx); 373 __ jcc(Assembler::equal, L3); 374 __ stop("StubRoutines::call_stub: threads must correspond"); 375 __ bind(L3); 376 } 377 #endif 378 379 // restore regs belonging to calling function 380 #ifdef _WIN64 381 // emit the restores for xmm regs 382 if (VM_Version::supports_evex()) { 383 for (int i = xmm_save_first; i <= last_reg; i++) { 384 __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0); 385 } 386 } else { 387 for (int i = xmm_save_first; i <= last_reg; i++) { 388 __ movdqu(as_XMMRegister(i), xmm_save(i)); 389 } 390 } 391 #endif 392 __ movptr(r15, r15_save); 393 __ movptr(r14, r14_save); 394 __ movptr(r13, r13_save); 395 __ movptr(r12, r12_save); 396 __ movptr(rbx, rbx_save); 397 398 #ifdef _WIN64 399 __ movptr(rdi, rdi_save); 400 __ movptr(rsi, rsi_save); 401 #else 402 __ ldmxcsr(mxcsr_save); 403 #endif 404 405 // restore rsp 406 __ addptr(rsp, -rsp_after_call_off * wordSize); 407 408 // return 409 __ vzeroupper(); 410 __ pop(rbp); 411 __ ret(0); 412 413 // handle return types different from T_INT 414 __ BIND(is_long); 415 __ movq(Address(c_rarg0, 0), rax); 416 __ jmp(exit); 417 418 __ BIND(is_float); 419 __ movflt(Address(c_rarg0, 0), xmm0); 420 __ jmp(exit); 421 422 __ BIND(is_double); 423 __ movdbl(Address(c_rarg0, 0), xmm0); 424 __ jmp(exit); 425 426 return start; 427 } 428 429 // Return point for a Java call if there's an exception thrown in 430 // Java code. The exception is caught and transformed into a 431 // pending exception stored in JavaThread that can be tested from 432 // within the VM. 433 // 434 // Note: Usually the parameters are removed by the callee. In case 435 // of an exception crossing an activation frame boundary, that is 436 // not the case if the callee is compiled code => need to setup the 437 // rsp. 438 // 439 // rax: exception oop 440 441 address generate_catch_exception() { 442 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 443 address start = __ pc(); 444 445 // same as in generate_call_stub(): 446 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize); 447 const Address thread (rbp, thread_off * wordSize); 448 449 #ifdef ASSERT 450 // verify that threads correspond 451 { 452 Label L1, L2, L3; 453 __ cmpptr(r15_thread, thread); 454 __ jcc(Assembler::equal, L1); 455 __ stop("StubRoutines::catch_exception: r15_thread is corrupted"); 456 __ bind(L1); 457 __ get_thread(rbx); 458 __ cmpptr(r15_thread, thread); 459 __ jcc(Assembler::equal, L2); 460 __ stop("StubRoutines::catch_exception: r15_thread is modified by call"); 461 __ bind(L2); 462 __ cmpptr(r15_thread, rbx); 463 __ jcc(Assembler::equal, L3); 464 __ stop("StubRoutines::catch_exception: threads must correspond"); 465 __ bind(L3); 466 } 467 #endif 468 469 // set pending exception 470 __ verify_oop(rax); 471 472 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax); 473 __ lea(rscratch1, ExternalAddress((address)__FILE__)); 474 __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1); 475 __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__); 476 477 // complete return to VM 478 assert(StubRoutines::_call_stub_return_address != NULL, 479 "_call_stub_return_address must have been generated before"); 480 __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address)); 481 482 return start; 483 } 484 485 // Continuation point for runtime calls returning with a pending 486 // exception. The pending exception check happened in the runtime 487 // or native call stub. The pending exception in Thread is 488 // converted into a Java-level exception. 489 // 490 // Contract with Java-level exception handlers: 491 // rax: exception 492 // rdx: throwing pc 493 // 494 // NOTE: At entry of this stub, exception-pc must be on stack !! 495 496 address generate_forward_exception() { 497 StubCodeMark mark(this, "StubRoutines", "forward exception"); 498 address start = __ pc(); 499 500 // Upon entry, the sp points to the return address returning into 501 // Java (interpreted or compiled) code; i.e., the return address 502 // becomes the throwing pc. 503 // 504 // Arguments pushed before the runtime call are still on the stack 505 // but the exception handler will reset the stack pointer -> 506 // ignore them. A potential result in registers can be ignored as 507 // well. 508 509 #ifdef ASSERT 510 // make sure this code is only executed if there is a pending exception 511 { 512 Label L; 513 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL); 514 __ jcc(Assembler::notEqual, L); 515 __ stop("StubRoutines::forward exception: no pending exception (1)"); 516 __ bind(L); 517 } 518 #endif 519 520 // compute exception handler into rbx 521 __ movptr(c_rarg0, Address(rsp, 0)); 522 BLOCK_COMMENT("call exception_handler_for_return_address"); 523 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 524 SharedRuntime::exception_handler_for_return_address), 525 r15_thread, c_rarg0); 526 __ mov(rbx, rax); 527 528 // setup rax & rdx, remove return address & clear pending exception 529 __ pop(rdx); 530 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 531 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD); 532 533 #ifdef ASSERT 534 // make sure exception is set 535 { 536 Label L; 537 __ testptr(rax, rax); 538 __ jcc(Assembler::notEqual, L); 539 __ stop("StubRoutines::forward exception: no pending exception (2)"); 540 __ bind(L); 541 } 542 #endif 543 544 // continue at exception handler (return address removed) 545 // rax: exception 546 // rbx: exception handler 547 // rdx: throwing pc 548 __ verify_oop(rax); 549 __ jmp(rbx); 550 551 return start; 552 } 553 554 // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest) 555 // 556 // Arguments : 557 // c_rarg0: exchange_value 558 // c_rarg0: dest 559 // 560 // Result: 561 // *dest <- ex, return (orig *dest) 562 address generate_atomic_xchg() { 563 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 564 address start = __ pc(); 565 566 __ movl(rax, c_rarg0); // Copy to eax we need a return value anyhow 567 __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK 568 __ ret(0); 569 570 return start; 571 } 572 573 // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest) 574 // 575 // Arguments : 576 // c_rarg0: exchange_value 577 // c_rarg1: dest 578 // 579 // Result: 580 // *dest <- ex, return (orig *dest) 581 address generate_atomic_xchg_long() { 582 StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long"); 583 address start = __ pc(); 584 585 __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow 586 __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK 587 __ ret(0); 588 589 return start; 590 } 591 592 // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest, 593 // jint compare_value) 594 // 595 // Arguments : 596 // c_rarg0: exchange_value 597 // c_rarg1: dest 598 // c_rarg2: compare_value 599 // 600 // Result: 601 // if ( compare_value == *dest ) { 602 // *dest = exchange_value 603 // return compare_value; 604 // else 605 // return *dest; 606 address generate_atomic_cmpxchg() { 607 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg"); 608 address start = __ pc(); 609 610 __ movl(rax, c_rarg2); 611 if ( os::is_MP() ) __ lock(); 612 __ cmpxchgl(c_rarg0, Address(c_rarg1, 0)); 613 __ ret(0); 614 615 return start; 616 } 617 618 // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest, 619 // int8_t compare_value) 620 // 621 // Arguments : 622 // c_rarg0: exchange_value 623 // c_rarg1: dest 624 // c_rarg2: compare_value 625 // 626 // Result: 627 // if ( compare_value == *dest ) { 628 // *dest = exchange_value 629 // return compare_value; 630 // else 631 // return *dest; 632 address generate_atomic_cmpxchg_byte() { 633 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte"); 634 address start = __ pc(); 635 636 __ movsbq(rax, c_rarg2); 637 if ( os::is_MP() ) __ lock(); 638 __ cmpxchgb(c_rarg0, Address(c_rarg1, 0)); 639 __ ret(0); 640 641 return start; 642 } 643 644 // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value, 645 // volatile int64_t* dest, 646 // int64_t compare_value) 647 // Arguments : 648 // c_rarg0: exchange_value 649 // c_rarg1: dest 650 // c_rarg2: compare_value 651 // 652 // Result: 653 // if ( compare_value == *dest ) { 654 // *dest = exchange_value 655 // return compare_value; 656 // else 657 // return *dest; 658 address generate_atomic_cmpxchg_long() { 659 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 660 address start = __ pc(); 661 662 __ movq(rax, c_rarg2); 663 if ( os::is_MP() ) __ lock(); 664 __ cmpxchgq(c_rarg0, Address(c_rarg1, 0)); 665 __ ret(0); 666 667 return start; 668 } 669 670 // Support for jint atomic::add(jint add_value, volatile jint* dest) 671 // 672 // Arguments : 673 // c_rarg0: add_value 674 // c_rarg1: dest 675 // 676 // Result: 677 // *dest += add_value 678 // return *dest; 679 address generate_atomic_add() { 680 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 681 address start = __ pc(); 682 683 __ movl(rax, c_rarg0); 684 if ( os::is_MP() ) __ lock(); 685 __ xaddl(Address(c_rarg1, 0), c_rarg0); 686 __ addl(rax, c_rarg0); 687 __ ret(0); 688 689 return start; 690 } 691 692 // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest) 693 // 694 // Arguments : 695 // c_rarg0: add_value 696 // c_rarg1: dest 697 // 698 // Result: 699 // *dest += add_value 700 // return *dest; 701 address generate_atomic_add_long() { 702 StubCodeMark mark(this, "StubRoutines", "atomic_add_long"); 703 address start = __ pc(); 704 705 __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow 706 if ( os::is_MP() ) __ lock(); 707 __ xaddptr(Address(c_rarg1, 0), c_rarg0); 708 __ addptr(rax, c_rarg0); 709 __ ret(0); 710 711 return start; 712 } 713 714 // Support for intptr_t OrderAccess::fence() 715 // 716 // Arguments : 717 // 718 // Result: 719 address generate_orderaccess_fence() { 720 StubCodeMark mark(this, "StubRoutines", "orderaccess_fence"); 721 address start = __ pc(); 722 __ membar(Assembler::StoreLoad); 723 __ ret(0); 724 725 return start; 726 } 727 728 // Support for intptr_t get_previous_fp() 729 // 730 // This routine is used to find the previous frame pointer for the 731 // caller (current_frame_guess). This is used as part of debugging 732 // ps() is seemingly lost trying to find frames. 733 // This code assumes that caller current_frame_guess) has a frame. 734 address generate_get_previous_fp() { 735 StubCodeMark mark(this, "StubRoutines", "get_previous_fp"); 736 const Address old_fp(rbp, 0); 737 const Address older_fp(rax, 0); 738 address start = __ pc(); 739 740 __ enter(); 741 __ movptr(rax, old_fp); // callers fp 742 __ movptr(rax, older_fp); // the frame for ps() 743 __ pop(rbp); 744 __ ret(0); 745 746 return start; 747 } 748 749 // Support for intptr_t get_previous_sp() 750 // 751 // This routine is used to find the previous stack pointer for the 752 // caller. 753 address generate_get_previous_sp() { 754 StubCodeMark mark(this, "StubRoutines", "get_previous_sp"); 755 address start = __ pc(); 756 757 __ movptr(rax, rsp); 758 __ addptr(rax, 8); // return address is at the top of the stack. 759 __ ret(0); 760 761 return start; 762 } 763 764 //---------------------------------------------------------------------------------------------------- 765 // Support for void verify_mxcsr() 766 // 767 // This routine is used with -Xcheck:jni to verify that native 768 // JNI code does not return to Java code without restoring the 769 // MXCSR register to our expected state. 770 771 address generate_verify_mxcsr() { 772 StubCodeMark mark(this, "StubRoutines", "verify_mxcsr"); 773 address start = __ pc(); 774 775 const Address mxcsr_save(rsp, 0); 776 777 if (CheckJNICalls) { 778 Label ok_ret; 779 ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std()); 780 __ push(rax); 781 __ subptr(rsp, wordSize); // allocate a temp location 782 __ stmxcsr(mxcsr_save); 783 __ movl(rax, mxcsr_save); 784 __ andl(rax, MXCSR_MASK); // Only check control and mask bits 785 __ cmp32(rax, mxcsr_std); 786 __ jcc(Assembler::equal, ok_ret); 787 788 __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall"); 789 790 __ ldmxcsr(mxcsr_std); 791 792 __ bind(ok_ret); 793 __ addptr(rsp, wordSize); 794 __ pop(rax); 795 } 796 797 __ ret(0); 798 799 return start; 800 } 801 802 address generate_load_barrier_stub(Register raddr, address runtime_entry, bool is_weak) { 803 char *name = (char *)NULL; 804 { 805 ResourceMark rm; 806 stringStream ss; 807 if (is_weak) { 808 ss.print("load_barrier_weak_slow_stub_%s", raddr->name()); 809 } else { 810 ss.print("load_barrier_slow_stub_%s", raddr->name()); 811 } 812 name = os::strdup(ss.as_string(),mtCode); 813 } 814 __ align(CodeEntryAlignment); 815 StubCodeMark mark(this, "StubRoutines", name); 816 address start = __ pc(); 817 818 // save live registers 819 if (raddr != rax) { 820 __ push(rax); 821 } 822 if (raddr != rcx) { 823 __ push(rcx); 824 } 825 if (raddr != rdx) { 826 __ push(rdx); 827 } 828 if (raddr != rsi) { 829 __ push(rsi); 830 } 831 if (raddr != rdi) { 832 __ push(rdi); 833 } 834 if (raddr != r8) { 835 __ push(r8); 836 } 837 if (raddr != r9) { 838 __ push(r9); 839 } 840 if (raddr != r10) { 841 __ push(r10); 842 } 843 if (raddr != r11) { 844 __ push(r11); 845 } 846 847 __ movq(c_rarg1,raddr); 848 __ movq(c_rarg0,Address(c_rarg1,0)); 849 __ call_VM_leaf(runtime_entry, c_rarg0, c_rarg1); 850 851 // restore saved registers 852 if (raddr != r11) { 853 __ pop(r11); 854 } 855 if (raddr != r10) { 856 __ pop(r10); 857 } 858 if (raddr != r9) { 859 __ pop(r9); 860 } 861 if (raddr != r8) { 862 __ pop(r8); 863 } 864 if (raddr != rdi) { 865 __ pop(rdi); 866 } 867 if (raddr != rsi) { 868 __ pop(rsi); 869 } 870 if (raddr != rdx) { 871 __ pop(rdx); 872 } 873 if (raddr != rcx) { 874 __ pop(rcx); 875 } 876 if (raddr != rax) { 877 __ movq(raddr,rax); 878 __ pop(rax); 879 } 880 881 __ ret(0); 882 883 return start; 884 } 885 886 address generate_f2i_fixup() { 887 StubCodeMark mark(this, "StubRoutines", "f2i_fixup"); 888 Address inout(rsp, 5 * wordSize); // return address + 4 saves 889 890 address start = __ pc(); 891 892 Label L; 893 894 __ push(rax); 895 __ push(c_rarg3); 896 __ push(c_rarg2); 897 __ push(c_rarg1); 898 899 __ movl(rax, 0x7f800000); 900 __ xorl(c_rarg3, c_rarg3); 901 __ movl(c_rarg2, inout); 902 __ movl(c_rarg1, c_rarg2); 903 __ andl(c_rarg1, 0x7fffffff); 904 __ cmpl(rax, c_rarg1); // NaN? -> 0 905 __ jcc(Assembler::negative, L); 906 __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint 907 __ movl(c_rarg3, 0x80000000); 908 __ movl(rax, 0x7fffffff); 909 __ cmovl(Assembler::positive, c_rarg3, rax); 910 911 __ bind(L); 912 __ movptr(inout, c_rarg3); 913 914 __ pop(c_rarg1); 915 __ pop(c_rarg2); 916 __ pop(c_rarg3); 917 __ pop(rax); 918 919 __ ret(0); 920 921 return start; 922 } 923 924 address generate_f2l_fixup() { 925 StubCodeMark mark(this, "StubRoutines", "f2l_fixup"); 926 Address inout(rsp, 5 * wordSize); // return address + 4 saves 927 address start = __ pc(); 928 929 Label L; 930 931 __ push(rax); 932 __ push(c_rarg3); 933 __ push(c_rarg2); 934 __ push(c_rarg1); 935 936 __ movl(rax, 0x7f800000); 937 __ xorl(c_rarg3, c_rarg3); 938 __ movl(c_rarg2, inout); 939 __ movl(c_rarg1, c_rarg2); 940 __ andl(c_rarg1, 0x7fffffff); 941 __ cmpl(rax, c_rarg1); // NaN? -> 0 942 __ jcc(Assembler::negative, L); 943 __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong 944 __ mov64(c_rarg3, 0x8000000000000000); 945 __ mov64(rax, 0x7fffffffffffffff); 946 __ cmov(Assembler::positive, c_rarg3, rax); 947 948 __ bind(L); 949 __ movptr(inout, c_rarg3); 950 951 __ pop(c_rarg1); 952 __ pop(c_rarg2); 953 __ pop(c_rarg3); 954 __ pop(rax); 955 956 __ ret(0); 957 958 return start; 959 } 960 961 address generate_d2i_fixup() { 962 StubCodeMark mark(this, "StubRoutines", "d2i_fixup"); 963 Address inout(rsp, 6 * wordSize); // return address + 5 saves 964 965 address start = __ pc(); 966 967 Label L; 968 969 __ push(rax); 970 __ push(c_rarg3); 971 __ push(c_rarg2); 972 __ push(c_rarg1); 973 __ push(c_rarg0); 974 975 __ movl(rax, 0x7ff00000); 976 __ movq(c_rarg2, inout); 977 __ movl(c_rarg3, c_rarg2); 978 __ mov(c_rarg1, c_rarg2); 979 __ mov(c_rarg0, c_rarg2); 980 __ negl(c_rarg3); 981 __ shrptr(c_rarg1, 0x20); 982 __ orl(c_rarg3, c_rarg2); 983 __ andl(c_rarg1, 0x7fffffff); 984 __ xorl(c_rarg2, c_rarg2); 985 __ shrl(c_rarg3, 0x1f); 986 __ orl(c_rarg1, c_rarg3); 987 __ cmpl(rax, c_rarg1); 988 __ jcc(Assembler::negative, L); // NaN -> 0 989 __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint 990 __ movl(c_rarg2, 0x80000000); 991 __ movl(rax, 0x7fffffff); 992 __ cmov(Assembler::positive, c_rarg2, rax); 993 994 __ bind(L); 995 __ movptr(inout, c_rarg2); 996 997 __ pop(c_rarg0); 998 __ pop(c_rarg1); 999 __ pop(c_rarg2); 1000 __ pop(c_rarg3); 1001 __ pop(rax); 1002 1003 __ ret(0); 1004 1005 return start; 1006 } 1007 1008 address generate_d2l_fixup() { 1009 StubCodeMark mark(this, "StubRoutines", "d2l_fixup"); 1010 Address inout(rsp, 6 * wordSize); // return address + 5 saves 1011 1012 address start = __ pc(); 1013 1014 Label L; 1015 1016 __ push(rax); 1017 __ push(c_rarg3); 1018 __ push(c_rarg2); 1019 __ push(c_rarg1); 1020 __ push(c_rarg0); 1021 1022 __ movl(rax, 0x7ff00000); 1023 __ movq(c_rarg2, inout); 1024 __ movl(c_rarg3, c_rarg2); 1025 __ mov(c_rarg1, c_rarg2); 1026 __ mov(c_rarg0, c_rarg2); 1027 __ negl(c_rarg3); 1028 __ shrptr(c_rarg1, 0x20); 1029 __ orl(c_rarg3, c_rarg2); 1030 __ andl(c_rarg1, 0x7fffffff); 1031 __ xorl(c_rarg2, c_rarg2); 1032 __ shrl(c_rarg3, 0x1f); 1033 __ orl(c_rarg1, c_rarg3); 1034 __ cmpl(rax, c_rarg1); 1035 __ jcc(Assembler::negative, L); // NaN -> 0 1036 __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong 1037 __ mov64(c_rarg2, 0x8000000000000000); 1038 __ mov64(rax, 0x7fffffffffffffff); 1039 __ cmovq(Assembler::positive, c_rarg2, rax); 1040 1041 __ bind(L); 1042 __ movq(inout, c_rarg2); 1043 1044 __ pop(c_rarg0); 1045 __ pop(c_rarg1); 1046 __ pop(c_rarg2); 1047 __ pop(c_rarg3); 1048 __ pop(rax); 1049 1050 __ ret(0); 1051 1052 return start; 1053 } 1054 1055 address generate_fp_mask(const char *stub_name, int64_t mask) { 1056 __ align(CodeEntryAlignment); 1057 StubCodeMark mark(this, "StubRoutines", stub_name); 1058 address start = __ pc(); 1059 1060 __ emit_data64( mask, relocInfo::none ); 1061 __ emit_data64( mask, relocInfo::none ); 1062 1063 return start; 1064 } 1065 1066 // Non-destructive plausibility checks for oops 1067 // 1068 // Arguments: 1069 // all args on stack! 1070 // 1071 // Stack after saving c_rarg3: 1072 // [tos + 0]: saved c_rarg3 1073 // [tos + 1]: saved c_rarg2 1074 // [tos + 2]: saved r12 (several TemplateTable methods use it) 1075 // [tos + 3]: saved flags 1076 // [tos + 4]: return address 1077 // * [tos + 5]: error message (char*) 1078 // * [tos + 6]: object to verify (oop) 1079 // * [tos + 7]: saved rax - saved by caller and bashed 1080 // * [tos + 8]: saved r10 (rscratch1) - saved by caller 1081 // * = popped on exit 1082 address generate_verify_oop() { 1083 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 1084 address start = __ pc(); 1085 1086 Label exit, error; 1087 1088 __ pushf(); 1089 __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 1090 1091 __ push(r12); 1092 1093 // save c_rarg2 and c_rarg3 1094 __ push(c_rarg2); 1095 __ push(c_rarg3); 1096 1097 enum { 1098 // After previous pushes. 1099 oop_to_verify = 6 * wordSize, 1100 saved_rax = 7 * wordSize, 1101 saved_r10 = 8 * wordSize, 1102 1103 // Before the call to MacroAssembler::debug(), see below. 1104 return_addr = 16 * wordSize, 1105 error_msg = 17 * wordSize 1106 }; 1107 1108 // get object 1109 __ movptr(rax, Address(rsp, oop_to_verify)); 1110 1111 // make sure object is 'reasonable' 1112 __ testptr(rax, rax); 1113 __ jcc(Assembler::zero, exit); // if obj is NULL it is OK 1114 1115 if (UseLoadBarrier) { 1116 // Check if metadata bits indicate a bad oop 1117 __ testptr(rax, ExternalAddress((address)&ZAddressBadMask)); 1118 __ jcc(Assembler::notZero, error); 1119 } 1120 1121 // Check if the oop is in the right area of memory 1122 __ movptr(c_rarg2, rax); 1123 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 1124 __ andptr(c_rarg2, c_rarg3); 1125 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 1126 __ cmpptr(c_rarg2, c_rarg3); 1127 __ jcc(Assembler::notZero, error); 1128 1129 // set r12 to heapbase for load_klass() 1130 __ reinit_heapbase(); 1131 1132 // make sure klass is 'reasonable', which is not zero. 1133 __ load_klass(rax, rax); // get klass 1134 __ testptr(rax, rax); 1135 __ jcc(Assembler::zero, error); // if klass is NULL it is broken 1136 1137 // return if everything seems ok 1138 __ bind(exit); 1139 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back 1140 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back 1141 __ pop(c_rarg3); // restore c_rarg3 1142 __ pop(c_rarg2); // restore c_rarg2 1143 __ pop(r12); // restore r12 1144 __ popf(); // restore flags 1145 __ ret(4 * wordSize); // pop caller saved stuff 1146 1147 // handle errors 1148 __ bind(error); 1149 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back 1150 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back 1151 __ pop(c_rarg3); // get saved c_rarg3 back 1152 __ pop(c_rarg2); // get saved c_rarg2 back 1153 __ pop(r12); // get saved r12 back 1154 __ popf(); // get saved flags off stack -- 1155 // will be ignored 1156 1157 __ pusha(); // push registers 1158 // (rip is already 1159 // already pushed) 1160 // debug(char* msg, int64_t pc, int64_t regs[]) 1161 // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and 1162 // pushed all the registers, so now the stack looks like: 1163 // [tos + 0] 16 saved registers 1164 // [tos + 16] return address 1165 // * [tos + 17] error message (char*) 1166 // * [tos + 18] object to verify (oop) 1167 // * [tos + 19] saved rax - saved by caller and bashed 1168 // * [tos + 20] saved r10 (rscratch1) - saved by caller 1169 // * = popped on exit 1170 1171 __ movptr(c_rarg0, Address(rsp, error_msg)); // pass address of error message 1172 __ movptr(c_rarg1, Address(rsp, return_addr)); // pass return address 1173 __ movq(c_rarg2, rsp); // pass address of regs on stack 1174 __ mov(r12, rsp); // remember rsp 1175 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows 1176 __ andptr(rsp, -16); // align stack as required by ABI 1177 BLOCK_COMMENT("call MacroAssembler::debug"); 1178 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); 1179 __ mov(rsp, r12); // restore rsp 1180 __ popa(); // pop registers (includes r12) 1181 __ ret(4 * wordSize); // pop caller saved stuff 1182 1183 return start; 1184 } 1185 1186 // 1187 // Verify that a register contains clean 32-bits positive value 1188 // (high 32-bits are 0) so it could be used in 64-bits shifts. 1189 // 1190 // Input: 1191 // Rint - 32-bits value 1192 // Rtmp - scratch 1193 // 1194 void assert_clean_int(Register Rint, Register Rtmp) { 1195 #ifdef ASSERT 1196 Label L; 1197 assert_different_registers(Rtmp, Rint); 1198 __ movslq(Rtmp, Rint); 1199 __ cmpq(Rtmp, Rint); 1200 __ jcc(Assembler::equal, L); 1201 __ stop("high 32-bits of int value are not 0"); 1202 __ bind(L); 1203 #endif 1204 } 1205 1206 // Generate overlap test for array copy stubs 1207 // 1208 // Input: 1209 // c_rarg0 - from 1210 // c_rarg1 - to 1211 // c_rarg2 - element count 1212 // 1213 // Output: 1214 // rax - &from[element count - 1] 1215 // 1216 void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) { 1217 assert(no_overlap_target != NULL, "must be generated"); 1218 array_overlap_test(no_overlap_target, NULL, sf); 1219 } 1220 void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) { 1221 array_overlap_test(NULL, &L_no_overlap, sf); 1222 } 1223 void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) { 1224 const Register from = c_rarg0; 1225 const Register to = c_rarg1; 1226 const Register count = c_rarg2; 1227 const Register end_from = rax; 1228 1229 __ cmpptr(to, from); 1230 __ lea(end_from, Address(from, count, sf, 0)); 1231 if (NOLp == NULL) { 1232 ExternalAddress no_overlap(no_overlap_target); 1233 __ jump_cc(Assembler::belowEqual, no_overlap); 1234 __ cmpptr(to, end_from); 1235 __ jump_cc(Assembler::aboveEqual, no_overlap); 1236 } else { 1237 __ jcc(Assembler::belowEqual, (*NOLp)); 1238 __ cmpptr(to, end_from); 1239 __ jcc(Assembler::aboveEqual, (*NOLp)); 1240 } 1241 } 1242 1243 // Shuffle first three arg regs on Windows into Linux/Solaris locations. 1244 // 1245 // Outputs: 1246 // rdi - rcx 1247 // rsi - rdx 1248 // rdx - r8 1249 // rcx - r9 1250 // 1251 // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter 1252 // are non-volatile. r9 and r10 should not be used by the caller. 1253 // 1254 void setup_arg_regs(int nargs = 3) { 1255 const Register saved_rdi = r9; 1256 const Register saved_rsi = r10; 1257 assert(nargs == 3 || nargs == 4, "else fix"); 1258 #ifdef _WIN64 1259 assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, 1260 "unexpected argument registers"); 1261 if (nargs >= 4) 1262 __ mov(rax, r9); // r9 is also saved_rdi 1263 __ movptr(saved_rdi, rdi); 1264 __ movptr(saved_rsi, rsi); 1265 __ mov(rdi, rcx); // c_rarg0 1266 __ mov(rsi, rdx); // c_rarg1 1267 __ mov(rdx, r8); // c_rarg2 1268 if (nargs >= 4) 1269 __ mov(rcx, rax); // c_rarg3 (via rax) 1270 #else 1271 assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx, 1272 "unexpected argument registers"); 1273 #endif 1274 } 1275 1276 void restore_arg_regs() { 1277 const Register saved_rdi = r9; 1278 const Register saved_rsi = r10; 1279 #ifdef _WIN64 1280 __ movptr(rdi, saved_rdi); 1281 __ movptr(rsi, saved_rsi); 1282 #endif 1283 } 1284 1285 // Generate code for an array write pre barrier 1286 // 1287 // addr - starting address 1288 // count - element count 1289 // 1290 // Destroy no registers! 1291 // 1292 void gen_load_ref_array_barrier(Register addr, Register count) { 1293 BarrierSet* bs = Universe::heap()->barrier_set(); 1294 switch (bs->kind()) { 1295 case BarrierSet::Z: 1296 __ pusha(); // push registers 1297 if (count == c_rarg0) { 1298 if (addr == c_rarg1) { 1299 // exactly backwards!! 1300 __ xchgptr(c_rarg1, c_rarg0); 1301 } else { 1302 __ movptr(c_rarg1, count); 1303 __ movptr(c_rarg0, addr); 1304 } 1305 } else { 1306 __ movptr(c_rarg0, addr); 1307 __ movptr(c_rarg1, count); 1308 } 1309 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<void (*)(volatile oop*, size_t)>(ZBarrier::load_barrier_on_oop_array)), 2); 1310 __ popa(); 1311 break; 1312 case BarrierSet::G1SATBCTLogging: 1313 case BarrierSet::CardTableModRef: 1314 case BarrierSet::CardTableForRS: 1315 case BarrierSet::CardTableExtension: 1316 case BarrierSet::ModRef: 1317 // No barrier 1318 break; 1319 default: 1320 ShouldNotReachHere(); 1321 break; 1322 } 1323 } 1324 1325 // Generate code for an array write pre barrier 1326 // 1327 // addr - starting address 1328 // count - element count 1329 // tmp - scratch register 1330 // 1331 // Destroy no registers! 1332 // 1333 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { 1334 BarrierSet* bs = Universe::heap()->barrier_set(); 1335 switch (bs->kind()) { 1336 case BarrierSet::G1SATBCTLogging: 1337 // With G1, don't generate the call if we statically know that the target in uninitialized 1338 if (!dest_uninitialized) { 1339 Label filtered; 1340 Address in_progress(r15_thread, in_bytes(JavaThread::satb_mark_queue_offset() + 1341 SATBMarkQueue::byte_offset_of_active())); 1342 // Is marking active? 1343 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 1344 __ cmpl(in_progress, 0); 1345 } else { 1346 assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 1347 __ cmpb(in_progress, 0); 1348 } 1349 __ jcc(Assembler::equal, filtered); 1350 1351 __ pusha(); // push registers 1352 if (count == c_rarg0) { 1353 if (addr == c_rarg1) { 1354 // exactly backwards!! 1355 __ xchgptr(c_rarg1, c_rarg0); 1356 } else { 1357 __ movptr(c_rarg1, count); 1358 __ movptr(c_rarg0, addr); 1359 } 1360 } else { 1361 __ movptr(c_rarg0, addr); 1362 __ movptr(c_rarg1, count); 1363 } 1364 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 1365 __ popa(); 1366 1367 __ bind(filtered); 1368 } 1369 break; 1370 case BarrierSet::CardTableForRS: 1371 case BarrierSet::CardTableExtension: 1372 case BarrierSet::ModRef: 1373 case BarrierSet::Z: 1374 // No barrier 1375 break; 1376 default: 1377 ShouldNotReachHere(); 1378 1379 } 1380 } 1381 1382 // 1383 // Generate code for an array write post barrier 1384 // 1385 // Input: 1386 // start - register containing starting address of destination array 1387 // count - elements count 1388 // scratch - scratch register 1389 // 1390 // The input registers are overwritten. 1391 // 1392 void gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) { 1393 assert_different_registers(start, count, scratch); 1394 BarrierSet* bs = Universe::heap()->barrier_set(); 1395 switch (bs->kind()) { 1396 case BarrierSet::G1SATBCTLogging: 1397 { 1398 __ pusha(); // push registers (overkill) 1399 if (c_rarg0 == count) { // On win64 c_rarg0 == rcx 1400 assert_different_registers(c_rarg1, start); 1401 __ mov(c_rarg1, count); 1402 __ mov(c_rarg0, start); 1403 } else { 1404 assert_different_registers(c_rarg0, count); 1405 __ mov(c_rarg0, start); 1406 __ mov(c_rarg1, count); 1407 } 1408 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2); 1409 __ popa(); 1410 } 1411 break; 1412 case BarrierSet::CardTableForRS: 1413 case BarrierSet::CardTableExtension: 1414 { 1415 CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs); 1416 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 1417 1418 Label L_loop, L_done; 1419 const Register end = count; 1420 1421 __ testl(count, count); 1422 __ jcc(Assembler::zero, L_done); // zero count - nothing to do 1423 1424 __ leaq(end, Address(start, count, TIMES_OOP, 0)); // end == start+count*oop_size 1425 __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive 1426 __ shrptr(start, CardTableModRefBS::card_shift); 1427 __ shrptr(end, CardTableModRefBS::card_shift); 1428 __ subptr(end, start); // end --> cards count 1429 1430 int64_t disp = (int64_t) ct->byte_map_base; 1431 __ mov64(scratch, disp); 1432 __ addptr(start, scratch); 1433 __ BIND(L_loop); 1434 __ movb(Address(start, count, Address::times_1), 0); 1435 __ decrement(count); 1436 __ jcc(Assembler::greaterEqual, L_loop); 1437 __ BIND(L_done); 1438 } 1439 break; 1440 case BarrierSet::Z: 1441 // No barrier 1442 break; 1443 default: 1444 ShouldNotReachHere(); 1445 1446 } 1447 } 1448 1449 1450 // Copy big chunks forward 1451 // 1452 // Inputs: 1453 // end_from - source arrays end address 1454 // end_to - destination array end address 1455 // qword_count - 64-bits element count, negative 1456 // to - scratch 1457 // L_copy_bytes - entry label 1458 // L_copy_8_bytes - exit label 1459 // 1460 void copy_bytes_forward(Register end_from, Register end_to, 1461 Register qword_count, Register to, 1462 Label& L_copy_bytes, Label& L_copy_8_bytes) { 1463 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1464 Label L_loop; 1465 __ align(OptoLoopAlignment); 1466 if (UseUnalignedLoadStores) { 1467 Label L_end; 1468 if (UseAVX > 2) { 1469 __ movl(to, 0xffff); 1470 __ kmovwl(k1, to); 1471 } 1472 // Copy 64-bytes per iteration 1473 __ BIND(L_loop); 1474 if (UseAVX > 2) { 1475 __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); 1476 __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); 1477 } else if (UseAVX == 2) { 1478 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); 1479 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); 1480 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); 1481 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); 1482 } else { 1483 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); 1484 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); 1485 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); 1486 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); 1487 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); 1488 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); 1489 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); 1490 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); 1491 } 1492 __ BIND(L_copy_bytes); 1493 __ addptr(qword_count, 8); 1494 __ jcc(Assembler::lessEqual, L_loop); 1495 __ subptr(qword_count, 4); // sub(8) and add(4) 1496 __ jccb(Assembler::greater, L_end); 1497 // Copy trailing 32 bytes 1498 if (UseAVX >= 2) { 1499 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 1500 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); 1501 } else { 1502 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); 1503 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); 1504 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8)); 1505 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1); 1506 } 1507 __ addptr(qword_count, 4); 1508 __ BIND(L_end); 1509 if (UseAVX >= 2) { 1510 // clean upper bits of YMM registers 1511 __ vpxor(xmm0, xmm0); 1512 __ vpxor(xmm1, xmm1); 1513 } 1514 } else { 1515 // Copy 32-bytes per iteration 1516 __ BIND(L_loop); 1517 __ movq(to, Address(end_from, qword_count, Address::times_8, -24)); 1518 __ movq(Address(end_to, qword_count, Address::times_8, -24), to); 1519 __ movq(to, Address(end_from, qword_count, Address::times_8, -16)); 1520 __ movq(Address(end_to, qword_count, Address::times_8, -16), to); 1521 __ movq(to, Address(end_from, qword_count, Address::times_8, - 8)); 1522 __ movq(Address(end_to, qword_count, Address::times_8, - 8), to); 1523 __ movq(to, Address(end_from, qword_count, Address::times_8, - 0)); 1524 __ movq(Address(end_to, qword_count, Address::times_8, - 0), to); 1525 1526 __ BIND(L_copy_bytes); 1527 __ addptr(qword_count, 4); 1528 __ jcc(Assembler::lessEqual, L_loop); 1529 } 1530 __ subptr(qword_count, 4); 1531 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords 1532 } 1533 1534 // Copy big chunks backward 1535 // 1536 // Inputs: 1537 // from - source arrays address 1538 // dest - destination array address 1539 // qword_count - 64-bits element count 1540 // to - scratch 1541 // L_copy_bytes - entry label 1542 // L_copy_8_bytes - exit label 1543 // 1544 void copy_bytes_backward(Register from, Register dest, 1545 Register qword_count, Register to, 1546 Label& L_copy_bytes, Label& L_copy_8_bytes) { 1547 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1548 Label L_loop; 1549 __ align(OptoLoopAlignment); 1550 if (UseUnalignedLoadStores) { 1551 Label L_end; 1552 if (UseAVX > 2) { 1553 __ movl(to, 0xffff); 1554 __ kmovwl(k1, to); 1555 } 1556 // Copy 64-bytes per iteration 1557 __ BIND(L_loop); 1558 if (UseAVX > 2) { 1559 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); 1560 __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); 1561 } else if (UseAVX == 2) { 1562 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); 1563 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); 1564 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1565 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1566 } else { 1567 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); 1568 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); 1569 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); 1570 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); 1571 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); 1572 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); 1573 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); 1574 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); 1575 } 1576 __ BIND(L_copy_bytes); 1577 __ subptr(qword_count, 8); 1578 __ jcc(Assembler::greaterEqual, L_loop); 1579 1580 __ addptr(qword_count, 4); // add(8) and sub(4) 1581 __ jccb(Assembler::less, L_end); 1582 // Copy trailing 32 bytes 1583 if (UseAVX >= 2) { 1584 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); 1585 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); 1586 } else { 1587 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 1588 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); 1589 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1590 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1591 } 1592 __ subptr(qword_count, 4); 1593 __ BIND(L_end); 1594 if (UseAVX >= 2) { 1595 // clean upper bits of YMM registers 1596 __ vpxor(xmm0, xmm0); 1597 __ vpxor(xmm1, xmm1); 1598 } 1599 } else { 1600 // Copy 32-bytes per iteration 1601 __ BIND(L_loop); 1602 __ movq(to, Address(from, qword_count, Address::times_8, 24)); 1603 __ movq(Address(dest, qword_count, Address::times_8, 24), to); 1604 __ movq(to, Address(from, qword_count, Address::times_8, 16)); 1605 __ movq(Address(dest, qword_count, Address::times_8, 16), to); 1606 __ movq(to, Address(from, qword_count, Address::times_8, 8)); 1607 __ movq(Address(dest, qword_count, Address::times_8, 8), to); 1608 __ movq(to, Address(from, qword_count, Address::times_8, 0)); 1609 __ movq(Address(dest, qword_count, Address::times_8, 0), to); 1610 1611 __ BIND(L_copy_bytes); 1612 __ subptr(qword_count, 4); 1613 __ jcc(Assembler::greaterEqual, L_loop); 1614 } 1615 __ addptr(qword_count, 4); 1616 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords 1617 } 1618 1619 1620 // Arguments: 1621 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1622 // ignored 1623 // name - stub name string 1624 // 1625 // Inputs: 1626 // c_rarg0 - source array address 1627 // c_rarg1 - destination array address 1628 // c_rarg2 - element count, treated as ssize_t, can be zero 1629 // 1630 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1631 // we let the hardware handle it. The one to eight bytes within words, 1632 // dwords or qwords that span cache line boundaries will still be loaded 1633 // and stored atomically. 1634 // 1635 // Side Effects: 1636 // disjoint_byte_copy_entry is set to the no-overlap entry point 1637 // used by generate_conjoint_byte_copy(). 1638 // 1639 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1640 __ align(CodeEntryAlignment); 1641 StubCodeMark mark(this, "StubRoutines", name); 1642 address start = __ pc(); 1643 1644 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1645 Label L_copy_byte, L_exit; 1646 const Register from = rdi; // source array address 1647 const Register to = rsi; // destination array address 1648 const Register count = rdx; // elements count 1649 const Register byte_count = rcx; 1650 const Register qword_count = count; 1651 const Register end_from = from; // source array end address 1652 const Register end_to = to; // destination array end address 1653 // End pointers are inclusive, and if count is not zero they point 1654 // to the last unit copied: end_to[0] := end_from[0] 1655 1656 __ enter(); // required for proper stackwalking of RuntimeStub frame 1657 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1658 1659 if (entry != NULL) { 1660 *entry = __ pc(); 1661 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1662 BLOCK_COMMENT("Entry:"); 1663 } 1664 1665 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1666 // r9 and r10 may be used to save non-volatile registers 1667 1668 // 'from', 'to' and 'count' are now valid 1669 __ movptr(byte_count, count); 1670 __ shrptr(count, 3); // count => qword_count 1671 1672 // Copy from low to high addresses. Use 'to' as scratch. 1673 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1674 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1675 __ negptr(qword_count); // make the count negative 1676 __ jmp(L_copy_bytes); 1677 1678 // Copy trailing qwords 1679 __ BIND(L_copy_8_bytes); 1680 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1681 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1682 __ increment(qword_count); 1683 __ jcc(Assembler::notZero, L_copy_8_bytes); 1684 1685 // Check for and copy trailing dword 1686 __ BIND(L_copy_4_bytes); 1687 __ testl(byte_count, 4); 1688 __ jccb(Assembler::zero, L_copy_2_bytes); 1689 __ movl(rax, Address(end_from, 8)); 1690 __ movl(Address(end_to, 8), rax); 1691 1692 __ addptr(end_from, 4); 1693 __ addptr(end_to, 4); 1694 1695 // Check for and copy trailing word 1696 __ BIND(L_copy_2_bytes); 1697 __ testl(byte_count, 2); 1698 __ jccb(Assembler::zero, L_copy_byte); 1699 __ movw(rax, Address(end_from, 8)); 1700 __ movw(Address(end_to, 8), rax); 1701 1702 __ addptr(end_from, 2); 1703 __ addptr(end_to, 2); 1704 1705 // Check for and copy trailing byte 1706 __ BIND(L_copy_byte); 1707 __ testl(byte_count, 1); 1708 __ jccb(Assembler::zero, L_exit); 1709 __ movb(rax, Address(end_from, 8)); 1710 __ movb(Address(end_to, 8), rax); 1711 1712 __ BIND(L_exit); 1713 restore_arg_regs(); 1714 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1715 __ xorptr(rax, rax); // return 0 1716 __ vzeroupper(); 1717 __ leave(); // required for proper stackwalking of RuntimeStub frame 1718 __ ret(0); 1719 1720 // Copy in multi-bytes chunks 1721 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1722 __ jmp(L_copy_4_bytes); 1723 1724 return start; 1725 } 1726 1727 // Arguments: 1728 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1729 // ignored 1730 // name - stub name string 1731 // 1732 // Inputs: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - element count, treated as ssize_t, can be zero 1736 // 1737 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1738 // we let the hardware handle it. The one to eight bytes within words, 1739 // dwords or qwords that span cache line boundaries will still be loaded 1740 // and stored atomically. 1741 // 1742 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1743 address* entry, const char *name) { 1744 __ align(CodeEntryAlignment); 1745 StubCodeMark mark(this, "StubRoutines", name); 1746 address start = __ pc(); 1747 1748 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1749 const Register from = rdi; // source array address 1750 const Register to = rsi; // destination array address 1751 const Register count = rdx; // elements count 1752 const Register byte_count = rcx; 1753 const Register qword_count = count; 1754 1755 __ enter(); // required for proper stackwalking of RuntimeStub frame 1756 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1757 1758 if (entry != NULL) { 1759 *entry = __ pc(); 1760 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1761 BLOCK_COMMENT("Entry:"); 1762 } 1763 1764 array_overlap_test(nooverlap_target, Address::times_1); 1765 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1766 // r9 and r10 may be used to save non-volatile registers 1767 1768 // 'from', 'to' and 'count' are now valid 1769 __ movptr(byte_count, count); 1770 __ shrptr(count, 3); // count => qword_count 1771 1772 // Copy from high to low addresses. 1773 1774 // Check for and copy trailing byte 1775 __ testl(byte_count, 1); 1776 __ jcc(Assembler::zero, L_copy_2_bytes); 1777 __ movb(rax, Address(from, byte_count, Address::times_1, -1)); 1778 __ movb(Address(to, byte_count, Address::times_1, -1), rax); 1779 __ decrement(byte_count); // Adjust for possible trailing word 1780 1781 // Check for and copy trailing word 1782 __ BIND(L_copy_2_bytes); 1783 __ testl(byte_count, 2); 1784 __ jcc(Assembler::zero, L_copy_4_bytes); 1785 __ movw(rax, Address(from, byte_count, Address::times_1, -2)); 1786 __ movw(Address(to, byte_count, Address::times_1, -2), rax); 1787 1788 // Check for and copy trailing dword 1789 __ BIND(L_copy_4_bytes); 1790 __ testl(byte_count, 4); 1791 __ jcc(Assembler::zero, L_copy_bytes); 1792 __ movl(rax, Address(from, qword_count, Address::times_8)); 1793 __ movl(Address(to, qword_count, Address::times_8), rax); 1794 __ jmp(L_copy_bytes); 1795 1796 // Copy trailing qwords 1797 __ BIND(L_copy_8_bytes); 1798 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1799 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1800 __ decrement(qword_count); 1801 __ jcc(Assembler::notZero, L_copy_8_bytes); 1802 1803 restore_arg_regs(); 1804 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1805 __ xorptr(rax, rax); // return 0 1806 __ vzeroupper(); 1807 __ leave(); // required for proper stackwalking of RuntimeStub frame 1808 __ ret(0); 1809 1810 // Copy in multi-bytes chunks 1811 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1812 1813 restore_arg_regs(); 1814 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1815 __ xorptr(rax, rax); // return 0 1816 __ vzeroupper(); 1817 __ leave(); // required for proper stackwalking of RuntimeStub frame 1818 __ ret(0); 1819 1820 return start; 1821 } 1822 1823 // Arguments: 1824 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1825 // ignored 1826 // name - stub name string 1827 // 1828 // Inputs: 1829 // c_rarg0 - source array address 1830 // c_rarg1 - destination array address 1831 // c_rarg2 - element count, treated as ssize_t, can be zero 1832 // 1833 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1834 // let the hardware handle it. The two or four words within dwords 1835 // or qwords that span cache line boundaries will still be loaded 1836 // and stored atomically. 1837 // 1838 // Side Effects: 1839 // disjoint_short_copy_entry is set to the no-overlap entry point 1840 // used by generate_conjoint_short_copy(). 1841 // 1842 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { 1843 __ align(CodeEntryAlignment); 1844 StubCodeMark mark(this, "StubRoutines", name); 1845 address start = __ pc(); 1846 1847 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1848 const Register from = rdi; // source array address 1849 const Register to = rsi; // destination array address 1850 const Register count = rdx; // elements count 1851 const Register word_count = rcx; 1852 const Register qword_count = count; 1853 const Register end_from = from; // source array end address 1854 const Register end_to = to; // destination array end address 1855 // End pointers are inclusive, and if count is not zero they point 1856 // to the last unit copied: end_to[0] := end_from[0] 1857 1858 __ enter(); // required for proper stackwalking of RuntimeStub frame 1859 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1860 1861 if (entry != NULL) { 1862 *entry = __ pc(); 1863 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1864 BLOCK_COMMENT("Entry:"); 1865 } 1866 1867 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1868 // r9 and r10 may be used to save non-volatile registers 1869 1870 // 'from', 'to' and 'count' are now valid 1871 __ movptr(word_count, count); 1872 __ shrptr(count, 2); // count => qword_count 1873 1874 // Copy from low to high addresses. Use 'to' as scratch. 1875 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1876 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1877 __ negptr(qword_count); 1878 __ jmp(L_copy_bytes); 1879 1880 // Copy trailing qwords 1881 __ BIND(L_copy_8_bytes); 1882 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1883 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1884 __ increment(qword_count); 1885 __ jcc(Assembler::notZero, L_copy_8_bytes); 1886 1887 // Original 'dest' is trashed, so we can't use it as a 1888 // base register for a possible trailing word copy 1889 1890 // Check for and copy trailing dword 1891 __ BIND(L_copy_4_bytes); 1892 __ testl(word_count, 2); 1893 __ jccb(Assembler::zero, L_copy_2_bytes); 1894 __ movl(rax, Address(end_from, 8)); 1895 __ movl(Address(end_to, 8), rax); 1896 1897 __ addptr(end_from, 4); 1898 __ addptr(end_to, 4); 1899 1900 // Check for and copy trailing word 1901 __ BIND(L_copy_2_bytes); 1902 __ testl(word_count, 1); 1903 __ jccb(Assembler::zero, L_exit); 1904 __ movw(rax, Address(end_from, 8)); 1905 __ movw(Address(end_to, 8), rax); 1906 1907 __ BIND(L_exit); 1908 restore_arg_regs(); 1909 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 1910 __ xorptr(rax, rax); // return 0 1911 __ vzeroupper(); 1912 __ leave(); // required for proper stackwalking of RuntimeStub frame 1913 __ ret(0); 1914 1915 // Copy in multi-bytes chunks 1916 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1917 __ jmp(L_copy_4_bytes); 1918 1919 return start; 1920 } 1921 1922 address generate_fill(BasicType t, bool aligned, const char *name) { 1923 __ align(CodeEntryAlignment); 1924 StubCodeMark mark(this, "StubRoutines", name); 1925 address start = __ pc(); 1926 1927 BLOCK_COMMENT("Entry:"); 1928 1929 const Register to = c_rarg0; // source array address 1930 const Register value = c_rarg1; // value 1931 const Register count = c_rarg2; // elements count 1932 1933 __ enter(); // required for proper stackwalking of RuntimeStub frame 1934 1935 __ generate_fill(t, aligned, to, value, count, rax, xmm0); 1936 1937 __ vzeroupper(); 1938 __ leave(); // required for proper stackwalking of RuntimeStub frame 1939 __ ret(0); 1940 return start; 1941 } 1942 1943 // Arguments: 1944 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1945 // ignored 1946 // name - stub name string 1947 // 1948 // Inputs: 1949 // c_rarg0 - source array address 1950 // c_rarg1 - destination array address 1951 // c_rarg2 - element count, treated as ssize_t, can be zero 1952 // 1953 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1954 // let the hardware handle it. The two or four words within dwords 1955 // or qwords that span cache line boundaries will still be loaded 1956 // and stored atomically. 1957 // 1958 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1959 address *entry, const char *name) { 1960 __ align(CodeEntryAlignment); 1961 StubCodeMark mark(this, "StubRoutines", name); 1962 address start = __ pc(); 1963 1964 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; 1965 const Register from = rdi; // source array address 1966 const Register to = rsi; // destination array address 1967 const Register count = rdx; // elements count 1968 const Register word_count = rcx; 1969 const Register qword_count = count; 1970 1971 __ enter(); // required for proper stackwalking of RuntimeStub frame 1972 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1973 1974 if (entry != NULL) { 1975 *entry = __ pc(); 1976 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1977 BLOCK_COMMENT("Entry:"); 1978 } 1979 1980 array_overlap_test(nooverlap_target, Address::times_2); 1981 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1982 // r9 and r10 may be used to save non-volatile registers 1983 1984 // 'from', 'to' and 'count' are now valid 1985 __ movptr(word_count, count); 1986 __ shrptr(count, 2); // count => qword_count 1987 1988 // Copy from high to low addresses. Use 'to' as scratch. 1989 1990 // Check for and copy trailing word 1991 __ testl(word_count, 1); 1992 __ jccb(Assembler::zero, L_copy_4_bytes); 1993 __ movw(rax, Address(from, word_count, Address::times_2, -2)); 1994 __ movw(Address(to, word_count, Address::times_2, -2), rax); 1995 1996 // Check for and copy trailing dword 1997 __ BIND(L_copy_4_bytes); 1998 __ testl(word_count, 2); 1999 __ jcc(Assembler::zero, L_copy_bytes); 2000 __ movl(rax, Address(from, qword_count, Address::times_8)); 2001 __ movl(Address(to, qword_count, Address::times_8), rax); 2002 __ jmp(L_copy_bytes); 2003 2004 // Copy trailing qwords 2005 __ BIND(L_copy_8_bytes); 2006 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2007 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2008 __ decrement(qword_count); 2009 __ jcc(Assembler::notZero, L_copy_8_bytes); 2010 2011 restore_arg_regs(); 2012 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 2013 __ xorptr(rax, rax); // return 0 2014 __ vzeroupper(); 2015 __ leave(); // required for proper stackwalking of RuntimeStub frame 2016 __ ret(0); 2017 2018 // Copy in multi-bytes chunks 2019 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2020 2021 restore_arg_regs(); 2022 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 2023 __ xorptr(rax, rax); // return 0 2024 __ vzeroupper(); 2025 __ leave(); // required for proper stackwalking of RuntimeStub frame 2026 __ ret(0); 2027 2028 return start; 2029 } 2030 2031 // Arguments: 2032 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 2033 // ignored 2034 // is_oop - true => oop array, so generate store check code 2035 // name - stub name string 2036 // 2037 // Inputs: 2038 // c_rarg0 - source array address 2039 // c_rarg1 - destination array address 2040 // c_rarg2 - element count, treated as ssize_t, can be zero 2041 // 2042 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 2043 // the hardware handle it. The two dwords within qwords that span 2044 // cache line boundaries will still be loaded and stored atomicly. 2045 // 2046 // Side Effects: 2047 // disjoint_int_copy_entry is set to the no-overlap entry point 2048 // used by generate_conjoint_int_oop_copy(). 2049 // 2050 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 2051 const char *name, bool dest_uninitialized = false) { 2052 __ align(CodeEntryAlignment); 2053 StubCodeMark mark(this, "StubRoutines", name); 2054 address start = __ pc(); 2055 2056 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 2057 const Register from = rdi; // source array address 2058 const Register to = rsi; // destination array address 2059 const Register count = rdx; // elements count 2060 const Register dword_count = rcx; 2061 const Register qword_count = count; 2062 const Register end_from = from; // source array end address 2063 const Register end_to = to; // destination array end address 2064 const Register saved_to = r11; // saved destination array address 2065 // End pointers are inclusive, and if count is not zero they point 2066 // to the last unit copied: end_to[0] := end_from[0] 2067 2068 __ enter(); // required for proper stackwalking of RuntimeStub frame 2069 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2070 2071 if (entry != NULL) { 2072 *entry = __ pc(); 2073 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2074 BLOCK_COMMENT("Entry:"); 2075 } 2076 2077 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 2078 // r9 and r10 may be used to save non-volatile registers 2079 if (is_oop) { 2080 __ movq(saved_to, to); 2081 gen_load_ref_array_barrier(from, count); 2082 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized); 2083 } 2084 2085 // 'from', 'to' and 'count' are now valid 2086 __ movptr(dword_count, count); 2087 __ shrptr(count, 1); // count => qword_count 2088 2089 // Copy from low to high addresses. Use 'to' as scratch. 2090 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2091 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2092 __ negptr(qword_count); 2093 __ jmp(L_copy_bytes); 2094 2095 // Copy trailing qwords 2096 __ BIND(L_copy_8_bytes); 2097 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2098 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2099 __ increment(qword_count); 2100 __ jcc(Assembler::notZero, L_copy_8_bytes); 2101 2102 // Check for and copy trailing dword 2103 __ BIND(L_copy_4_bytes); 2104 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 2105 __ jccb(Assembler::zero, L_exit); 2106 __ movl(rax, Address(end_from, 8)); 2107 __ movl(Address(end_to, 8), rax); 2108 2109 __ BIND(L_exit); 2110 if (is_oop) { 2111 gen_write_ref_array_post_barrier(saved_to, dword_count, rax); 2112 } 2113 restore_arg_regs(); 2114 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2115 __ vzeroupper(); 2116 __ xorptr(rax, rax); // return 0 2117 __ leave(); // required for proper stackwalking of RuntimeStub frame 2118 __ ret(0); 2119 2120 // Copy in multi-bytes chunks 2121 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2122 __ jmp(L_copy_4_bytes); 2123 2124 return start; 2125 } 2126 2127 // Arguments: 2128 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 2129 // ignored 2130 // is_oop - true => oop array, so generate store check code 2131 // name - stub name string 2132 // 2133 // Inputs: 2134 // c_rarg0 - source array address 2135 // c_rarg1 - destination array address 2136 // c_rarg2 - element count, treated as ssize_t, can be zero 2137 // 2138 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 2139 // the hardware handle it. The two dwords within qwords that span 2140 // cache line boundaries will still be loaded and stored atomicly. 2141 // 2142 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 2143 address *entry, const char *name, 2144 bool dest_uninitialized = false) { 2145 __ align(CodeEntryAlignment); 2146 StubCodeMark mark(this, "StubRoutines", name); 2147 address start = __ pc(); 2148 2149 Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit; 2150 const Register from = rdi; // source array address 2151 const Register to = rsi; // destination array address 2152 const Register count = rdx; // elements count 2153 const Register dword_count = rcx; 2154 const Register qword_count = count; 2155 2156 __ enter(); // required for proper stackwalking of RuntimeStub frame 2157 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2158 2159 if (entry != NULL) { 2160 *entry = __ pc(); 2161 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2162 BLOCK_COMMENT("Entry:"); 2163 } 2164 2165 array_overlap_test(nooverlap_target, Address::times_4); 2166 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 2167 // r9 and r10 may be used to save non-volatile registers 2168 2169 if (is_oop) { 2170 // no registers are destroyed by this call 2171 gen_load_ref_array_barrier(from, count); 2172 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized); 2173 } 2174 2175 assert_clean_int(count, rax); // Make sure 'count' is clean int. 2176 // 'from', 'to' and 'count' are now valid 2177 __ movptr(dword_count, count); 2178 __ shrptr(count, 1); // count => qword_count 2179 2180 // Copy from high to low addresses. Use 'to' as scratch. 2181 2182 // Check for and copy trailing dword 2183 __ testl(dword_count, 1); 2184 __ jcc(Assembler::zero, L_copy_bytes); 2185 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 2186 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 2187 __ jmp(L_copy_bytes); 2188 2189 // Copy trailing qwords 2190 __ BIND(L_copy_8_bytes); 2191 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2192 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2193 __ decrement(qword_count); 2194 __ jcc(Assembler::notZero, L_copy_8_bytes); 2195 2196 if (is_oop) { 2197 __ jmp(L_exit); 2198 } 2199 restore_arg_regs(); 2200 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2201 __ xorptr(rax, rax); // return 0 2202 __ vzeroupper(); 2203 __ leave(); // required for proper stackwalking of RuntimeStub frame 2204 __ ret(0); 2205 2206 // Copy in multi-bytes chunks 2207 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2208 2209 __ BIND(L_exit); 2210 if (is_oop) { 2211 gen_write_ref_array_post_barrier(to, dword_count, rax); 2212 } 2213 restore_arg_regs(); 2214 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2215 __ xorptr(rax, rax); // return 0 2216 __ vzeroupper(); 2217 __ leave(); // required for proper stackwalking of RuntimeStub frame 2218 __ ret(0); 2219 2220 return start; 2221 } 2222 2223 // Arguments: 2224 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2225 // ignored 2226 // is_oop - true => oop array, so generate store check code 2227 // name - stub name string 2228 // 2229 // Inputs: 2230 // c_rarg0 - source array address 2231 // c_rarg1 - destination array address 2232 // c_rarg2 - element count, treated as ssize_t, can be zero 2233 // 2234 // Side Effects: 2235 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 2236 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 2237 // 2238 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 2239 const char *name, bool dest_uninitialized = false) { 2240 __ align(CodeEntryAlignment); 2241 StubCodeMark mark(this, "StubRoutines", name); 2242 address start = __ pc(); 2243 2244 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2245 const Register from = rdi; // source array address 2246 const Register to = rsi; // destination array address 2247 const Register qword_count = rdx; // elements count 2248 const Register end_from = from; // source array end address 2249 const Register end_to = rcx; // destination array end address 2250 const Register saved_to = to; 2251 const Register saved_count = r11; 2252 // End pointers are inclusive, and if count is not zero they point 2253 // to the last unit copied: end_to[0] := end_from[0] 2254 2255 __ enter(); // required for proper stackwalking of RuntimeStub frame 2256 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 2257 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2258 2259 if (entry != NULL) { 2260 *entry = __ pc(); 2261 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2262 BLOCK_COMMENT("Entry:"); 2263 } 2264 2265 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 2266 // r9 and r10 may be used to save non-volatile registers 2267 // 'from', 'to' and 'qword_count' are now valid 2268 if (is_oop) { 2269 // Save to and count for store barrier 2270 __ movptr(saved_count, qword_count); 2271 // no registers are destroyed by this call 2272 gen_load_ref_array_barrier(from, qword_count); 2273 gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized); 2274 } 2275 2276 // Copy from low to high addresses. Use 'to' as scratch. 2277 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2278 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2279 __ negptr(qword_count); 2280 __ jmp(L_copy_bytes); 2281 2282 // Copy trailing qwords 2283 __ BIND(L_copy_8_bytes); 2284 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2285 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2286 __ increment(qword_count); 2287 __ jcc(Assembler::notZero, L_copy_8_bytes); 2288 2289 if (is_oop) { 2290 __ jmp(L_exit); 2291 } else { 2292 restore_arg_regs(); 2293 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2294 __ xorptr(rax, rax); // return 0 2295 __ vzeroupper(); 2296 __ leave(); // required for proper stackwalking of RuntimeStub frame 2297 __ ret(0); 2298 } 2299 2300 // Copy in multi-bytes chunks 2301 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2302 2303 if (is_oop) { 2304 __ BIND(L_exit); 2305 gen_write_ref_array_post_barrier(saved_to, saved_count, rax); 2306 } 2307 restore_arg_regs(); 2308 if (is_oop) { 2309 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free 2310 } else { 2311 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2312 } 2313 __ vzeroupper(); 2314 __ xorptr(rax, rax); // return 0 2315 __ leave(); // required for proper stackwalking of RuntimeStub frame 2316 __ ret(0); 2317 2318 return start; 2319 } 2320 2321 // Arguments: 2322 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2323 // ignored 2324 // is_oop - true => oop array, so generate store check code 2325 // name - stub name string 2326 // 2327 // Inputs: 2328 // c_rarg0 - source array address 2329 // c_rarg1 - destination array address 2330 // c_rarg2 - element count, treated as ssize_t, can be zero 2331 // 2332 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, 2333 address nooverlap_target, address *entry, 2334 const char *name, bool dest_uninitialized = false) { 2335 __ align(CodeEntryAlignment); 2336 StubCodeMark mark(this, "StubRoutines", name); 2337 address start = __ pc(); 2338 2339 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2340 const Register from = rdi; // source array address 2341 const Register to = rsi; // destination array address 2342 const Register qword_count = rdx; // elements count 2343 const Register saved_count = rcx; 2344 2345 __ enter(); // required for proper stackwalking of RuntimeStub frame 2346 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2347 2348 if (entry != NULL) { 2349 *entry = __ pc(); 2350 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2351 BLOCK_COMMENT("Entry:"); 2352 } 2353 2354 array_overlap_test(nooverlap_target, Address::times_8); 2355 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 2356 // r9 and r10 may be used to save non-volatile registers 2357 // 'from', 'to' and 'qword_count' are now valid 2358 if (is_oop) { 2359 // Save to and count for store barrier 2360 __ movptr(saved_count, qword_count); 2361 // No registers are destroyed by this call 2362 gen_load_ref_array_barrier(from, saved_count); 2363 gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized); 2364 } 2365 2366 __ jmp(L_copy_bytes); 2367 2368 // Copy trailing qwords 2369 __ BIND(L_copy_8_bytes); 2370 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2371 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2372 __ decrement(qword_count); 2373 __ jcc(Assembler::notZero, L_copy_8_bytes); 2374 2375 if (is_oop) { 2376 __ jmp(L_exit); 2377 } else { 2378 restore_arg_regs(); 2379 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2380 __ xorptr(rax, rax); // return 0 2381 __ vzeroupper(); 2382 __ leave(); // required for proper stackwalking of RuntimeStub frame 2383 __ ret(0); 2384 } 2385 2386 // Copy in multi-bytes chunks 2387 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2388 2389 if (is_oop) { 2390 __ BIND(L_exit); 2391 gen_write_ref_array_post_barrier(to, saved_count, rax); 2392 } 2393 restore_arg_regs(); 2394 if (is_oop) { 2395 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free 2396 } else { 2397 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2398 } 2399 __ vzeroupper(); 2400 __ xorptr(rax, rax); // return 0 2401 __ leave(); // required for proper stackwalking of RuntimeStub frame 2402 __ ret(0); 2403 2404 return start; 2405 } 2406 2407 2408 // Helper for generating a dynamic type check. 2409 // Smashes no registers. 2410 void generate_type_check(Register sub_klass, 2411 Register super_check_offset, 2412 Register super_klass, 2413 Label& L_success) { 2414 assert_different_registers(sub_klass, super_check_offset, super_klass); 2415 2416 BLOCK_COMMENT("type_check:"); 2417 2418 Label L_miss; 2419 2420 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 2421 super_check_offset); 2422 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 2423 2424 // Fall through on failure! 2425 __ BIND(L_miss); 2426 } 2427 2428 // 2429 // Generate checkcasting array copy stub 2430 // 2431 // Input: 2432 // c_rarg0 - source array address 2433 // c_rarg1 - destination array address 2434 // c_rarg2 - element count, treated as ssize_t, can be zero 2435 // c_rarg3 - size_t ckoff (super_check_offset) 2436 // not Win64 2437 // c_rarg4 - oop ckval (super_klass) 2438 // Win64 2439 // rsp+40 - oop ckval (super_klass) 2440 // 2441 // Output: 2442 // rax == 0 - success 2443 // rax == -1^K - failure, where K is partial transfer count 2444 // 2445 address generate_checkcast_copy(const char *name, address *entry, 2446 bool dest_uninitialized = false) { 2447 2448 Label L_load_element, L_store_element, L_do_card_marks, L_done; 2449 2450 // Input registers (after setup_arg_regs) 2451 const Register from = rdi; // source array address 2452 const Register to = rsi; // destination array address 2453 const Register length = rdx; // elements count 2454 const Register ckoff = rcx; // super_check_offset 2455 const Register ckval = r8; // super_klass 2456 2457 // Registers used as temps (r13, r14 are save-on-entry) 2458 const Register end_from = from; // source array end address 2459 const Register end_to = r13; // destination array end address 2460 const Register count = rdx; // -(count_remaining) 2461 const Register r14_length = r14; // saved copy of length 2462 // End pointers are inclusive, and if length is not zero they point 2463 // to the last unit copied: end_to[0] := end_from[0] 2464 2465 const Register rax_oop = rax; // actual oop copied 2466 const Register r11_klass = r11; // oop._klass 2467 2468 //--------------------------------------------------------------- 2469 // Assembler stub will be used for this call to arraycopy 2470 // if the two arrays are subtypes of Object[] but the 2471 // destination array type is not equal to or a supertype 2472 // of the source type. Each element must be separately 2473 // checked. 2474 2475 __ align(CodeEntryAlignment); 2476 StubCodeMark mark(this, "StubRoutines", name); 2477 address start = __ pc(); 2478 2479 __ enter(); // required for proper stackwalking of RuntimeStub frame 2480 2481 #ifdef ASSERT 2482 // caller guarantees that the arrays really are different 2483 // otherwise, we would have to make conjoint checks 2484 { Label L; 2485 array_overlap_test(L, TIMES_OOP); 2486 __ stop("checkcast_copy within a single array"); 2487 __ bind(L); 2488 } 2489 #endif //ASSERT 2490 2491 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx 2492 // ckoff => rcx, ckval => r8 2493 // r9 and r10 may be used to save non-volatile registers 2494 #ifdef _WIN64 2495 // last argument (#4) is on stack on Win64 2496 __ movptr(ckval, Address(rsp, 6 * wordSize)); 2497 #endif 2498 2499 // Caller of this entry point must set up the argument registers. 2500 if (entry != NULL) { 2501 *entry = __ pc(); 2502 BLOCK_COMMENT("Entry:"); 2503 } 2504 2505 // allocate spill slots for r13, r14 2506 enum { 2507 saved_r13_offset, 2508 saved_r14_offset, 2509 saved_rbp_offset 2510 }; 2511 __ subptr(rsp, saved_rbp_offset * wordSize); 2512 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2513 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2514 2515 // check that int operands are properly extended to size_t 2516 assert_clean_int(length, rax); 2517 assert_clean_int(ckoff, rax); 2518 2519 #ifdef ASSERT 2520 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2521 // The ckoff and ckval must be mutually consistent, 2522 // even though caller generates both. 2523 { Label L; 2524 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2525 __ cmpl(ckoff, Address(ckval, sco_offset)); 2526 __ jcc(Assembler::equal, L); 2527 __ stop("super_check_offset inconsistent"); 2528 __ bind(L); 2529 } 2530 #endif //ASSERT 2531 2532 // Loop-invariant addresses. They are exclusive end pointers. 2533 Address end_from_addr(from, length, TIMES_OOP, 0); 2534 Address end_to_addr(to, length, TIMES_OOP, 0); 2535 // Loop-variant addresses. They assume post-incremented count < 0. 2536 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2537 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2538 2539 gen_load_ref_array_barrier(from, count); 2540 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized); 2541 2542 // Copy from low to high addresses, indexed from the end of each array. 2543 __ lea(end_from, end_from_addr); 2544 __ lea(end_to, end_to_addr); 2545 __ movptr(r14_length, length); // save a copy of the length 2546 assert(length == count, ""); // else fix next line: 2547 __ negptr(count); // negate and test the length 2548 __ jcc(Assembler::notZero, L_load_element); 2549 2550 // Empty array: Nothing to do. 2551 __ xorptr(rax, rax); // return 0 on (trivial) success 2552 __ jmp(L_done); 2553 2554 // ======== begin loop ======== 2555 // (Loop is rotated; its entry is L_load_element.) 2556 // Loop control: 2557 // for (count = -count; count != 0; count++) 2558 // Base pointers src, dst are biased by 8*(count-1),to last element. 2559 __ align(OptoLoopAlignment); 2560 2561 __ BIND(L_store_element); 2562 __ store_heap_oop(to_element_addr, rax_oop); // store the oop 2563 __ increment(count); // increment the count toward zero 2564 __ jcc(Assembler::zero, L_do_card_marks); 2565 2566 // ======== loop entry is here ======== 2567 __ BIND(L_load_element); 2568 __ load_heap_oop(rax_oop, from_element_addr); // load the oop 2569 __ testptr(rax_oop, rax_oop); 2570 __ jcc(Assembler::zero, L_store_element); 2571 2572 __ load_klass(r11_klass, rax_oop);// query the object klass 2573 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2574 // ======== end loop ======== 2575 2576 // It was a real error; we must depend on the caller to finish the job. 2577 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2578 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2579 // and report their number to the caller. 2580 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2581 Label L_post_barrier; 2582 __ addptr(r14_length, count); // K = (original - remaining) oops 2583 __ movptr(rax, r14_length); // save the value 2584 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2585 __ jccb(Assembler::notZero, L_post_barrier); 2586 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2587 2588 // Come here on success only. 2589 __ BIND(L_do_card_marks); 2590 __ xorptr(rax, rax); // return 0 on success 2591 2592 __ BIND(L_post_barrier); 2593 gen_write_ref_array_post_barrier(to, r14_length, rscratch1); 2594 2595 // Common exit point (success or failure). 2596 __ BIND(L_done); 2597 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2598 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2599 restore_arg_regs(); 2600 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free 2601 __ leave(); // required for proper stackwalking of RuntimeStub frame 2602 __ ret(0); 2603 2604 return start; 2605 } 2606 2607 // 2608 // Generate 'unsafe' array copy stub 2609 // Though just as safe as the other stubs, it takes an unscaled 2610 // size_t argument instead of an element count. 2611 // 2612 // Input: 2613 // c_rarg0 - source array address 2614 // c_rarg1 - destination array address 2615 // c_rarg2 - byte count, treated as ssize_t, can be zero 2616 // 2617 // Examines the alignment of the operands and dispatches 2618 // to a long, int, short, or byte copy loop. 2619 // 2620 address generate_unsafe_copy(const char *name, 2621 address byte_copy_entry, address short_copy_entry, 2622 address int_copy_entry, address long_copy_entry) { 2623 2624 Label L_long_aligned, L_int_aligned, L_short_aligned; 2625 2626 // Input registers (before setup_arg_regs) 2627 const Register from = c_rarg0; // source array address 2628 const Register to = c_rarg1; // destination array address 2629 const Register size = c_rarg2; // byte count (size_t) 2630 2631 // Register used as a temp 2632 const Register bits = rax; // test copy of low bits 2633 2634 __ align(CodeEntryAlignment); 2635 StubCodeMark mark(this, "StubRoutines", name); 2636 address start = __ pc(); 2637 2638 __ enter(); // required for proper stackwalking of RuntimeStub frame 2639 2640 // bump this on entry, not on exit: 2641 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2642 2643 __ mov(bits, from); 2644 __ orptr(bits, to); 2645 __ orptr(bits, size); 2646 2647 __ testb(bits, BytesPerLong-1); 2648 __ jccb(Assembler::zero, L_long_aligned); 2649 2650 __ testb(bits, BytesPerInt-1); 2651 __ jccb(Assembler::zero, L_int_aligned); 2652 2653 __ testb(bits, BytesPerShort-1); 2654 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2655 2656 __ BIND(L_short_aligned); 2657 __ shrptr(size, LogBytesPerShort); // size => short_count 2658 __ jump(RuntimeAddress(short_copy_entry)); 2659 2660 __ BIND(L_int_aligned); 2661 __ shrptr(size, LogBytesPerInt); // size => int_count 2662 __ jump(RuntimeAddress(int_copy_entry)); 2663 2664 __ BIND(L_long_aligned); 2665 __ shrptr(size, LogBytesPerLong); // size => qword_count 2666 __ jump(RuntimeAddress(long_copy_entry)); 2667 2668 return start; 2669 } 2670 2671 // Perform range checks on the proposed arraycopy. 2672 // Kills temp, but nothing else. 2673 // Also, clean the sign bits of src_pos and dst_pos. 2674 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2675 Register src_pos, // source position (c_rarg1) 2676 Register dst, // destination array oo (c_rarg2) 2677 Register dst_pos, // destination position (c_rarg3) 2678 Register length, 2679 Register temp, 2680 Label& L_failed) { 2681 BLOCK_COMMENT("arraycopy_range_checks:"); 2682 2683 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2684 __ movl(temp, length); 2685 __ addl(temp, src_pos); // src_pos + length 2686 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); 2687 __ jcc(Assembler::above, L_failed); 2688 2689 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2690 __ movl(temp, length); 2691 __ addl(temp, dst_pos); // dst_pos + length 2692 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2693 __ jcc(Assembler::above, L_failed); 2694 2695 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2696 // Move with sign extension can be used since they are positive. 2697 __ movslq(src_pos, src_pos); 2698 __ movslq(dst_pos, dst_pos); 2699 2700 BLOCK_COMMENT("arraycopy_range_checks done"); 2701 } 2702 2703 // 2704 // Generate generic array copy stubs 2705 // 2706 // Input: 2707 // c_rarg0 - src oop 2708 // c_rarg1 - src_pos (32-bits) 2709 // c_rarg2 - dst oop 2710 // c_rarg3 - dst_pos (32-bits) 2711 // not Win64 2712 // c_rarg4 - element count (32-bits) 2713 // Win64 2714 // rsp+40 - element count (32-bits) 2715 // 2716 // Output: 2717 // rax == 0 - success 2718 // rax == -1^K - failure, where K is partial transfer count 2719 // 2720 address generate_generic_copy(const char *name, 2721 address byte_copy_entry, address short_copy_entry, 2722 address int_copy_entry, address oop_copy_entry, 2723 address long_copy_entry, address checkcast_copy_entry) { 2724 2725 Label L_failed, L_failed_0, L_objArray; 2726 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2727 2728 // Input registers 2729 const Register src = c_rarg0; // source array oop 2730 const Register src_pos = c_rarg1; // source position 2731 const Register dst = c_rarg2; // destination array oop 2732 const Register dst_pos = c_rarg3; // destination position 2733 #ifndef _WIN64 2734 const Register length = c_rarg4; 2735 #else 2736 const Address length(rsp, 6 * wordSize); // elements count is on stack on Win64 2737 #endif 2738 2739 { int modulus = CodeEntryAlignment; 2740 int target = modulus - 5; // 5 = sizeof jmp(L_failed) 2741 int advance = target - (__ offset() % modulus); 2742 if (advance < 0) advance += modulus; 2743 if (advance > 0) __ nop(advance); 2744 } 2745 StubCodeMark mark(this, "StubRoutines", name); 2746 2747 // Short-hop target to L_failed. Makes for denser prologue code. 2748 __ BIND(L_failed_0); 2749 __ jmp(L_failed); 2750 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed"); 2751 2752 __ align(CodeEntryAlignment); 2753 address start = __ pc(); 2754 2755 __ enter(); // required for proper stackwalking of RuntimeStub frame 2756 2757 // bump this on entry, not on exit: 2758 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2759 2760 //----------------------------------------------------------------------- 2761 // Assembler stub will be used for this call to arraycopy 2762 // if the following conditions are met: 2763 // 2764 // (1) src and dst must not be null. 2765 // (2) src_pos must not be negative. 2766 // (3) dst_pos must not be negative. 2767 // (4) length must not be negative. 2768 // (5) src klass and dst klass should be the same and not NULL. 2769 // (6) src and dst should be arrays. 2770 // (7) src_pos + length must not exceed length of src. 2771 // (8) dst_pos + length must not exceed length of dst. 2772 // 2773 2774 // if (src == NULL) return -1; 2775 __ testptr(src, src); // src oop 2776 size_t j1off = __ offset(); 2777 __ jccb(Assembler::zero, L_failed_0); 2778 2779 // if (src_pos < 0) return -1; 2780 __ testl(src_pos, src_pos); // src_pos (32-bits) 2781 __ jccb(Assembler::negative, L_failed_0); 2782 2783 // if (dst == NULL) return -1; 2784 __ testptr(dst, dst); // dst oop 2785 __ jccb(Assembler::zero, L_failed_0); 2786 2787 // if (dst_pos < 0) return -1; 2788 __ testl(dst_pos, dst_pos); // dst_pos (32-bits) 2789 size_t j4off = __ offset(); 2790 __ jccb(Assembler::negative, L_failed_0); 2791 2792 // The first four tests are very dense code, 2793 // but not quite dense enough to put four 2794 // jumps in a 16-byte instruction fetch buffer. 2795 // That's good, because some branch predicters 2796 // do not like jumps so close together. 2797 // Make sure of this. 2798 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps"); 2799 2800 // registers used as temp 2801 const Register r11_length = r11; // elements count to copy 2802 const Register r10_src_klass = r10; // array klass 2803 2804 // if (length < 0) return -1; 2805 __ movl(r11_length, length); // length (elements count, 32-bits value) 2806 __ testl(r11_length, r11_length); 2807 __ jccb(Assembler::negative, L_failed_0); 2808 2809 __ load_klass(r10_src_klass, src); 2810 #ifdef ASSERT 2811 // assert(src->klass() != NULL); 2812 { 2813 BLOCK_COMMENT("assert klasses not null {"); 2814 Label L1, L2; 2815 __ testptr(r10_src_klass, r10_src_klass); 2816 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL 2817 __ bind(L1); 2818 __ stop("broken null klass"); 2819 __ bind(L2); 2820 __ load_klass(rax, dst); 2821 __ cmpq(rax, 0); 2822 __ jcc(Assembler::equal, L1); // this would be broken also 2823 BLOCK_COMMENT("} assert klasses not null done"); 2824 } 2825 #endif 2826 2827 // Load layout helper (32-bits) 2828 // 2829 // |array_tag| | header_size | element_type | |log2_element_size| 2830 // 32 30 24 16 8 2 0 2831 // 2832 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2833 // 2834 2835 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2836 2837 // Handle objArrays completely differently... 2838 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2839 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 2840 __ jcc(Assembler::equal, L_objArray); 2841 2842 // if (src->klass() != dst->klass()) return -1; 2843 __ load_klass(rax, dst); 2844 __ cmpq(r10_src_klass, rax); 2845 __ jcc(Assembler::notEqual, L_failed); 2846 2847 const Register rax_lh = rax; // layout helper 2848 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 2849 2850 // if (!src->is_Array()) return -1; 2851 __ cmpl(rax_lh, Klass::_lh_neutral_value); 2852 __ jcc(Assembler::greaterEqual, L_failed); 2853 2854 // At this point, it is known to be a typeArray (array_tag 0x3). 2855 #ifdef ASSERT 2856 { 2857 BLOCK_COMMENT("assert primitive array {"); 2858 Label L; 2859 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 2860 __ jcc(Assembler::greaterEqual, L); 2861 __ stop("must be a primitive array"); 2862 __ bind(L); 2863 BLOCK_COMMENT("} assert primitive array done"); 2864 } 2865 #endif 2866 2867 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2868 r10, L_failed); 2869 2870 // TypeArrayKlass 2871 // 2872 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2873 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2874 // 2875 2876 const Register r10_offset = r10; // array offset 2877 const Register rax_elsize = rax_lh; // element size 2878 2879 __ movl(r10_offset, rax_lh); 2880 __ shrl(r10_offset, Klass::_lh_header_size_shift); 2881 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 2882 __ addptr(src, r10_offset); // src array offset 2883 __ addptr(dst, r10_offset); // dst array offset 2884 BLOCK_COMMENT("choose copy loop based on element size"); 2885 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 2886 2887 // next registers should be set before the jump to corresponding stub 2888 const Register from = c_rarg0; // source array address 2889 const Register to = c_rarg1; // destination array address 2890 const Register count = c_rarg2; // elements count 2891 2892 // 'from', 'to', 'count' registers should be set in such order 2893 // since they are the same as 'src', 'src_pos', 'dst'. 2894 2895 __ BIND(L_copy_bytes); 2896 __ cmpl(rax_elsize, 0); 2897 __ jccb(Assembler::notEqual, L_copy_shorts); 2898 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr 2899 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr 2900 __ movl2ptr(count, r11_length); // length 2901 __ jump(RuntimeAddress(byte_copy_entry)); 2902 2903 __ BIND(L_copy_shorts); 2904 __ cmpl(rax_elsize, LogBytesPerShort); 2905 __ jccb(Assembler::notEqual, L_copy_ints); 2906 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr 2907 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr 2908 __ movl2ptr(count, r11_length); // length 2909 __ jump(RuntimeAddress(short_copy_entry)); 2910 2911 __ BIND(L_copy_ints); 2912 __ cmpl(rax_elsize, LogBytesPerInt); 2913 __ jccb(Assembler::notEqual, L_copy_longs); 2914 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr 2915 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr 2916 __ movl2ptr(count, r11_length); // length 2917 __ jump(RuntimeAddress(int_copy_entry)); 2918 2919 __ BIND(L_copy_longs); 2920 #ifdef ASSERT 2921 { 2922 BLOCK_COMMENT("assert long copy {"); 2923 Label L; 2924 __ cmpl(rax_elsize, LogBytesPerLong); 2925 __ jcc(Assembler::equal, L); 2926 __ stop("must be long copy, but elsize is wrong"); 2927 __ bind(L); 2928 BLOCK_COMMENT("} assert long copy done"); 2929 } 2930 #endif 2931 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr 2932 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr 2933 __ movl2ptr(count, r11_length); // length 2934 __ jump(RuntimeAddress(long_copy_entry)); 2935 2936 // ObjArrayKlass 2937 __ BIND(L_objArray); 2938 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] 2939 2940 Label L_plain_copy, L_checkcast_copy; 2941 // test array classes for subtyping 2942 __ load_klass(rax, dst); 2943 __ cmpq(r10_src_klass, rax); // usual case is exact equality 2944 __ jcc(Assembler::notEqual, L_checkcast_copy); 2945 2946 // Identically typed arrays can be copied without element-wise checks. 2947 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2948 r10, L_failed); 2949 2950 __ lea(from, Address(src, src_pos, TIMES_OOP, 2951 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 2952 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2953 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 2954 __ movl2ptr(count, r11_length); // length 2955 __ BIND(L_plain_copy); 2956 __ jump(RuntimeAddress(oop_copy_entry)); 2957 2958 __ BIND(L_checkcast_copy); 2959 // live at this point: r10_src_klass, r11_length, rax (dst_klass) 2960 { 2961 // Before looking at dst.length, make sure dst is also an objArray. 2962 __ cmpl(Address(rax, lh_offset), objArray_lh); 2963 __ jcc(Assembler::notEqual, L_failed); 2964 2965 // It is safe to examine both src.length and dst.length. 2966 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 2967 rax, L_failed); 2968 2969 const Register r11_dst_klass = r11; 2970 __ load_klass(r11_dst_klass, dst); // reload 2971 2972 // Marshal the base address arguments now, freeing registers. 2973 __ lea(from, Address(src, src_pos, TIMES_OOP, 2974 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2975 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 2976 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 2977 __ movl(count, length); // length (reloaded) 2978 Register sco_temp = c_rarg3; // this register is free now 2979 assert_different_registers(from, to, count, sco_temp, 2980 r11_dst_klass, r10_src_klass); 2981 assert_clean_int(count, sco_temp); 2982 2983 // Generate the type check. 2984 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2985 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); 2986 assert_clean_int(sco_temp, rax); 2987 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); 2988 2989 // Fetch destination element klass from the ObjArrayKlass header. 2990 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2991 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); 2992 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); 2993 assert_clean_int(sco_temp, rax); 2994 2995 // the checkcast_copy loop needs two extra arguments: 2996 assert(c_rarg3 == sco_temp, "#3 already in place"); 2997 // Set up arguments for checkcast_copy_entry. 2998 setup_arg_regs(4); 2999 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris 3000 __ jump(RuntimeAddress(checkcast_copy_entry)); 3001 } 3002 3003 __ BIND(L_failed); 3004 __ xorptr(rax, rax); 3005 __ notptr(rax); // return -1 3006 __ leave(); // required for proper stackwalking of RuntimeStub frame 3007 __ ret(0); 3008 3009 return start; 3010 } 3011 3012 void generate_arraycopy_stubs() { 3013 address entry; 3014 address entry_jbyte_arraycopy; 3015 address entry_jshort_arraycopy; 3016 address entry_jint_arraycopy; 3017 address entry_oop_arraycopy; 3018 address entry_jlong_arraycopy; 3019 address entry_checkcast_arraycopy; 3020 3021 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3022 "jbyte_disjoint_arraycopy"); 3023 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy, 3024 "jbyte_arraycopy"); 3025 3026 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3027 "jshort_disjoint_arraycopy"); 3028 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy, 3029 "jshort_arraycopy"); 3030 3031 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry, 3032 "jint_disjoint_arraycopy"); 3033 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry, 3034 &entry_jint_arraycopy, "jint_arraycopy"); 3035 3036 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry, 3037 "jlong_disjoint_arraycopy"); 3038 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry, 3039 &entry_jlong_arraycopy, "jlong_arraycopy"); 3040 3041 3042 if (UseCompressedOops) { 3043 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry, 3044 "oop_disjoint_arraycopy"); 3045 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry, 3046 &entry_oop_arraycopy, "oop_arraycopy"); 3047 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry, 3048 "oop_disjoint_arraycopy_uninit", 3049 /*dest_uninitialized*/true); 3050 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry, 3051 NULL, "oop_arraycopy_uninit", 3052 /*dest_uninitialized*/true); 3053 } else { 3054 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry, 3055 "oop_disjoint_arraycopy"); 3056 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry, 3057 &entry_oop_arraycopy, "oop_arraycopy"); 3058 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry, 3059 "oop_disjoint_arraycopy_uninit", 3060 /*dest_uninitialized*/true); 3061 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry, 3062 NULL, "oop_arraycopy_uninit", 3063 /*dest_uninitialized*/true); 3064 } 3065 3066 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3067 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3068 /*dest_uninitialized*/true); 3069 3070 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3071 entry_jbyte_arraycopy, 3072 entry_jshort_arraycopy, 3073 entry_jint_arraycopy, 3074 entry_jlong_arraycopy); 3075 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3076 entry_jbyte_arraycopy, 3077 entry_jshort_arraycopy, 3078 entry_jint_arraycopy, 3079 entry_oop_arraycopy, 3080 entry_jlong_arraycopy, 3081 entry_checkcast_arraycopy); 3082 3083 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3084 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3085 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3086 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3087 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3088 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3089 3090 // We don't generate specialized code for HeapWord-aligned source 3091 // arrays, so just use the code we've already generated 3092 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; 3093 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy; 3094 3095 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy; 3096 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy; 3097 3098 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy; 3099 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 3100 3101 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy; 3102 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 3103 3104 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy; 3105 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 3106 3107 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit; 3108 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit; 3109 } 3110 3111 // AES intrinsic stubs 3112 enum {AESBlockSize = 16}; 3113 3114 address generate_key_shuffle_mask() { 3115 __ align(16); 3116 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); 3117 address start = __ pc(); 3118 __ emit_data64( 0x0405060700010203, relocInfo::none ); 3119 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); 3120 return start; 3121 } 3122 3123 address generate_counter_shuffle_mask() { 3124 __ align(16); 3125 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); 3126 address start = __ pc(); 3127 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3128 __ emit_data64(0x0001020304050607, relocInfo::none); 3129 return start; 3130 } 3131 3132 // Utility routine for loading a 128-bit key word in little endian format 3133 // can optionally specify that the shuffle mask is already in an xmmregister 3134 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 3135 __ movdqu(xmmdst, Address(key, offset)); 3136 if (xmm_shuf_mask != NULL) { 3137 __ pshufb(xmmdst, xmm_shuf_mask); 3138 } else { 3139 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3140 } 3141 } 3142 3143 // Utility routine for increase 128bit counter (iv in CTR mode) 3144 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { 3145 __ pextrq(reg, xmmdst, 0x0); 3146 __ addq(reg, inc_delta); 3147 __ pinsrq(xmmdst, reg, 0x0); 3148 __ jcc(Assembler::carryClear, next_block); // jump if no carry 3149 __ pextrq(reg, xmmdst, 0x01); // Carry 3150 __ addq(reg, 0x01); 3151 __ pinsrq(xmmdst, reg, 0x01); //Carry end 3152 __ BIND(next_block); // next instruction 3153 } 3154 3155 // Arguments: 3156 // 3157 // Inputs: 3158 // c_rarg0 - source byte array address 3159 // c_rarg1 - destination byte array address 3160 // c_rarg2 - K (key) in little endian int array 3161 // 3162 address generate_aescrypt_encryptBlock() { 3163 assert(UseAES, "need AES instructions and misaligned SSE support"); 3164 __ align(CodeEntryAlignment); 3165 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3166 Label L_doLast; 3167 address start = __ pc(); 3168 3169 const Register from = c_rarg0; // source array address 3170 const Register to = c_rarg1; // destination array address 3171 const Register key = c_rarg2; // key array address 3172 const Register keylen = rax; 3173 3174 const XMMRegister xmm_result = xmm0; 3175 const XMMRegister xmm_key_shuf_mask = xmm1; 3176 // On win64 xmm6-xmm15 must be preserved so don't use them. 3177 const XMMRegister xmm_temp1 = xmm2; 3178 const XMMRegister xmm_temp2 = xmm3; 3179 const XMMRegister xmm_temp3 = xmm4; 3180 const XMMRegister xmm_temp4 = xmm5; 3181 3182 __ enter(); // required for proper stackwalking of RuntimeStub frame 3183 3184 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 3185 // context for the registers used, where all instructions below are using 128-bit mode 3186 // On EVEX without VL and BW, these instructions will all be AVX. 3187 if (VM_Version::supports_avx512vlbw()) { 3188 __ movl(rax, 0xffff); 3189 __ kmovql(k1, rax); 3190 } 3191 3192 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3193 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3194 3195 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3196 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 3197 3198 // For encryption, the java expanded key ordering is just what we need 3199 // we don't know if the key is aligned, hence not using load-execute form 3200 3201 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); 3202 __ pxor(xmm_result, xmm_temp1); 3203 3204 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3205 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3206 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3207 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3208 3209 __ aesenc(xmm_result, xmm_temp1); 3210 __ aesenc(xmm_result, xmm_temp2); 3211 __ aesenc(xmm_result, xmm_temp3); 3212 __ aesenc(xmm_result, xmm_temp4); 3213 3214 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3215 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3216 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3217 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3218 3219 __ aesenc(xmm_result, xmm_temp1); 3220 __ aesenc(xmm_result, xmm_temp2); 3221 __ aesenc(xmm_result, xmm_temp3); 3222 __ aesenc(xmm_result, xmm_temp4); 3223 3224 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3225 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3226 3227 __ cmpl(keylen, 44); 3228 __ jccb(Assembler::equal, L_doLast); 3229 3230 __ aesenc(xmm_result, xmm_temp1); 3231 __ aesenc(xmm_result, xmm_temp2); 3232 3233 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3234 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3235 3236 __ cmpl(keylen, 52); 3237 __ jccb(Assembler::equal, L_doLast); 3238 3239 __ aesenc(xmm_result, xmm_temp1); 3240 __ aesenc(xmm_result, xmm_temp2); 3241 3242 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3243 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3244 3245 __ BIND(L_doLast); 3246 __ aesenc(xmm_result, xmm_temp1); 3247 __ aesenclast(xmm_result, xmm_temp2); 3248 __ movdqu(Address(to, 0), xmm_result); // store the result 3249 __ xorptr(rax, rax); // return 0 3250 __ leave(); // required for proper stackwalking of RuntimeStub frame 3251 __ ret(0); 3252 3253 return start; 3254 } 3255 3256 3257 // Arguments: 3258 // 3259 // Inputs: 3260 // c_rarg0 - source byte array address 3261 // c_rarg1 - destination byte array address 3262 // c_rarg2 - K (key) in little endian int array 3263 // 3264 address generate_aescrypt_decryptBlock() { 3265 assert(UseAES, "need AES instructions and misaligned SSE support"); 3266 __ align(CodeEntryAlignment); 3267 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3268 Label L_doLast; 3269 address start = __ pc(); 3270 3271 const Register from = c_rarg0; // source array address 3272 const Register to = c_rarg1; // destination array address 3273 const Register key = c_rarg2; // key array address 3274 const Register keylen = rax; 3275 3276 const XMMRegister xmm_result = xmm0; 3277 const XMMRegister xmm_key_shuf_mask = xmm1; 3278 // On win64 xmm6-xmm15 must be preserved so don't use them. 3279 const XMMRegister xmm_temp1 = xmm2; 3280 const XMMRegister xmm_temp2 = xmm3; 3281 const XMMRegister xmm_temp3 = xmm4; 3282 const XMMRegister xmm_temp4 = xmm5; 3283 3284 __ enter(); // required for proper stackwalking of RuntimeStub frame 3285 3286 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 3287 // context for the registers used, where all instructions below are using 128-bit mode 3288 // On EVEX without VL and BW, these instructions will all be AVX. 3289 if (VM_Version::supports_avx512vlbw()) { 3290 __ movl(rax, 0xffff); 3291 __ kmovql(k1, rax); 3292 } 3293 3294 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3295 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3296 3297 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3298 __ movdqu(xmm_result, Address(from, 0)); 3299 3300 // for decryption java expanded key ordering is rotated one position from what we want 3301 // so we start from 0x10 here and hit 0x00 last 3302 // we don't know if the key is aligned, hence not using load-execute form 3303 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3304 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3305 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3306 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3307 3308 __ pxor (xmm_result, xmm_temp1); 3309 __ aesdec(xmm_result, xmm_temp2); 3310 __ aesdec(xmm_result, xmm_temp3); 3311 __ aesdec(xmm_result, xmm_temp4); 3312 3313 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3314 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3315 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3316 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3317 3318 __ aesdec(xmm_result, xmm_temp1); 3319 __ aesdec(xmm_result, xmm_temp2); 3320 __ aesdec(xmm_result, xmm_temp3); 3321 __ aesdec(xmm_result, xmm_temp4); 3322 3323 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3324 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3325 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); 3326 3327 __ cmpl(keylen, 44); 3328 __ jccb(Assembler::equal, L_doLast); 3329 3330 __ aesdec(xmm_result, xmm_temp1); 3331 __ aesdec(xmm_result, xmm_temp2); 3332 3333 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3334 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3335 3336 __ cmpl(keylen, 52); 3337 __ jccb(Assembler::equal, L_doLast); 3338 3339 __ aesdec(xmm_result, xmm_temp1); 3340 __ aesdec(xmm_result, xmm_temp2); 3341 3342 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3343 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3344 3345 __ BIND(L_doLast); 3346 __ aesdec(xmm_result, xmm_temp1); 3347 __ aesdec(xmm_result, xmm_temp2); 3348 3349 // for decryption the aesdeclast operation is always on key+0x00 3350 __ aesdeclast(xmm_result, xmm_temp3); 3351 __ movdqu(Address(to, 0), xmm_result); // store the result 3352 __ xorptr(rax, rax); // return 0 3353 __ leave(); // required for proper stackwalking of RuntimeStub frame 3354 __ ret(0); 3355 3356 return start; 3357 } 3358 3359 3360 // Arguments: 3361 // 3362 // Inputs: 3363 // c_rarg0 - source byte array address 3364 // c_rarg1 - destination byte array address 3365 // c_rarg2 - K (key) in little endian int array 3366 // c_rarg3 - r vector byte array address 3367 // c_rarg4 - input length 3368 // 3369 // Output: 3370 // rax - input length 3371 // 3372 address generate_cipherBlockChaining_encryptAESCrypt() { 3373 assert(UseAES, "need AES instructions and misaligned SSE support"); 3374 __ align(CodeEntryAlignment); 3375 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3376 address start = __ pc(); 3377 3378 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 3379 const Register from = c_rarg0; // source array address 3380 const Register to = c_rarg1; // destination array address 3381 const Register key = c_rarg2; // key array address 3382 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3383 // and left with the results of the last encryption block 3384 #ifndef _WIN64 3385 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3386 #else 3387 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3388 const Register len_reg = r11; // pick the volatile windows register 3389 #endif 3390 const Register pos = rax; 3391 3392 // xmm register assignments for the loops below 3393 const XMMRegister xmm_result = xmm0; 3394 const XMMRegister xmm_temp = xmm1; 3395 // keys 0-10 preloaded into xmm2-xmm12 3396 const int XMM_REG_NUM_KEY_FIRST = 2; 3397 const int XMM_REG_NUM_KEY_LAST = 15; 3398 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3399 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); 3400 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); 3401 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); 3402 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); 3403 3404 __ enter(); // required for proper stackwalking of RuntimeStub frame 3405 3406 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 3407 // context for the registers used, where all instructions below are using 128-bit mode 3408 // On EVEX without VL and BW, these instructions will all be AVX. 3409 if (VM_Version::supports_avx512vlbw()) { 3410 __ movl(rax, 0xffff); 3411 __ kmovql(k1, rax); 3412 } 3413 3414 #ifdef _WIN64 3415 // on win64, fill len_reg from stack position 3416 __ movl(len_reg, len_mem); 3417 #else 3418 __ push(len_reg); // Save 3419 #endif 3420 3421 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 3422 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3423 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 3424 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { 3425 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3426 offset += 0x10; 3427 } 3428 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 3429 3430 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3431 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3432 __ cmpl(rax, 44); 3433 __ jcc(Assembler::notEqual, L_key_192_256); 3434 3435 // 128 bit code follows here 3436 __ movptr(pos, 0); 3437 __ align(OptoLoopAlignment); 3438 3439 __ BIND(L_loopTop_128); 3440 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3441 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3442 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3443 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { 3444 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3445 } 3446 __ aesenclast(xmm_result, xmm_key10); 3447 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3448 // no need to store r to memory until we exit 3449 __ addptr(pos, AESBlockSize); 3450 __ subptr(len_reg, AESBlockSize); 3451 __ jcc(Assembler::notEqual, L_loopTop_128); 3452 3453 __ BIND(L_exit); 3454 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 3455 3456 #ifdef _WIN64 3457 __ movl(rax, len_mem); 3458 #else 3459 __ pop(rax); // return length 3460 #endif 3461 __ leave(); // required for proper stackwalking of RuntimeStub frame 3462 __ ret(0); 3463 3464 __ BIND(L_key_192_256); 3465 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3466 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); 3467 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); 3468 __ cmpl(rax, 52); 3469 __ jcc(Assembler::notEqual, L_key_256); 3470 3471 // 192-bit code follows here (could be changed to use more xmm registers) 3472 __ movptr(pos, 0); 3473 __ align(OptoLoopAlignment); 3474 3475 __ BIND(L_loopTop_192); 3476 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3477 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3478 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3479 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { 3480 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3481 } 3482 __ aesenclast(xmm_result, xmm_key12); 3483 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3484 // no need to store r to memory until we exit 3485 __ addptr(pos, AESBlockSize); 3486 __ subptr(len_reg, AESBlockSize); 3487 __ jcc(Assembler::notEqual, L_loopTop_192); 3488 __ jmp(L_exit); 3489 3490 __ BIND(L_key_256); 3491 // 256-bit code follows here (could be changed to use more xmm registers) 3492 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); 3493 __ movptr(pos, 0); 3494 __ align(OptoLoopAlignment); 3495 3496 __ BIND(L_loopTop_256); 3497 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3498 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3499 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3500 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { 3501 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3502 } 3503 load_key(xmm_temp, key, 0xe0); 3504 __ aesenclast(xmm_result, xmm_temp); 3505 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3506 // no need to store r to memory until we exit 3507 __ addptr(pos, AESBlockSize); 3508 __ subptr(len_reg, AESBlockSize); 3509 __ jcc(Assembler::notEqual, L_loopTop_256); 3510 __ jmp(L_exit); 3511 3512 return start; 3513 } 3514 3515 // Safefetch stubs. 3516 void generate_safefetch(const char* name, int size, address* entry, 3517 address* fault_pc, address* continuation_pc) { 3518 // safefetch signatures: 3519 // int SafeFetch32(int* adr, int errValue); 3520 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3521 // 3522 // arguments: 3523 // c_rarg0 = adr 3524 // c_rarg1 = errValue 3525 // 3526 // result: 3527 // PPC_RET = *adr or errValue 3528 3529 StubCodeMark mark(this, "StubRoutines", name); 3530 3531 // Entry point, pc or function descriptor. 3532 *entry = __ pc(); 3533 3534 // Load *adr into c_rarg1, may fault. 3535 *fault_pc = __ pc(); 3536 switch (size) { 3537 case 4: 3538 // int32_t 3539 __ movl(c_rarg1, Address(c_rarg0, 0)); 3540 break; 3541 case 8: 3542 // int64_t 3543 __ movq(c_rarg1, Address(c_rarg0, 0)); 3544 break; 3545 default: 3546 ShouldNotReachHere(); 3547 } 3548 3549 // return errValue or *adr 3550 *continuation_pc = __ pc(); 3551 __ movq(rax, c_rarg1); 3552 __ ret(0); 3553 } 3554 3555 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 3556 // to hide instruction latency 3557 // 3558 // Arguments: 3559 // 3560 // Inputs: 3561 // c_rarg0 - source byte array address 3562 // c_rarg1 - destination byte array address 3563 // c_rarg2 - K (key) in little endian int array 3564 // c_rarg3 - r vector byte array address 3565 // c_rarg4 - input length 3566 // 3567 // Output: 3568 // rax - input length 3569 // 3570 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3571 assert(UseAES, "need AES instructions and misaligned SSE support"); 3572 __ align(CodeEntryAlignment); 3573 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3574 address start = __ pc(); 3575 3576 const Register from = c_rarg0; // source array address 3577 const Register to = c_rarg1; // destination array address 3578 const Register key = c_rarg2; // key array address 3579 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3580 // and left with the results of the last encryption block 3581 #ifndef _WIN64 3582 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3583 #else 3584 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3585 const Register len_reg = r11; // pick the volatile windows register 3586 #endif 3587 const Register pos = rax; 3588 3589 const int PARALLEL_FACTOR = 4; 3590 const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256 3591 3592 Label L_exit; 3593 Label L_singleBlock_loopTopHead[3]; // 128, 192, 256 3594 Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256 3595 Label L_singleBlock_loopTop[3]; // 128, 192, 256 3596 Label L_multiBlock_loopTopHead[3]; // 128, 192, 256 3597 Label L_multiBlock_loopTop[3]; // 128, 192, 256 3598 3599 // keys 0-10 preloaded into xmm5-xmm15 3600 const int XMM_REG_NUM_KEY_FIRST = 5; 3601 const int XMM_REG_NUM_KEY_LAST = 15; 3602 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3603 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3604 3605 __ enter(); // required for proper stackwalking of RuntimeStub frame 3606 3607 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 3608 // context for the registers used, where all instructions below are using 128-bit mode 3609 // On EVEX without VL and BW, these instructions will all be AVX. 3610 if (VM_Version::supports_avx512vlbw()) { 3611 __ movl(rax, 0xffff); 3612 __ kmovql(k1, rax); 3613 } 3614 3615 #ifdef _WIN64 3616 // on win64, fill len_reg from stack position 3617 __ movl(len_reg, len_mem); 3618 #else 3619 __ push(len_reg); // Save 3620 #endif 3621 __ push(rbx); 3622 // the java expanded key ordering is rotated one position from what we want 3623 // so we start from 0x10 here and hit 0x00 last 3624 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 3625 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3626 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 3627 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { 3628 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3629 offset += 0x10; 3630 } 3631 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); 3632 3633 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block 3634 3635 // registers holding the four results in the parallelized loop 3636 const XMMRegister xmm_result0 = xmm0; 3637 const XMMRegister xmm_result1 = xmm2; 3638 const XMMRegister xmm_result2 = xmm3; 3639 const XMMRegister xmm_result3 = xmm4; 3640 3641 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 3642 3643 __ xorptr(pos, pos); 3644 3645 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3646 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3647 __ cmpl(rbx, 52); 3648 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]); 3649 __ cmpl(rbx, 60); 3650 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]); 3651 3652 #define DoFour(opc, src_reg) \ 3653 __ opc(xmm_result0, src_reg); \ 3654 __ opc(xmm_result1, src_reg); \ 3655 __ opc(xmm_result2, src_reg); \ 3656 __ opc(xmm_result3, src_reg); \ 3657 3658 for (int k = 0; k < 3; ++k) { 3659 __ BIND(L_multiBlock_loopTopHead[k]); 3660 if (k != 0) { 3661 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3662 __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]); 3663 } 3664 if (k == 1) { 3665 __ subptr(rsp, 6 * wordSize); 3666 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3667 load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0 3668 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3669 load_key(xmm1, key, 0xc0); // 0xc0; 3670 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3671 } else if (k == 2) { 3672 __ subptr(rsp, 10 * wordSize); 3673 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3674 load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes upto 0xe0 3675 __ movdqu(Address(rsp, 6 * wordSize), xmm15); 3676 load_key(xmm1, key, 0xe0); // 0xe0; 3677 __ movdqu(Address(rsp, 8 * wordSize), xmm1); 3678 load_key(xmm15, key, 0xb0); // 0xb0; 3679 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3680 load_key(xmm1, key, 0xc0); // 0xc0; 3681 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3682 } 3683 __ align(OptoLoopAlignment); 3684 __ BIND(L_multiBlock_loopTop[k]); 3685 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3686 __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]); 3687 3688 if (k != 0) { 3689 __ movdqu(xmm15, Address(rsp, 2 * wordSize)); 3690 __ movdqu(xmm1, Address(rsp, 4 * wordSize)); 3691 } 3692 3693 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers 3694 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3695 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3696 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 3697 3698 DoFour(pxor, xmm_key_first); 3699 if (k == 0) { 3700 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) { 3701 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3702 } 3703 DoFour(aesdeclast, xmm_key_last); 3704 } else if (k == 1) { 3705 for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) { 3706 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3707 } 3708 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. 3709 DoFour(aesdec, xmm1); // key : 0xc0 3710 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3711 DoFour(aesdeclast, xmm_key_last); 3712 } else if (k == 2) { 3713 for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) { 3714 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3715 } 3716 DoFour(aesdec, xmm1); // key : 0xc0 3717 __ movdqu(xmm15, Address(rsp, 6 * wordSize)); 3718 __ movdqu(xmm1, Address(rsp, 8 * wordSize)); 3719 DoFour(aesdec, xmm15); // key : 0xd0 3720 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. 3721 DoFour(aesdec, xmm1); // key : 0xe0 3722 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3723 DoFour(aesdeclast, xmm_key_last); 3724 } 3725 3726 // for each result, xor with the r vector of previous cipher block 3727 __ pxor(xmm_result0, xmm_prev_block_cipher); 3728 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 3729 __ pxor(xmm_result1, xmm_prev_block_cipher); 3730 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3731 __ pxor(xmm_result2, xmm_prev_block_cipher); 3732 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3733 __ pxor(xmm_result3, xmm_prev_block_cipher); 3734 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks 3735 if (k != 0) { 3736 __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher); 3737 } 3738 3739 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 3740 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 3741 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 3742 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 3743 3744 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); 3745 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); 3746 __ jmp(L_multiBlock_loopTop[k]); 3747 3748 // registers used in the non-parallelized loops 3749 // xmm register assignments for the loops below 3750 const XMMRegister xmm_result = xmm0; 3751 const XMMRegister xmm_prev_block_cipher_save = xmm2; 3752 const XMMRegister xmm_key11 = xmm3; 3753 const XMMRegister xmm_key12 = xmm4; 3754 const XMMRegister key_tmp = xmm4; 3755 3756 __ BIND(L_singleBlock_loopTopHead[k]); 3757 if (k == 1) { 3758 __ addptr(rsp, 6 * wordSize); 3759 } else if (k == 2) { 3760 __ addptr(rsp, 10 * wordSize); 3761 } 3762 __ cmpptr(len_reg, 0); // any blocks left?? 3763 __ jcc(Assembler::equal, L_exit); 3764 __ BIND(L_singleBlock_loopTopHead2[k]); 3765 if (k == 1) { 3766 load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes upto 0xc0 3767 load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes upto 0xc0 3768 } 3769 if (k == 2) { 3770 load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes upto 0xe0 3771 } 3772 __ align(OptoLoopAlignment); 3773 __ BIND(L_singleBlock_loopTop[k]); 3774 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3775 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3776 __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds 3777 for (int rnum = 1; rnum <= 9 ; rnum++) { 3778 __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3779 } 3780 if (k == 1) { 3781 __ aesdec(xmm_result, xmm_key11); 3782 __ aesdec(xmm_result, xmm_key12); 3783 } 3784 if (k == 2) { 3785 __ aesdec(xmm_result, xmm_key11); 3786 load_key(key_tmp, key, 0xc0); 3787 __ aesdec(xmm_result, key_tmp); 3788 load_key(key_tmp, key, 0xd0); 3789 __ aesdec(xmm_result, key_tmp); 3790 load_key(key_tmp, key, 0xe0); 3791 __ aesdec(xmm_result, key_tmp); 3792 } 3793 3794 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 3795 __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3796 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3797 // no need to store r to memory until we exit 3798 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3799 __ addptr(pos, AESBlockSize); 3800 __ subptr(len_reg, AESBlockSize); 3801 __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); 3802 if (k != 2) { 3803 __ jmp(L_exit); 3804 } 3805 } //for 128/192/256 3806 3807 __ BIND(L_exit); 3808 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 3809 __ pop(rbx); 3810 #ifdef _WIN64 3811 __ movl(rax, len_mem); 3812 #else 3813 __ pop(rax); // return length 3814 #endif 3815 __ leave(); // required for proper stackwalking of RuntimeStub frame 3816 __ ret(0); 3817 return start; 3818 } 3819 3820 address generate_upper_word_mask() { 3821 __ align(64); 3822 StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); 3823 address start = __ pc(); 3824 __ emit_data64(0x0000000000000000, relocInfo::none); 3825 __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); 3826 return start; 3827 } 3828 3829 address generate_shuffle_byte_flip_mask() { 3830 __ align(64); 3831 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); 3832 address start = __ pc(); 3833 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3834 __ emit_data64(0x0001020304050607, relocInfo::none); 3835 return start; 3836 } 3837 3838 // ofs and limit are use for multi-block byte array. 3839 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 3840 address generate_sha1_implCompress(bool multi_block, const char *name) { 3841 __ align(CodeEntryAlignment); 3842 StubCodeMark mark(this, "StubRoutines", name); 3843 address start = __ pc(); 3844 3845 Register buf = c_rarg0; 3846 Register state = c_rarg1; 3847 Register ofs = c_rarg2; 3848 Register limit = c_rarg3; 3849 3850 const XMMRegister abcd = xmm0; 3851 const XMMRegister e0 = xmm1; 3852 const XMMRegister e1 = xmm2; 3853 const XMMRegister msg0 = xmm3; 3854 3855 const XMMRegister msg1 = xmm4; 3856 const XMMRegister msg2 = xmm5; 3857 const XMMRegister msg3 = xmm6; 3858 const XMMRegister shuf_mask = xmm7; 3859 3860 __ enter(); 3861 3862 __ subptr(rsp, 4 * wordSize); 3863 3864 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, 3865 buf, state, ofs, limit, rsp, multi_block); 3866 3867 __ addptr(rsp, 4 * wordSize); 3868 3869 __ leave(); 3870 __ ret(0); 3871 return start; 3872 } 3873 3874 address generate_pshuffle_byte_flip_mask() { 3875 __ align(64); 3876 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); 3877 address start = __ pc(); 3878 __ emit_data64(0x0405060700010203, relocInfo::none); 3879 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 3880 3881 if (VM_Version::supports_avx2()) { 3882 __ emit_data64(0x0405060700010203, relocInfo::none); // second copy 3883 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 3884 // _SHUF_00BA 3885 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3886 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3887 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3888 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3889 // _SHUF_DC00 3890 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3891 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3892 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3893 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3894 } 3895 3896 return start; 3897 } 3898 3899 //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 3900 address generate_pshuffle_byte_flip_mask_sha512() { 3901 __ align(32); 3902 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512"); 3903 address start = __ pc(); 3904 if (VM_Version::supports_avx2()) { 3905 __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK 3906 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3907 __ emit_data64(0x1011121314151617, relocInfo::none); 3908 __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none); 3909 __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO 3910 __ emit_data64(0x0000000000000000, relocInfo::none); 3911 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3912 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3913 } 3914 3915 return start; 3916 } 3917 3918 // ofs and limit are use for multi-block byte array. 3919 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 3920 address generate_sha256_implCompress(bool multi_block, const char *name) { 3921 assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), ""); 3922 __ align(CodeEntryAlignment); 3923 StubCodeMark mark(this, "StubRoutines", name); 3924 address start = __ pc(); 3925 3926 Register buf = c_rarg0; 3927 Register state = c_rarg1; 3928 Register ofs = c_rarg2; 3929 Register limit = c_rarg3; 3930 3931 const XMMRegister msg = xmm0; 3932 const XMMRegister state0 = xmm1; 3933 const XMMRegister state1 = xmm2; 3934 const XMMRegister msgtmp0 = xmm3; 3935 3936 const XMMRegister msgtmp1 = xmm4; 3937 const XMMRegister msgtmp2 = xmm5; 3938 const XMMRegister msgtmp3 = xmm6; 3939 const XMMRegister msgtmp4 = xmm7; 3940 3941 const XMMRegister shuf_mask = xmm8; 3942 3943 __ enter(); 3944 3945 __ subptr(rsp, 4 * wordSize); 3946 3947 if (VM_Version::supports_sha()) { 3948 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3949 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3950 } else if (VM_Version::supports_avx2()) { 3951 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3952 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3953 } 3954 __ addptr(rsp, 4 * wordSize); 3955 __ vzeroupper(); 3956 __ leave(); 3957 __ ret(0); 3958 return start; 3959 } 3960 3961 address generate_sha512_implCompress(bool multi_block, const char *name) { 3962 assert(VM_Version::supports_avx2(), ""); 3963 assert(VM_Version::supports_bmi2(), ""); 3964 __ align(CodeEntryAlignment); 3965 StubCodeMark mark(this, "StubRoutines", name); 3966 address start = __ pc(); 3967 3968 Register buf = c_rarg0; 3969 Register state = c_rarg1; 3970 Register ofs = c_rarg2; 3971 Register limit = c_rarg3; 3972 3973 const XMMRegister msg = xmm0; 3974 const XMMRegister state0 = xmm1; 3975 const XMMRegister state1 = xmm2; 3976 const XMMRegister msgtmp0 = xmm3; 3977 const XMMRegister msgtmp1 = xmm4; 3978 const XMMRegister msgtmp2 = xmm5; 3979 const XMMRegister msgtmp3 = xmm6; 3980 const XMMRegister msgtmp4 = xmm7; 3981 3982 const XMMRegister shuf_mask = xmm8; 3983 3984 __ enter(); 3985 3986 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3987 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3988 3989 __ vzeroupper(); 3990 __ leave(); 3991 __ ret(0); 3992 return start; 3993 } 3994 3995 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time 3996 // to hide instruction latency 3997 // 3998 // Arguments: 3999 // 4000 // Inputs: 4001 // c_rarg0 - source byte array address 4002 // c_rarg1 - destination byte array address 4003 // c_rarg2 - K (key) in little endian int array 4004 // c_rarg3 - counter vector byte array address 4005 // Linux 4006 // c_rarg4 - input length 4007 // c_rarg5 - saved encryptedCounter start 4008 // rbp + 6 * wordSize - saved used length 4009 // Windows 4010 // rbp + 6 * wordSize - input length 4011 // rbp + 7 * wordSize - saved encryptedCounter start 4012 // rbp + 8 * wordSize - saved used length 4013 // 4014 // Output: 4015 // rax - input length 4016 // 4017 address generate_counterMode_AESCrypt_Parallel() { 4018 assert(UseAES, "need AES instructions and misaligned SSE support"); 4019 __ align(CodeEntryAlignment); 4020 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 4021 address start = __ pc(); 4022 const Register from = c_rarg0; // source array address 4023 const Register to = c_rarg1; // destination array address 4024 const Register key = c_rarg2; // key array address 4025 const Register counter = c_rarg3; // counter byte array initialized from counter array address 4026 // and updated with the incremented counter in the end 4027 #ifndef _WIN64 4028 const Register len_reg = c_rarg4; 4029 const Register saved_encCounter_start = c_rarg5; 4030 const Register used_addr = r10; 4031 const Address used_mem(rbp, 2 * wordSize); 4032 const Register used = r11; 4033 #else 4034 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 4035 const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64 4036 const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64 4037 const Register len_reg = r10; // pick the first volatile windows register 4038 const Register saved_encCounter_start = r11; 4039 const Register used_addr = r13; 4040 const Register used = r14; 4041 #endif 4042 const Register pos = rax; 4043 4044 const int PARALLEL_FACTOR = 6; 4045 const XMMRegister xmm_counter_shuf_mask = xmm0; 4046 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 4047 const XMMRegister xmm_curr_counter = xmm2; 4048 4049 const XMMRegister xmm_key_tmp0 = xmm3; 4050 const XMMRegister xmm_key_tmp1 = xmm4; 4051 4052 // registers holding the four results in the parallelized loop 4053 const XMMRegister xmm_result0 = xmm5; 4054 const XMMRegister xmm_result1 = xmm6; 4055 const XMMRegister xmm_result2 = xmm7; 4056 const XMMRegister xmm_result3 = xmm8; 4057 const XMMRegister xmm_result4 = xmm9; 4058 const XMMRegister xmm_result5 = xmm10; 4059 4060 const XMMRegister xmm_from0 = xmm11; 4061 const XMMRegister xmm_from1 = xmm12; 4062 const XMMRegister xmm_from2 = xmm13; 4063 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. 4064 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text 4065 const XMMRegister xmm_from5 = xmm4; 4066 4067 //for key_128, key_192, key_256 4068 const int rounds[3] = {10, 12, 14}; 4069 Label L_exit_preLoop, L_preLoop_start; 4070 Label L_multiBlock_loopTop[3]; 4071 Label L_singleBlockLoopTop[3]; 4072 Label L__incCounter[3][6]; //for 6 blocks 4073 Label L__incCounter_single[3]; //for single block, key128, key192, key256 4074 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; 4075 Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; 4076 4077 Label L_exit; 4078 4079 __ enter(); // required for proper stackwalking of RuntimeStub frame 4080 4081 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 4082 // context for the registers used, where all instructions below are using 128-bit mode 4083 // On EVEX without VL and BW, these instructions will all be AVX. 4084 if (VM_Version::supports_avx512vlbw()) { 4085 __ movl(rax, 0xffff); 4086 __ kmovql(k1, rax); 4087 } 4088 4089 #ifdef _WIN64 4090 // allocate spill slots for r13, r14 4091 enum { 4092 saved_r13_offset, 4093 saved_r14_offset 4094 }; 4095 __ subptr(rsp, 2 * wordSize); 4096 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 4097 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 4098 4099 // on win64, fill len_reg from stack position 4100 __ movl(len_reg, len_mem); 4101 __ movptr(saved_encCounter_start, saved_encCounter_mem); 4102 __ movptr(used_addr, used_mem); 4103 __ movl(used, Address(used_addr, 0)); 4104 #else 4105 __ push(len_reg); // Save 4106 __ movptr(used_addr, used_mem); 4107 __ movl(used, Address(used_addr, 0)); 4108 #endif 4109 4110 __ push(rbx); // Save RBX 4111 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter 4112 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch 4113 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled 4114 __ movptr(pos, 0); 4115 4116 // Use the partially used encrpyted counter from last invocation 4117 __ BIND(L_preLoop_start); 4118 __ cmpptr(used, 16); 4119 __ jcc(Assembler::aboveEqual, L_exit_preLoop); 4120 __ cmpptr(len_reg, 0); 4121 __ jcc(Assembler::lessEqual, L_exit_preLoop); 4122 __ movb(rbx, Address(saved_encCounter_start, used)); 4123 __ xorb(rbx, Address(from, pos)); 4124 __ movb(Address(to, pos), rbx); 4125 __ addptr(pos, 1); 4126 __ addptr(used, 1); 4127 __ subptr(len_reg, 1); 4128 4129 __ jmp(L_preLoop_start); 4130 4131 __ BIND(L_exit_preLoop); 4132 __ movl(Address(used_addr, 0), used); 4133 4134 // key length could be only {11, 13, 15} * 4 = {44, 52, 60} 4135 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch 4136 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4137 __ cmpl(rbx, 52); 4138 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); 4139 __ cmpl(rbx, 60); 4140 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); 4141 4142 #define CTR_DoSix(opc, src_reg) \ 4143 __ opc(xmm_result0, src_reg); \ 4144 __ opc(xmm_result1, src_reg); \ 4145 __ opc(xmm_result2, src_reg); \ 4146 __ opc(xmm_result3, src_reg); \ 4147 __ opc(xmm_result4, src_reg); \ 4148 __ opc(xmm_result5, src_reg); 4149 4150 // k == 0 : generate code for key_128 4151 // k == 1 : generate code for key_192 4152 // k == 2 : generate code for key_256 4153 for (int k = 0; k < 3; ++k) { 4154 //multi blocks starts here 4155 __ align(OptoLoopAlignment); 4156 __ BIND(L_multiBlock_loopTop[k]); 4157 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left 4158 __ jcc(Assembler::less, L_singleBlockLoopTop[k]); 4159 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4160 4161 //load, then increase counters 4162 CTR_DoSix(movdqa, xmm_curr_counter); 4163 inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); 4164 inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); 4165 inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); 4166 inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); 4167 inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); 4168 inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); 4169 CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR 4170 CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key 4171 4172 //load two ROUND_KEYs at a time 4173 for (int i = 1; i < rounds[k]; ) { 4174 load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); 4175 load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); 4176 CTR_DoSix(aesenc, xmm_key_tmp1); 4177 i++; 4178 if (i != rounds[k]) { 4179 CTR_DoSix(aesenc, xmm_key_tmp0); 4180 } else { 4181 CTR_DoSix(aesenclast, xmm_key_tmp0); 4182 } 4183 i++; 4184 } 4185 4186 // get next PARALLEL_FACTOR blocks into xmm_result registers 4187 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 4188 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 4189 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 4190 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 4191 __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); 4192 __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); 4193 4194 __ pxor(xmm_result0, xmm_from0); 4195 __ pxor(xmm_result1, xmm_from1); 4196 __ pxor(xmm_result2, xmm_from2); 4197 __ pxor(xmm_result3, xmm_from3); 4198 __ pxor(xmm_result4, xmm_from4); 4199 __ pxor(xmm_result5, xmm_from5); 4200 4201 // store 6 results into the next 64 bytes of output 4202 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4203 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 4204 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 4205 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 4206 __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); 4207 __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); 4208 4209 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text 4210 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length 4211 __ jmp(L_multiBlock_loopTop[k]); 4212 4213 // singleBlock starts here 4214 __ align(OptoLoopAlignment); 4215 __ BIND(L_singleBlockLoopTop[k]); 4216 __ cmpptr(len_reg, 0); 4217 __ jcc(Assembler::lessEqual, L_exit); 4218 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4219 __ movdqa(xmm_result0, xmm_curr_counter); 4220 inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); 4221 __ pshufb(xmm_result0, xmm_counter_shuf_mask); 4222 __ pxor(xmm_result0, xmm_key_tmp0); 4223 for (int i = 1; i < rounds[k]; i++) { 4224 load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); 4225 __ aesenc(xmm_result0, xmm_key_tmp0); 4226 } 4227 load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); 4228 __ aesenclast(xmm_result0, xmm_key_tmp0); 4229 __ cmpptr(len_reg, AESBlockSize); 4230 __ jcc(Assembler::less, L_processTail_insr[k]); 4231 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 4232 __ pxor(xmm_result0, xmm_from0); 4233 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4234 __ addptr(pos, AESBlockSize); 4235 __ subptr(len_reg, AESBlockSize); 4236 __ jmp(L_singleBlockLoopTop[k]); 4237 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array 4238 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register 4239 __ testptr(len_reg, 8); 4240 __ jcc(Assembler::zero, L_processTail_4_insr[k]); 4241 __ subptr(pos,8); 4242 __ pinsrq(xmm_from0, Address(from, pos), 0); 4243 __ BIND(L_processTail_4_insr[k]); 4244 __ testptr(len_reg, 4); 4245 __ jcc(Assembler::zero, L_processTail_2_insr[k]); 4246 __ subptr(pos,4); 4247 __ pslldq(xmm_from0, 4); 4248 __ pinsrd(xmm_from0, Address(from, pos), 0); 4249 __ BIND(L_processTail_2_insr[k]); 4250 __ testptr(len_reg, 2); 4251 __ jcc(Assembler::zero, L_processTail_1_insr[k]); 4252 __ subptr(pos, 2); 4253 __ pslldq(xmm_from0, 2); 4254 __ pinsrw(xmm_from0, Address(from, pos), 0); 4255 __ BIND(L_processTail_1_insr[k]); 4256 __ testptr(len_reg, 1); 4257 __ jcc(Assembler::zero, L_processTail_exit_insr[k]); 4258 __ subptr(pos, 1); 4259 __ pslldq(xmm_from0, 1); 4260 __ pinsrb(xmm_from0, Address(from, pos), 0); 4261 __ BIND(L_processTail_exit_insr[k]); 4262 4263 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes. 4264 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation. 4265 4266 __ testptr(len_reg, 8); 4267 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array 4268 __ pextrq(Address(to, pos), xmm_result0, 0); 4269 __ psrldq(xmm_result0, 8); 4270 __ addptr(pos, 8); 4271 __ BIND(L_processTail_4_extr[k]); 4272 __ testptr(len_reg, 4); 4273 __ jcc(Assembler::zero, L_processTail_2_extr[k]); 4274 __ pextrd(Address(to, pos), xmm_result0, 0); 4275 __ psrldq(xmm_result0, 4); 4276 __ addptr(pos, 4); 4277 __ BIND(L_processTail_2_extr[k]); 4278 __ testptr(len_reg, 2); 4279 __ jcc(Assembler::zero, L_processTail_1_extr[k]); 4280 __ pextrw(Address(to, pos), xmm_result0, 0); 4281 __ psrldq(xmm_result0, 2); 4282 __ addptr(pos, 2); 4283 __ BIND(L_processTail_1_extr[k]); 4284 __ testptr(len_reg, 1); 4285 __ jcc(Assembler::zero, L_processTail_exit_extr[k]); 4286 __ pextrb(Address(to, pos), xmm_result0, 0); 4287 4288 __ BIND(L_processTail_exit_extr[k]); 4289 __ movl(Address(used_addr, 0), len_reg); 4290 __ jmp(L_exit); 4291 4292 } 4293 4294 __ BIND(L_exit); 4295 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 4296 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back 4297 __ pop(rbx); // pop the saved RBX. 4298 #ifdef _WIN64 4299 __ movl(rax, len_mem); 4300 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 4301 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 4302 __ addptr(rsp, 2 * wordSize); 4303 #else 4304 __ pop(rax); // return 'len' 4305 #endif 4306 __ leave(); // required for proper stackwalking of RuntimeStub frame 4307 __ ret(0); 4308 return start; 4309 } 4310 4311 // byte swap x86 long 4312 address generate_ghash_long_swap_mask() { 4313 __ align(CodeEntryAlignment); 4314 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); 4315 address start = __ pc(); 4316 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); 4317 __ emit_data64(0x0706050403020100, relocInfo::none ); 4318 return start; 4319 } 4320 4321 // byte swap x86 byte array 4322 address generate_ghash_byte_swap_mask() { 4323 __ align(CodeEntryAlignment); 4324 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); 4325 address start = __ pc(); 4326 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); 4327 __ emit_data64(0x0001020304050607, relocInfo::none ); 4328 return start; 4329 } 4330 4331 /* Single and multi-block ghash operations */ 4332 address generate_ghash_processBlocks() { 4333 __ align(CodeEntryAlignment); 4334 Label L_ghash_loop, L_exit; 4335 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4336 address start = __ pc(); 4337 4338 const Register state = c_rarg0; 4339 const Register subkeyH = c_rarg1; 4340 const Register data = c_rarg2; 4341 const Register blocks = c_rarg3; 4342 4343 const XMMRegister xmm_temp0 = xmm0; 4344 const XMMRegister xmm_temp1 = xmm1; 4345 const XMMRegister xmm_temp2 = xmm2; 4346 const XMMRegister xmm_temp3 = xmm3; 4347 const XMMRegister xmm_temp4 = xmm4; 4348 const XMMRegister xmm_temp5 = xmm5; 4349 const XMMRegister xmm_temp6 = xmm6; 4350 const XMMRegister xmm_temp7 = xmm7; 4351 const XMMRegister xmm_temp8 = xmm8; 4352 const XMMRegister xmm_temp9 = xmm9; 4353 const XMMRegister xmm_temp10 = xmm10; 4354 4355 __ enter(); 4356 4357 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 4358 // context for the registers used, where all instructions below are using 128-bit mode 4359 // On EVEX without VL and BW, these instructions will all be AVX. 4360 if (VM_Version::supports_avx512vlbw()) { 4361 __ movl(rax, 0xffff); 4362 __ kmovql(k1, rax); 4363 } 4364 4365 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); 4366 4367 __ movdqu(xmm_temp0, Address(state, 0)); 4368 __ pshufb(xmm_temp0, xmm_temp10); 4369 4370 4371 __ BIND(L_ghash_loop); 4372 __ movdqu(xmm_temp2, Address(data, 0)); 4373 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); 4374 4375 __ movdqu(xmm_temp1, Address(subkeyH, 0)); 4376 __ pshufb(xmm_temp1, xmm_temp10); 4377 4378 __ pxor(xmm_temp0, xmm_temp2); 4379 4380 // 4381 // Multiply with the hash key 4382 // 4383 __ movdqu(xmm_temp3, xmm_temp0); 4384 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 4385 __ movdqu(xmm_temp4, xmm_temp0); 4386 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 4387 4388 __ movdqu(xmm_temp5, xmm_temp0); 4389 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 4390 __ movdqu(xmm_temp6, xmm_temp0); 4391 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 4392 4393 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 4394 4395 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 4396 __ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right 4397 __ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left 4398 __ pxor(xmm_temp3, xmm_temp5); 4399 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result 4400 // of the carry-less multiplication of 4401 // xmm0 by xmm1. 4402 4403 // We shift the result of the multiplication by one bit position 4404 // to the left to cope for the fact that the bits are reversed. 4405 __ movdqu(xmm_temp7, xmm_temp3); 4406 __ movdqu(xmm_temp8, xmm_temp6); 4407 __ pslld(xmm_temp3, 1); 4408 __ pslld(xmm_temp6, 1); 4409 __ psrld(xmm_temp7, 31); 4410 __ psrld(xmm_temp8, 31); 4411 __ movdqu(xmm_temp9, xmm_temp7); 4412 __ pslldq(xmm_temp8, 4); 4413 __ pslldq(xmm_temp7, 4); 4414 __ psrldq(xmm_temp9, 12); 4415 __ por(xmm_temp3, xmm_temp7); 4416 __ por(xmm_temp6, xmm_temp8); 4417 __ por(xmm_temp6, xmm_temp9); 4418 4419 // 4420 // First phase of the reduction 4421 // 4422 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts 4423 // independently. 4424 __ movdqu(xmm_temp7, xmm_temp3); 4425 __ movdqu(xmm_temp8, xmm_temp3); 4426 __ movdqu(xmm_temp9, xmm_temp3); 4427 __ pslld(xmm_temp7, 31); // packed right shift shifting << 31 4428 __ pslld(xmm_temp8, 30); // packed right shift shifting << 30 4429 __ pslld(xmm_temp9, 25); // packed right shift shifting << 25 4430 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions 4431 __ pxor(xmm_temp7, xmm_temp9); 4432 __ movdqu(xmm_temp8, xmm_temp7); 4433 __ pslldq(xmm_temp7, 12); 4434 __ psrldq(xmm_temp8, 4); 4435 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete 4436 4437 // 4438 // Second phase of the reduction 4439 // 4440 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these 4441 // shift operations. 4442 __ movdqu(xmm_temp2, xmm_temp3); 4443 __ movdqu(xmm_temp4, xmm_temp3); 4444 __ movdqu(xmm_temp5, xmm_temp3); 4445 __ psrld(xmm_temp2, 1); // packed left shifting >> 1 4446 __ psrld(xmm_temp4, 2); // packed left shifting >> 2 4447 __ psrld(xmm_temp5, 7); // packed left shifting >> 7 4448 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions 4449 __ pxor(xmm_temp2, xmm_temp5); 4450 __ pxor(xmm_temp2, xmm_temp8); 4451 __ pxor(xmm_temp3, xmm_temp2); 4452 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 4453 4454 __ decrement(blocks); 4455 __ jcc(Assembler::zero, L_exit); 4456 __ movdqu(xmm_temp0, xmm_temp6); 4457 __ addptr(data, 16); 4458 __ jmp(L_ghash_loop); 4459 4460 __ BIND(L_exit); 4461 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result 4462 __ movdqu(Address(state, 0), xmm_temp6); // store the result 4463 __ leave(); 4464 __ ret(0); 4465 return start; 4466 } 4467 4468 /** 4469 * Arguments: 4470 * 4471 * Inputs: 4472 * c_rarg0 - int crc 4473 * c_rarg1 - byte* buf 4474 * c_rarg2 - int length 4475 * 4476 * Ouput: 4477 * rax - int crc result 4478 */ 4479 address generate_updateBytesCRC32() { 4480 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions"); 4481 4482 __ align(CodeEntryAlignment); 4483 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4484 4485 address start = __ pc(); 4486 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 4487 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 4488 // rscratch1: r10 4489 const Register crc = c_rarg0; // crc 4490 const Register buf = c_rarg1; // source java byte array address 4491 const Register len = c_rarg2; // length 4492 const Register table = c_rarg3; // crc_table address (reuse register) 4493 const Register tmp = r11; 4494 assert_different_registers(crc, buf, len, table, tmp, rax); 4495 4496 BLOCK_COMMENT("Entry:"); 4497 __ enter(); // required for proper stackwalking of RuntimeStub frame 4498 4499 __ kernel_crc32(crc, buf, len, table, tmp); 4500 4501 __ movl(rax, crc); 4502 __ vzeroupper(); 4503 __ leave(); // required for proper stackwalking of RuntimeStub frame 4504 __ ret(0); 4505 4506 return start; 4507 } 4508 4509 /** 4510 * Arguments: 4511 * 4512 * Inputs: 4513 * c_rarg0 - int crc 4514 * c_rarg1 - byte* buf 4515 * c_rarg2 - long length 4516 * c_rarg3 - table_start - optional (present only when doing a library_call, 4517 * not used by x86 algorithm) 4518 * 4519 * Ouput: 4520 * rax - int crc result 4521 */ 4522 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) { 4523 assert(UseCRC32CIntrinsics, "need SSE4_2"); 4524 __ align(CodeEntryAlignment); 4525 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4526 address start = __ pc(); 4527 //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs 4528 //Windows RCX RDX R8 R9 none none XMM0..XMM3 4529 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 4530 const Register crc = c_rarg0; // crc 4531 const Register buf = c_rarg1; // source java byte array address 4532 const Register len = c_rarg2; // length 4533 const Register a = rax; 4534 const Register j = r9; 4535 const Register k = r10; 4536 const Register l = r11; 4537 #ifdef _WIN64 4538 const Register y = rdi; 4539 const Register z = rsi; 4540 #else 4541 const Register y = rcx; 4542 const Register z = r8; 4543 #endif 4544 assert_different_registers(crc, buf, len, a, j, k, l, y, z); 4545 4546 BLOCK_COMMENT("Entry:"); 4547 __ enter(); // required for proper stackwalking of RuntimeStub frame 4548 #ifdef _WIN64 4549 __ push(y); 4550 __ push(z); 4551 #endif 4552 __ crc32c_ipl_alg2_alt2(crc, buf, len, 4553 a, j, k, 4554 l, y, z, 4555 c_farg0, c_farg1, c_farg2, 4556 is_pclmulqdq_supported); 4557 __ movl(rax, crc); 4558 #ifdef _WIN64 4559 __ pop(z); 4560 __ pop(y); 4561 #endif 4562 __ vzeroupper(); 4563 __ leave(); // required for proper stackwalking of RuntimeStub frame 4564 __ ret(0); 4565 4566 return start; 4567 } 4568 4569 /** 4570 * Arguments: 4571 * 4572 * Input: 4573 * c_rarg0 - x address 4574 * c_rarg1 - x length 4575 * c_rarg2 - y address 4576 * c_rarg3 - y lenth 4577 * not Win64 4578 * c_rarg4 - z address 4579 * c_rarg5 - z length 4580 * Win64 4581 * rsp+40 - z address 4582 * rsp+48 - z length 4583 */ 4584 address generate_multiplyToLen() { 4585 __ align(CodeEntryAlignment); 4586 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4587 4588 address start = __ pc(); 4589 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 4590 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 4591 const Register x = rdi; 4592 const Register xlen = rax; 4593 const Register y = rsi; 4594 const Register ylen = rcx; 4595 const Register z = r8; 4596 const Register zlen = r11; 4597 4598 // Next registers will be saved on stack in multiply_to_len(). 4599 const Register tmp1 = r12; 4600 const Register tmp2 = r13; 4601 const Register tmp3 = r14; 4602 const Register tmp4 = r15; 4603 const Register tmp5 = rbx; 4604 4605 BLOCK_COMMENT("Entry:"); 4606 __ enter(); // required for proper stackwalking of RuntimeStub frame 4607 4608 #ifndef _WIN64 4609 __ movptr(zlen, r9); // Save r9 in r11 - zlen 4610 #endif 4611 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx 4612 // ylen => rcx, z => r8, zlen => r11 4613 // r9 and r10 may be used to save non-volatile registers 4614 #ifdef _WIN64 4615 // last 2 arguments (#4, #5) are on stack on Win64 4616 __ movptr(z, Address(rsp, 6 * wordSize)); 4617 __ movptr(zlen, Address(rsp, 7 * wordSize)); 4618 #endif 4619 4620 __ movptr(xlen, rsi); 4621 __ movptr(y, rdx); 4622 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5); 4623 4624 restore_arg_regs(); 4625 4626 __ leave(); // required for proper stackwalking of RuntimeStub frame 4627 __ ret(0); 4628 4629 return start; 4630 } 4631 4632 /** 4633 * Arguments: 4634 * 4635 * Input: 4636 * c_rarg0 - obja address 4637 * c_rarg1 - objb address 4638 * c_rarg3 - length length 4639 * c_rarg4 - scale log2_array_indxscale 4640 * 4641 * Output: 4642 * rax - int >= mismatched index, < 0 bitwise complement of tail 4643 */ 4644 address generate_vectorizedMismatch() { 4645 __ align(CodeEntryAlignment); 4646 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch"); 4647 address start = __ pc(); 4648 4649 BLOCK_COMMENT("Entry:"); 4650 __ enter(); 4651 4652 #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 4653 const Register scale = c_rarg0; //rcx, will exchange with r9 4654 const Register objb = c_rarg1; //rdx 4655 const Register length = c_rarg2; //r8 4656 const Register obja = c_rarg3; //r9 4657 __ xchgq(obja, scale); //now obja and scale contains the correct contents 4658 4659 const Register tmp1 = r10; 4660 const Register tmp2 = r11; 4661 #endif 4662 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 4663 const Register obja = c_rarg0; //U:rdi 4664 const Register objb = c_rarg1; //U:rsi 4665 const Register length = c_rarg2; //U:rdx 4666 const Register scale = c_rarg3; //U:rcx 4667 const Register tmp1 = r8; 4668 const Register tmp2 = r9; 4669 #endif 4670 const Register result = rax; //return value 4671 const XMMRegister vec0 = xmm0; 4672 const XMMRegister vec1 = xmm1; 4673 const XMMRegister vec2 = xmm2; 4674 4675 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); 4676 4677 __ vzeroupper(); 4678 __ leave(); 4679 __ ret(0); 4680 4681 return start; 4682 } 4683 4684 /** 4685 * Arguments: 4686 * 4687 // Input: 4688 // c_rarg0 - x address 4689 // c_rarg1 - x length 4690 // c_rarg2 - z address 4691 // c_rarg3 - z lenth 4692 * 4693 */ 4694 address generate_squareToLen() { 4695 4696 __ align(CodeEntryAlignment); 4697 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4698 4699 address start = __ pc(); 4700 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 4701 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...) 4702 const Register x = rdi; 4703 const Register len = rsi; 4704 const Register z = r8; 4705 const Register zlen = rcx; 4706 4707 const Register tmp1 = r12; 4708 const Register tmp2 = r13; 4709 const Register tmp3 = r14; 4710 const Register tmp4 = r15; 4711 const Register tmp5 = rbx; 4712 4713 BLOCK_COMMENT("Entry:"); 4714 __ enter(); // required for proper stackwalking of RuntimeStub frame 4715 4716 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx 4717 // zlen => rcx 4718 // r9 and r10 may be used to save non-volatile registers 4719 __ movptr(r8, rdx); 4720 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 4721 4722 restore_arg_regs(); 4723 4724 __ leave(); // required for proper stackwalking of RuntimeStub frame 4725 __ ret(0); 4726 4727 return start; 4728 } 4729 4730 /** 4731 * Arguments: 4732 * 4733 * Input: 4734 * c_rarg0 - out address 4735 * c_rarg1 - in address 4736 * c_rarg2 - offset 4737 * c_rarg3 - len 4738 * not Win64 4739 * c_rarg4 - k 4740 * Win64 4741 * rsp+40 - k 4742 */ 4743 address generate_mulAdd() { 4744 __ align(CodeEntryAlignment); 4745 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4746 4747 address start = __ pc(); 4748 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 4749 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 4750 const Register out = rdi; 4751 const Register in = rsi; 4752 const Register offset = r11; 4753 const Register len = rcx; 4754 const Register k = r8; 4755 4756 // Next registers will be saved on stack in mul_add(). 4757 const Register tmp1 = r12; 4758 const Register tmp2 = r13; 4759 const Register tmp3 = r14; 4760 const Register tmp4 = r15; 4761 const Register tmp5 = rbx; 4762 4763 BLOCK_COMMENT("Entry:"); 4764 __ enter(); // required for proper stackwalking of RuntimeStub frame 4765 4766 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx 4767 // len => rcx, k => r8 4768 // r9 and r10 may be used to save non-volatile registers 4769 #ifdef _WIN64 4770 // last argument is on stack on Win64 4771 __ movl(k, Address(rsp, 6 * wordSize)); 4772 #endif 4773 __ movptr(r11, rdx); // move offset in rdx to offset(r11) 4774 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 4775 4776 restore_arg_regs(); 4777 4778 __ leave(); // required for proper stackwalking of RuntimeStub frame 4779 __ ret(0); 4780 4781 return start; 4782 } 4783 4784 address generate_libmExp() { 4785 StubCodeMark mark(this, "StubRoutines", "libmExp"); 4786 4787 address start = __ pc(); 4788 4789 const XMMRegister x0 = xmm0; 4790 const XMMRegister x1 = xmm1; 4791 const XMMRegister x2 = xmm2; 4792 const XMMRegister x3 = xmm3; 4793 4794 const XMMRegister x4 = xmm4; 4795 const XMMRegister x5 = xmm5; 4796 const XMMRegister x6 = xmm6; 4797 const XMMRegister x7 = xmm7; 4798 4799 const Register tmp = r11; 4800 4801 BLOCK_COMMENT("Entry:"); 4802 __ enter(); // required for proper stackwalking of RuntimeStub frame 4803 4804 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 4805 4806 __ leave(); // required for proper stackwalking of RuntimeStub frame 4807 __ ret(0); 4808 4809 return start; 4810 4811 } 4812 4813 address generate_libmLog() { 4814 StubCodeMark mark(this, "StubRoutines", "libmLog"); 4815 4816 address start = __ pc(); 4817 4818 const XMMRegister x0 = xmm0; 4819 const XMMRegister x1 = xmm1; 4820 const XMMRegister x2 = xmm2; 4821 const XMMRegister x3 = xmm3; 4822 4823 const XMMRegister x4 = xmm4; 4824 const XMMRegister x5 = xmm5; 4825 const XMMRegister x6 = xmm6; 4826 const XMMRegister x7 = xmm7; 4827 4828 const Register tmp1 = r11; 4829 const Register tmp2 = r8; 4830 4831 BLOCK_COMMENT("Entry:"); 4832 __ enter(); // required for proper stackwalking of RuntimeStub frame 4833 4834 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2); 4835 4836 __ leave(); // required for proper stackwalking of RuntimeStub frame 4837 __ ret(0); 4838 4839 return start; 4840 4841 } 4842 4843 address generate_libmLog10() { 4844 StubCodeMark mark(this, "StubRoutines", "libmLog10"); 4845 4846 address start = __ pc(); 4847 4848 const XMMRegister x0 = xmm0; 4849 const XMMRegister x1 = xmm1; 4850 const XMMRegister x2 = xmm2; 4851 const XMMRegister x3 = xmm3; 4852 4853 const XMMRegister x4 = xmm4; 4854 const XMMRegister x5 = xmm5; 4855 const XMMRegister x6 = xmm6; 4856 const XMMRegister x7 = xmm7; 4857 4858 const Register tmp = r11; 4859 4860 BLOCK_COMMENT("Entry:"); 4861 __ enter(); // required for proper stackwalking of RuntimeStub frame 4862 4863 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 4864 4865 __ leave(); // required for proper stackwalking of RuntimeStub frame 4866 __ ret(0); 4867 4868 return start; 4869 4870 } 4871 4872 address generate_libmPow() { 4873 StubCodeMark mark(this, "StubRoutines", "libmPow"); 4874 4875 address start = __ pc(); 4876 4877 const XMMRegister x0 = xmm0; 4878 const XMMRegister x1 = xmm1; 4879 const XMMRegister x2 = xmm2; 4880 const XMMRegister x3 = xmm3; 4881 4882 const XMMRegister x4 = xmm4; 4883 const XMMRegister x5 = xmm5; 4884 const XMMRegister x6 = xmm6; 4885 const XMMRegister x7 = xmm7; 4886 4887 const Register tmp1 = r8; 4888 const Register tmp2 = r9; 4889 const Register tmp3 = r10; 4890 const Register tmp4 = r11; 4891 4892 BLOCK_COMMENT("Entry:"); 4893 __ enter(); // required for proper stackwalking of RuntimeStub frame 4894 4895 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 4896 4897 __ leave(); // required for proper stackwalking of RuntimeStub frame 4898 __ ret(0); 4899 4900 return start; 4901 4902 } 4903 4904 address generate_libmSin() { 4905 StubCodeMark mark(this, "StubRoutines", "libmSin"); 4906 4907 address start = __ pc(); 4908 4909 const XMMRegister x0 = xmm0; 4910 const XMMRegister x1 = xmm1; 4911 const XMMRegister x2 = xmm2; 4912 const XMMRegister x3 = xmm3; 4913 4914 const XMMRegister x4 = xmm4; 4915 const XMMRegister x5 = xmm5; 4916 const XMMRegister x6 = xmm6; 4917 const XMMRegister x7 = xmm7; 4918 4919 const Register tmp1 = r8; 4920 const Register tmp2 = r9; 4921 const Register tmp3 = r10; 4922 const Register tmp4 = r11; 4923 4924 BLOCK_COMMENT("Entry:"); 4925 __ enter(); // required for proper stackwalking of RuntimeStub frame 4926 4927 #ifdef _WIN64 4928 __ push(rsi); 4929 __ push(rdi); 4930 #endif 4931 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 4932 4933 #ifdef _WIN64 4934 __ pop(rdi); 4935 __ pop(rsi); 4936 #endif 4937 4938 __ leave(); // required for proper stackwalking of RuntimeStub frame 4939 __ ret(0); 4940 4941 return start; 4942 4943 } 4944 4945 address generate_libmCos() { 4946 StubCodeMark mark(this, "StubRoutines", "libmCos"); 4947 4948 address start = __ pc(); 4949 4950 const XMMRegister x0 = xmm0; 4951 const XMMRegister x1 = xmm1; 4952 const XMMRegister x2 = xmm2; 4953 const XMMRegister x3 = xmm3; 4954 4955 const XMMRegister x4 = xmm4; 4956 const XMMRegister x5 = xmm5; 4957 const XMMRegister x6 = xmm6; 4958 const XMMRegister x7 = xmm7; 4959 4960 const Register tmp1 = r8; 4961 const Register tmp2 = r9; 4962 const Register tmp3 = r10; 4963 const Register tmp4 = r11; 4964 4965 BLOCK_COMMENT("Entry:"); 4966 __ enter(); // required for proper stackwalking of RuntimeStub frame 4967 4968 #ifdef _WIN64 4969 __ push(rsi); 4970 __ push(rdi); 4971 #endif 4972 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 4973 4974 #ifdef _WIN64 4975 __ pop(rdi); 4976 __ pop(rsi); 4977 #endif 4978 4979 __ leave(); // required for proper stackwalking of RuntimeStub frame 4980 __ ret(0); 4981 4982 return start; 4983 4984 } 4985 4986 address generate_libmTan() { 4987 StubCodeMark mark(this, "StubRoutines", "libmTan"); 4988 4989 address start = __ pc(); 4990 4991 const XMMRegister x0 = xmm0; 4992 const XMMRegister x1 = xmm1; 4993 const XMMRegister x2 = xmm2; 4994 const XMMRegister x3 = xmm3; 4995 4996 const XMMRegister x4 = xmm4; 4997 const XMMRegister x5 = xmm5; 4998 const XMMRegister x6 = xmm6; 4999 const XMMRegister x7 = xmm7; 5000 5001 const Register tmp1 = r8; 5002 const Register tmp2 = r9; 5003 const Register tmp3 = r10; 5004 const Register tmp4 = r11; 5005 5006 BLOCK_COMMENT("Entry:"); 5007 __ enter(); // required for proper stackwalking of RuntimeStub frame 5008 5009 #ifdef _WIN64 5010 __ push(rsi); 5011 __ push(rdi); 5012 #endif 5013 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5014 5015 #ifdef _WIN64 5016 __ pop(rdi); 5017 __ pop(rsi); 5018 #endif 5019 5020 __ leave(); // required for proper stackwalking of RuntimeStub frame 5021 __ ret(0); 5022 5023 return start; 5024 5025 } 5026 5027 #undef __ 5028 #define __ masm-> 5029 5030 // Continuation point for throwing of implicit exceptions that are 5031 // not handled in the current activation. Fabricates an exception 5032 // oop and initiates normal exception dispatching in this 5033 // frame. Since we need to preserve callee-saved values (currently 5034 // only for C2, but done for C1 as well) we need a callee-saved oop 5035 // map and therefore have to make these stubs into RuntimeStubs 5036 // rather than BufferBlobs. If the compiler needs all registers to 5037 // be preserved between the fault point and the exception handler 5038 // then it must assume responsibility for that in 5039 // AbstractCompiler::continuation_for_implicit_null_exception or 5040 // continuation_for_implicit_division_by_zero_exception. All other 5041 // implicit exceptions (e.g., NullPointerException or 5042 // AbstractMethodError on entry) are either at call sites or 5043 // otherwise assume that stack unwinding will be initiated, so 5044 // caller saved registers were assumed volatile in the compiler. 5045 address generate_throw_exception(const char* name, 5046 address runtime_entry, 5047 Register arg1 = noreg, 5048 Register arg2 = noreg) { 5049 // Information about frame layout at time of blocking runtime call. 5050 // Note that we only have to preserve callee-saved registers since 5051 // the compilers are responsible for supplying a continuation point 5052 // if they expect all registers to be preserved. 5053 enum layout { 5054 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 5055 rbp_off2, 5056 return_off, 5057 return_off2, 5058 framesize // inclusive of return address 5059 }; 5060 5061 int insts_size = 512; 5062 int locs_size = 64; 5063 5064 CodeBuffer code(name, insts_size, locs_size); 5065 OopMapSet* oop_maps = new OopMapSet(); 5066 MacroAssembler* masm = new MacroAssembler(&code); 5067 5068 address start = __ pc(); 5069 5070 // This is an inlined and slightly modified version of call_VM 5071 // which has the ability to fetch the return PC out of 5072 // thread-local storage and also sets up last_Java_sp slightly 5073 // differently than the real call_VM 5074 5075 __ enter(); // required for proper stackwalking of RuntimeStub frame 5076 5077 assert(is_even(framesize/2), "sp not 16-byte aligned"); 5078 5079 // return address and rbp are already in place 5080 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 5081 5082 int frame_complete = __ pc() - start; 5083 5084 // Set up last_Java_sp and last_Java_fp 5085 address the_pc = __ pc(); 5086 __ set_last_Java_frame(rsp, rbp, the_pc); 5087 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 5088 5089 // Call runtime 5090 if (arg1 != noreg) { 5091 assert(arg2 != c_rarg1, "clobbered"); 5092 __ movptr(c_rarg1, arg1); 5093 } 5094 if (arg2 != noreg) { 5095 __ movptr(c_rarg2, arg2); 5096 } 5097 __ movptr(c_rarg0, r15_thread); 5098 BLOCK_COMMENT("call runtime_entry"); 5099 __ call(RuntimeAddress(runtime_entry)); 5100 5101 // Generate oop map 5102 OopMap* map = new OopMap(framesize, 0); 5103 5104 oop_maps->add_gc_map(the_pc - start, map); 5105 5106 __ reset_last_Java_frame(true); 5107 5108 __ leave(); // required for proper stackwalking of RuntimeStub frame 5109 5110 // check for pending exceptions 5111 #ifdef ASSERT 5112 Label L; 5113 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), 5114 (int32_t) NULL_WORD); 5115 __ jcc(Assembler::notEqual, L); 5116 __ should_not_reach_here(); 5117 __ bind(L); 5118 #endif // ASSERT 5119 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5120 5121 5122 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5123 RuntimeStub* stub = 5124 RuntimeStub::new_runtime_stub(name, 5125 &code, 5126 frame_complete, 5127 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 5128 oop_maps, false); 5129 return stub->entry_point(); 5130 } 5131 5132 void create_control_words() { 5133 // Round to nearest, 53-bit mode, exceptions masked 5134 StubRoutines::_fpu_cntrl_wrd_std = 0x027F; 5135 // Round to zero, 53-bit mode, exception mased 5136 StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F; 5137 // Round to nearest, 24-bit mode, exceptions masked 5138 StubRoutines::_fpu_cntrl_wrd_24 = 0x007F; 5139 // Round to nearest, 64-bit mode, exceptions masked 5140 StubRoutines::_fpu_cntrl_wrd_64 = 0x037F; 5141 // Round to nearest, 64-bit mode, exceptions masked 5142 StubRoutines::_mxcsr_std = 0x1F80; 5143 // Note: the following two constants are 80-bit values 5144 // layout is critical for correct loading by FPU. 5145 // Bias for strict fp multiply/divide 5146 StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000 5147 StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000; 5148 StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff; 5149 // Un-Bias for strict fp multiply/divide 5150 StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000 5151 StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000; 5152 StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff; 5153 } 5154 5155 // Initialization 5156 void generate_initial() { 5157 // Generates all stubs and initializes the entry points 5158 5159 // This platform-specific settings are needed by generate_call_stub() 5160 create_control_words(); 5161 5162 // entry points that exist in all platforms Note: This is code 5163 // that could be shared among different platforms - however the 5164 // benefit seems to be smaller than the disadvantage of having a 5165 // much more complicated generator structure. See also comment in 5166 // stubRoutines.hpp. 5167 5168 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5169 5170 StubRoutines::_call_stub_entry = 5171 generate_call_stub(StubRoutines::_call_stub_return_address); 5172 5173 // is referenced by megamorphic call 5174 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5175 5176 // atomic calls 5177 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 5178 StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long(); 5179 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 5180 StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte(); 5181 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 5182 StubRoutines::_atomic_add_entry = generate_atomic_add(); 5183 StubRoutines::_atomic_add_long_entry = generate_atomic_add_long(); 5184 StubRoutines::_fence_entry = generate_orderaccess_fence(); 5185 5186 // platform dependent 5187 StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp(); 5188 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp(); 5189 5190 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr(); 5191 5192 // Build this early so it's available for the interpreter. 5193 StubRoutines::_throw_StackOverflowError_entry = 5194 generate_throw_exception("StackOverflowError throw_exception", 5195 CAST_FROM_FN_PTR(address, 5196 SharedRuntime:: 5197 throw_StackOverflowError)); 5198 StubRoutines::_throw_delayed_StackOverflowError_entry = 5199 generate_throw_exception("delayed StackOverflowError throw_exception", 5200 CAST_FROM_FN_PTR(address, 5201 SharedRuntime:: 5202 throw_delayed_StackOverflowError)); 5203 if (UseCRC32Intrinsics) { 5204 // set table address before stub generation which use it 5205 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; 5206 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5207 } 5208 5209 if (UseCRC32CIntrinsics) { 5210 bool supports_clmul = VM_Version::supports_clmul(); 5211 StubRoutines::x86::generate_CRC32C_table(supports_clmul); 5212 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; 5213 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); 5214 } 5215 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) { 5216 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) || 5217 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) || 5218 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 5219 StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF; 5220 StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2; 5221 StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4; 5222 StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable; 5223 StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2; 5224 StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3; 5225 StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1; 5226 StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE; 5227 StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4; 5228 StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV; 5229 StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK; 5230 StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1; 5231 StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3; 5232 StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO; 5233 } 5234 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) { 5235 StubRoutines::_dexp = generate_libmExp(); 5236 } 5237 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5238 StubRoutines::_dlog = generate_libmLog(); 5239 } 5240 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) { 5241 StubRoutines::_dlog10 = generate_libmLog10(); 5242 } 5243 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) { 5244 StubRoutines::_dpow = generate_libmPow(); 5245 } 5246 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5247 StubRoutines::_dsin = generate_libmSin(); 5248 } 5249 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5250 StubRoutines::_dcos = generate_libmCos(); 5251 } 5252 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 5253 StubRoutines::_dtan = generate_libmTan(); 5254 } 5255 } 5256 } 5257 5258 void generate_all() { 5259 // Generates all stubs and initializes the entry points 5260 5261 // These entry points require SharedInfo::stack0 to be set up in 5262 // non-core builds and need to be relocatable, so they each 5263 // fabricate a RuntimeStub internally. 5264 StubRoutines::_throw_AbstractMethodError_entry = 5265 generate_throw_exception("AbstractMethodError throw_exception", 5266 CAST_FROM_FN_PTR(address, 5267 SharedRuntime:: 5268 throw_AbstractMethodError)); 5269 5270 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5271 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5272 CAST_FROM_FN_PTR(address, 5273 SharedRuntime:: 5274 throw_IncompatibleClassChangeError)); 5275 5276 StubRoutines::_throw_NullPointerException_at_call_entry = 5277 generate_throw_exception("NullPointerException at call throw_exception", 5278 CAST_FROM_FN_PTR(address, 5279 SharedRuntime:: 5280 throw_NullPointerException_at_call)); 5281 5282 // entry points that are platform specific 5283 StubRoutines::x86::_f2i_fixup = generate_f2i_fixup(); 5284 StubRoutines::x86::_f2l_fixup = generate_f2l_fixup(); 5285 StubRoutines::x86::_d2i_fixup = generate_d2i_fixup(); 5286 StubRoutines::x86::_d2l_fixup = generate_d2l_fixup(); 5287 5288 StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF); 5289 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 5290 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); 5291 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); 5292 5293 // support for verify_oop (must happen after universe_init) 5294 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5295 5296 // arraycopy stubs used by compilers 5297 generate_arraycopy_stubs(); 5298 5299 // Load barrier stubs 5300 if (UseLoadBarrier) { 5301 address loadbarrier_address = CAST_FROM_FN_PTR(address, SharedRuntime::z_load_barrier_on_oop_field_preloaded); 5302 address loadbarrier_weak_address = CAST_FROM_FN_PTR(address, SharedRuntime::z_load_barrier_on_weak_oop_field_preloaded); 5303 5304 Register rr = as_Register(0); 5305 for (int i = 0; i < RegisterImpl::number_of_registers; i++) { 5306 if (rr != rsp) { 5307 StubRoutines::x86::_load_barrier_slow_stub[i] = generate_load_barrier_stub(rr, loadbarrier_address, false); 5308 StubRoutines::x86::_load_barrier_weak_slow_stub[i] = generate_load_barrier_stub(rr, loadbarrier_weak_address, true); 5309 5310 } else { 5311 StubRoutines::x86::_load_barrier_slow_stub[i] = (address)NULL; 5312 StubRoutines::x86::_load_barrier_weak_slow_stub[i] = (address)NULL; 5313 } 5314 rr = rr->successor(); 5315 } 5316 } else { 5317 for (int i = 0; i < RegisterImpl::number_of_registers; i++) { 5318 StubRoutines::x86::_load_barrier_slow_stub[i] = (address)NULL; 5319 StubRoutines::x86::_load_barrier_weak_slow_stub[i] = (address)NULL; 5320 } 5321 } 5322 5323 // don't bother generating these AES intrinsic stubs unless global flag is set 5324 if (UseAESIntrinsics) { 5325 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others 5326 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5327 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5328 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5329 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); 5330 } 5331 if (UseAESCTRIntrinsics){ 5332 StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); 5333 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); 5334 } 5335 5336 if (UseSHA1Intrinsics) { 5337 StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask(); 5338 StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask(); 5339 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5340 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5341 } 5342 if (UseSHA256Intrinsics) { 5343 StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256; 5344 char* dst = (char*)StubRoutines::x86::_k256_W; 5345 char* src = (char*)StubRoutines::x86::_k256; 5346 for (int ii = 0; ii < 16; ++ii) { 5347 memcpy(dst + 32 * ii, src + 16 * ii, 16); 5348 memcpy(dst + 32 * ii + 16, src + 16 * ii, 16); 5349 } 5350 StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W; 5351 StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask(); 5352 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5353 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5354 } 5355 if (UseSHA512Intrinsics) { 5356 StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W; 5357 StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512(); 5358 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 5359 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 5360 } 5361 5362 // Generate GHASH intrinsics code 5363 if (UseGHASHIntrinsics) { 5364 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); 5365 StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); 5366 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5367 } 5368 5369 // Safefetch stubs. 5370 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5371 &StubRoutines::_safefetch32_fault_pc, 5372 &StubRoutines::_safefetch32_continuation_pc); 5373 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5374 &StubRoutines::_safefetchN_fault_pc, 5375 &StubRoutines::_safefetchN_continuation_pc); 5376 #ifdef COMPILER2 5377 if (UseMultiplyToLenIntrinsic) { 5378 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5379 } 5380 if (UseSquareToLenIntrinsic) { 5381 StubRoutines::_squareToLen = generate_squareToLen(); 5382 } 5383 if (UseMulAddIntrinsic) { 5384 StubRoutines::_mulAdd = generate_mulAdd(); 5385 } 5386 #ifndef _WINDOWS 5387 if (UseMontgomeryMultiplyIntrinsic) { 5388 StubRoutines::_montgomeryMultiply 5389 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply); 5390 } 5391 if (UseMontgomerySquareIntrinsic) { 5392 StubRoutines::_montgomerySquare 5393 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); 5394 } 5395 #endif // WINDOWS 5396 #endif // COMPILER2 5397 5398 if (UseVectorizedMismatchIntrinsic) { 5399 StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch(); 5400 } 5401 } 5402 5403 public: 5404 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5405 if (all) { 5406 generate_all(); 5407 } else { 5408 generate_initial(); 5409 } 5410 } 5411 }; // end class declaration 5412 5413 void StubGenerator_generate(CodeBuffer* code, bool all) { 5414 StubGenerator g(code, all); 5415 }