/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter)           \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp):   parameter size (in words)              int
  //    24(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp):   (interpreter) entry point              address
  //    56(rbp):   parameters                             intptr_t*
  //    64(rbp):   parameter size (in words)              int
  //    72(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  // -27 [ saved xmm15          ]
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  //    Windows reserves the caller's stack space for arguments 1-4.
  //    We spill c_rarg0-c_rarg3 to this space.

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK); // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);      // parameter pointer
    __ movl(c_rarg1, c_rarg3);           // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0)); // get parameter
    __ addptr(c_rarg2, wordSize);        // advance to next parameter
    __ decrementl(c_rarg1);              // decrement counter
    __ push(rax);                        // pass parameter
    __ jcc(Assembler::notZero, loop);
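
    // In C terms the loop above performs, for n == parameter_size
    // (illustrative sketch only, not generated code):
    //
    //   for (int i = 0; i < n; i++)
    //     push(parameters[i]);        // parameters[0] ends up deepest
    //
    // leaving the Java arguments laid out as an expression stack for the
    // interpreter entry point called below.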
    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);          // get Method*
    __ movptr(c_rarg1, entry_point); // get entry_point
    __ mov(r13, rsp);                // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)__LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.
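
    // Conceptually, the code below performs (illustrative C-like sketch only):
    //
    //   address pc      = *rsp++;   // the return address becomes the throwing pc
    //   address handler = SharedRuntime::exception_handler_for_return_address(thread, pc);
    //   oop     ex      = thread->pending_exception();
    //   thread->clear_pending_exception();
    //   goto handler;               // with rax = ex, rdx = pc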

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Implementation of jint atomic_xchg(jint exchange_value, volatile jint* dest)
  // used by Atomic::xchg(volatile jint* dest, jint exchange_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0);              // Copy to rax; we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Implementation of intptr_t atomic_xchg(jlong exchange_value, volatile jlong* dest)
  // used by Atomic::xchg(volatile jlong* dest, jlong exchange_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0);              // Copy to rax; we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
  //                                           int8_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_byte() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
    address start = __ pc();

    __ movsbq(rax, c_rarg2);
    __ lock();
    __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
  //                                            volatile int64_t* dest,
  //                                            int64_t compare_value)
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Implementation of jint atomic_add(jint add_value, volatile jint* dest)
  // used by Atomic::add(volatile jint* dest, jint add_value)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);
    __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
    __ addl(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Implementation of intptr_t atomic_add(intptr_t add_value, volatile intptr_t* dest)
  // used by Atomic::add(volatile intptr_t* dest, intptr_t add_value)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess).  This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
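  //
  // In effect (illustrative sketch only): the stub returns the value rsp had
  // in the caller immediately before the call instruction, i.e. its own
  // entry rsp plus the one slot occupied by the return address:
  //
  //   previous_sp == &return_address + 1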
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize); // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK); // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0);   // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0);     // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
      // After previous pushes.
      oop_to_verify = 6 * wordSize,
      saved_rax     = 7 * wordSize,
      saved_r10     = 8 * wordSize,

      // Before the call to MacroAssembler::debug(), see below.
      return_addr = 16 * wordSize,
      error_msg   = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(rax, rax); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored

    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ hlt();
    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  - 32-bit value
  //    Rtmp  - scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count - 1]
  //
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, where
  // they are non-volatile.  r9 and r10 should not be used by the caller.
  //
  DEBUG_ONLY(bool regs_in_thread;)

  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9); // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = false;)
  }

  void restore_arg_regs() {
    assert(!regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
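  //
  // Typical pairing in the stubs that use it (illustrative sketch only,
  // not generated code):
  //
  //   setup_arg_regs_using_thread();   // Win64: spill rdi/rsi into the JavaThread,
  //                                    // load r15_thread, shuffle incoming args
  //   ... stub body, free to clobber rdi, rsi and rdx ...
  //   restore_arg_regs_using_thread(); // reload rdi/rsi, restore r15 from r9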
  void setup_arg_regs_using_thread() {
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
    __ get_thread(r15_thread);
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);

    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = true;)
  }

  void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ get_thread(r15_thread);
    __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
    __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
    __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored
#endif
  }

  // Copy big chunks forward
  //
  // Inputs:
  //   end_from       - source array end address
  //   end_to         - destination array end address
  //   qword_count    - 64-bit element count, negative
  //   to             - scratch
  //   L_copy_bytes   - entry label
  //   L_copy_8_bytes - exit label
  //
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register to,
                          Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64 bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
        __ jccb(Assembler::less, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ bind(L_loop_avx512);
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        __ bind(L_below_threshold);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
          __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
        } else {
          __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
          __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
          __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
          __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
          __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
          __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ addptr(qword_count, 8);
        __ jcc(Assembler::lessEqual, L_loop);
        __ subptr(qword_count, 4); // sub(8) and add(4)
        __ jccb(Assembler::greater, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32 bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }

  // Copy big chunks backward
  //
  // Inputs:
  //   from           - source array address
  //   dest           - destination array address
  //   qword_count    - 64-bit element count
  //   to             - scratch
  //   L_copy_bytes   - entry label
  //   L_copy_8_bytes - exit label
  //
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register to,
                           Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64 bytes per iteration
      if (UseAVX > 2) {
        Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;

        __ BIND(L_copy_bytes);
        __ cmpptr(qword_count, (AVX3Threshold / 8));
        __ jccb(Assembler::greater, L_above_threshold);
        __ jmpb(L_below_threshold);

        __ BIND(L_loop_avx512);
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
        __ bind(L_above_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx512);
        __ jmpb(L_32_byte_head);

        __ bind(L_loop_avx2);
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        __ bind(L_below_threshold);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop_avx2);

        __ bind(L_32_byte_head);
        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      } else {
        __ BIND(L_loop);
        if (UseAVX == 2) {
          __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
          __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
          __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
        } else {
          __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
          __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
          __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
          __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
          __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
          __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
          __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
          __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
        }

        __ BIND(L_copy_bytes);
        __ subptr(qword_count, 8);
        __ jcc(Assembler::greaterEqual, L_loop);

        __ addptr(qword_count, 4); // add(8) and sub(4)
        __ jccb(Assembler::less, L_end);
      }
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32 bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    Label L_copy_byte, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
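      // The forward loop biases both end pointers to the last qword copied
      // and runs a negative index up to zero.  In C terms, with pf/pt one
      // past the last qword and n the qword count (illustrative sketch only,
      // covering just the qword portion):
      //
      //   for (ptrdiff_t i = -n; i != 0; i++)
      //     pt[i] = pf[i];
      //
      // so loop termination is a simple flags test after the increment.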
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count); // make the count negative
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);

      __ addptr(end_from, 4);
      __ addptr(end_to, 4);

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jccb(Assembler::zero, L_copy_byte);
      __ movw(rax, Address(end_from, 8));
      __ movw(Address(end_to, 8), rax);

      __ addptr(end_from, 2);
      __ addptr(end_to, 2);

      // Check for and copy trailing byte
      __ BIND(L_copy_byte);
      __ testl(byte_count, 1);
      __ jccb(Assembler::zero, L_exit);
      __ movb(rax, Address(end_from, 8));
      __ movb(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
    const Register from        = rdi; // source array address
    const Register to          = rsi; // destination array address
    const Register count       = rdx; // elements count
    const Register byte_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_1);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(byte_count, count);
      __ shrptr(count, 3); // count => qword_count

      // Copy from high to low addresses.

      // Check for and copy trailing byte
      __ testl(byte_count, 1);
      __ jcc(Assembler::zero, L_copy_2_bytes);
      __ movb(rax, Address(from, byte_count, Address::times_1, -1));
      __ movb(Address(to, byte_count, Address::times_1, -1), rax);
      __ decrement(byte_count); // Adjust for possible trailing word

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(byte_count, 2);
      __ jcc(Assembler::zero, L_copy_4_bytes);
      __ movw(rax, Address(from, byte_count, Address::times_1, -2));
      __ movw(Address(to, byte_count, Address::times_1, -2), rax);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(byte_count, 4);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, qword_count, Address::times_8));
      __ movl(Address(to, qword_count, Address::times_8), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(word_count, count);
      __ shrptr(count, 2); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Original 'dest' is trashed, so we can't use it as a
      // base register for a possible trailing word copy

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(word_count, 2);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);

      __ addptr(end_from, 4);
      __ addptr(end_to, 4);

      // Check for and copy trailing word
      __ BIND(L_copy_2_bytes);
      __ testl(word_count, 1);
      __ jccb(Assembler::zero, L_exit);
      __ movw(rax, Address(end_from, 8));
      __ movw(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }

    return start;
  }

  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");
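
    // The actual loop is emitted by MacroAssembler::generate_fill below; for
    // sub-word types it first replicates the fill value to 32 bits, roughly
    // (illustrative sketch of the assumed replication, not the emitted
    // instruction sequence):
    //
    //   uint32_t v = value;
    //   if (t == T_BYTE)  { v &= 0xff;   v |= v << 8;  v |= v << 16; }
    //   if (t == T_SHORT) { v &= 0xffff; v |= v << 16; }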
    const Register to    = c_rarg0; // destination array address
    const Register value = c_rarg1; // value
    const Register count = c_rarg2; // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register word_count  = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_2);
    setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                      // r9 and r10 may be used to save non-volatile registers

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(word_count, count);
      __ shrptr(count, 2); // count => qword_count

      // Copy from high to low addresses.  Use 'to' as scratch.
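
      // In C terms (with from16/to16 and from64/to64 as suitably typed views
      // of the arrays, names illustrative only), the backward decomposition
      // below is roughly:
      //
      //   if (word_count & 1)                 // trailing word, highest address
      //     to16[word_count - 1] = from16[word_count - 1];
      //   if (word_count & 2)                 // trailing dword, just above the qwords
      //     *(int32_t*)(to + 8 * qword_count) = *(int32_t*)(from + 8 * qword_count);
      //   for (ptrdiff_t i = qword_count; i > 0; i--)
      //     to64[i - 1] = from64[i - 1];      // whole qwords, high to low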

      // Check for and copy trailing word
      __ testl(word_count, 1);
      __ jccb(Assembler::zero, L_copy_4_bytes);
      __ movw(rax, Address(from, word_count, Address::times_2, -2));
      __ movw(Address(to, word_count, Address::times_2, -2), rax);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(word_count, 2);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, qword_count, Address::times_8));
      __ movl(Address(to, qword_count, Address::times_8), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                         const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;
    const Register end_from    = from; // source array end address
    const Register end_to      = to;   // destination array end address
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
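
    // assert_clean_int checks (in debug builds) that the 32-bit count was
    // properly extended to 64 bits by the caller; the assumed semantics,
    // roughly, are:
    //
    //   assert((int64_t)(int32_t)count == (int64_t)count, "high 32 bits dirty");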

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_INT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(dword_count, count);
      __ shrptr(count, 1); // count => qword_count

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);

      // Check for and copy trailing dword
      __ BIND(L_copy_4_bytes);
      __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
      __ jccb(Assembler::zero, L_exit);
      __ movl(rax, Address(end_from, 8));
      __ movl(Address(end_to, 8), rax);
    }
    __ BIND(L_exit);
    address ucme_exit_pc = __ pc();
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
      __ jmp(L_copy_4_bytes);
    }

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                         address *entry, const char *name,
                                         bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register count       = rdx;  // elements count
    const Register dword_count = rcx;
    const Register qword_count = count;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_4);
    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_INT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // no registers are destroyed by this call
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    assert_clean_int(count, rax); // Make sure 'count' is clean int.
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // 'from', 'to' and 'count' are now valid
      __ movptr(dword_count, count);
      __ shrptr(count, 1); // count => qword_count

      // Copy from high to low addresses.  Use 'to' as scratch.
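
      // In C terms (with from32/to32 and from64/to64 as suitably typed views
      // of the arrays, names illustrative only), the backward decomposition
      // below is roughly:
      //
      //   if (dword_count & 1)                // trailing dword, highest address
      //     to32[dword_count - 1] = from32[dword_count - 1];
      //   for (ptrdiff_t i = qword_count; i > 0; i--)
      //     to64[i - 1] = from64[i - 1];      // whole qwords, high to low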

      // Check for and copy trailing dword
      __ testl(dword_count, 1);
      __ jcc(Assembler::zero, L_copy_bytes);
      __ movl(rax, Address(from, dword_count, Address::times_4, -4));
      __ movl(Address(to, dword_count, Address::times_4, -4), rax);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    }
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }

    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
    restore_arg_regs_using_thread();
    inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
    __ xorptr(rax, rax); // return 0
    __ vzeroupper();
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi;  // source array address
    const Register to          = rsi;  // destination array address
    const Register qword_count = rdx;  // elements count
    const Register end_from    = from; // source array end address
    const Register end_to      = rcx;  // destination array end address
    const Register saved_count = r11;
    // End pointers are inclusive, and if count is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // Save no-overlap entry point for generate_conjoint_long_oop_copy()
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
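
    // Note: unlike the byte/short/int variants there is no sub-qword tail to
    // peel here, since the 8-byte element size already equals the copy unit;
    // the incoming element count is used directly as 'qword_count'.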

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
    // 'from', 'to' and 'qword_count' are now valid

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_LONG;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

      // Copy from low to high addresses.  Use 'to' as scratch.
      __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
      __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
      __ negptr(qword_count);
      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
      __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
      __ increment(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs_using_thread();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ vzeroupper();
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }

    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
    restore_arg_regs_using_thread();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
                                          address nooverlap_target, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_bytes, L_copy_8_bytes, L_exit;
    const Register from        = rdi; // source array address
    const Register to          = rsi; // destination array address
    const Register qword_count = rdx; // elements count
    const Register saved_count = rcx;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, Address::times_8);
    setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
    // 'from', 'to' and 'qword_count' are now valid

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BasicType type = is_oop ? T_OBJECT : T_LONG;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);

      __ jmp(L_copy_bytes);

      // Copy trailing qwords
      __ BIND(L_copy_8_bytes);
      __ movq(rax, Address(from, qword_count, Address::times_8, -8));
      __ movq(Address(to, qword_count, Address::times_8, -8), rax);
      __ decrement(qword_count);
      __ jcc(Assembler::notZero, L_copy_8_bytes);
    }
    if (is_oop) {
      __ jmp(L_exit);
    } else {
      restore_arg_regs_using_thread();
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
      __ xorptr(rax, rax); // return 0
      __ vzeroupper();
      __ leave(); // required for proper stackwalking of RuntimeStub frame
      __ ret(0);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
      // Copy in multi-byte chunks
      copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
    }
    __ BIND(L_exit);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
    restore_arg_regs_using_thread();
    if (is_oop) {
      inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
    } else {
      inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
    }
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Helper for generating a dynamic type check.
  // Smashes no registers.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  // not Win64
  //   c_rarg4   - oop ckval (super_klass)
  // Win64
  //   rsp+40    - oop ckval (super_klass)
  //
  // Output:
  //   rax ==  0   - success
  //   rax == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done;

    // Input registers (after setup_arg_regs)
    const Register from   = rdi; // source array address
    const Register to     = rsi; // destination array address
    const Register length = rdx; // elements count
    const Register ckoff  = rcx; // super_check_offset
    const Register ckval  = r8;  // super_klass

    // Registers used as temps (r13, r14 are save-on-entry)
    const Register end_from   = from; // source array end address
    const Register end_to     = r13;  // destination array end address
    const Register count      = rdx;  // -(count_remaining)
    const Register r14_length = r14;  // saved copy of length
    // End pointers are inclusive, and if length is not zero they point
    // to the last unit copied:  end_to[0] := end_from[0]

    const Register rax_oop   = rax; // actual oop copied
    const Register r11_klass = r11; // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
                       // ckoff => rcx, ckval => r8
                       // r9 and r10 may be used to save non-volatile registers
#ifdef _WIN64
    // last argument (#4) is on stack on Win64
    __ movptr(ckval, Address(rsp, 6 * wordSize));
#endif

    // Caller of this entry point must set up the argument registers.
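    // Concretely (from the register assignments above): 'from', 'to' and
    // 'length' must be live in rdi, rsi and rdx, with ckoff in rcx and ckval
    // in r8, since a direct jump to the published entry skips the Win64
    // stack spill of ckval above.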
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // allocate spill slots for r13, r14
    enum {
      saved_r13_offset,
      saved_r14_offset,
      saved_r10_offset,
      saved_rbp_offset
    };
    __ subptr(rsp, saved_rbp_offset * wordSize);
    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
    __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);

#ifdef ASSERT
    Label L2;
    __ get_thread(r14);
    __ cmpptr(r15_thread, r14);
    __ jcc(Assembler::equal, L2);
    __ stop("StubRoutines::call_stub: r15_thread is modified by call");
    __ bind(L2);
#endif // ASSERT

    // check that int operands are properly extended to size_t
    assert_clean_int(length, rax);
    assert_clean_int(ckoff, rax);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ cmpl(ckoff, Address(ckval, sco_offset));
      __ jcc(Assembler::equal, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    // Loop-invariant addresses.  They are exclusive end pointers.
    Address end_from_addr(from, length, TIMES_OOP, 0);
    Address end_to_addr(to, length, TIMES_OOP, 0);
    // Loop-variant addresses.  They assume post-incremented count < 0.
    Address from_element_addr(end_from, count, TIMES_OOP, 0);
    Address to_element_addr(end_to, count, TIMES_OOP, 0);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BasicType type = T_OBJECT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    // Copy from low to high addresses, indexed from the end of each array.
    __ lea(end_from, end_from_addr);
    __ lea(end_to, end_to_addr);
    __ movptr(r14_length, length); // save a copy of the length
    assert(length == count, "");   // else fix next line:
    __ negptr(count); // negate and test the length
    __ jcc(Assembler::notZero, L_load_element);

    // Empty array:  Nothing to do.
    __ xorptr(rax, rax); // return 0 on (trivial) success
    __ jmp(L_done);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (count = -count; count != 0; count++)
    // Base pointers src, dst are biased by 8*(count-1), to last element.
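    //
    // A C-level sketch of the rotated loop below; 'is_subtype' stands in for
    // the fast/slow-path check emitted by generate_type_check and is an
    // illustrative name, not HotSpot API:
    //
    //   for (count = -count; count != 0; count++) {
    //     oop el = end_from[count];                        // L_load_element
    //     if (el != NULL && !is_subtype(el->klass, ckval))
    //       break;                                         // partial transfer, report -1^K
    //     end_to[count] = el;                              // L_store_element
    //   }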
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW); // store the oop
    __ increment(count); // increment the count toward zero
    __ jcc(Assembler::zero, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
    __ testptr(rax_oop, rax_oop);
    __ jcc(Assembler::zero, L_store_element);

    __ load_klass(r11_klass, rax_oop); // query the object klass
    generate_type_check(r11_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
    // Emit GC store barriers for the oops we have copied (r14 + rdx),
    // and report their number to the caller.
    assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
    Label L_post_barrier;
    __ addptr(r14_length, count); // K = (original - remaining) oops
    __ movptr(rax, r14_length);   // save the value
    __ notptr(rax); // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

    // Come here on success only.
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax); // return 0 on success

    __ BIND(L_post_barrier);
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);

    // Common exit point (success or failure).
    __ BIND(L_done);
    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
    __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
    restore_arg_regs();
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry, address short_copy_entry,
                               address int_copy_entry, address long_copy_entry) {

    Label L_long_aligned, L_int_aligned, L_short_aligned;

    // Input registers (before setup_arg_regs)
    const Register from = c_rarg0; // source array address
    const Register to   = c_rarg1; // destination array address
    const Register size = c_rarg2; // byte count (size_t)

    // Register used as a temp
    const Register bits = rax; // test copy of low bits

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, size);

    __ testb(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);

    __ testb(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);

    __ testb(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ shrptr(size, LogBytesPerShort); // size => short_count
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_int_aligned);
    __ shrptr(size, LogBytesPerInt); // size => int_count
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_long_aligned);
    __ shrptr(size, LogBytesPerLong); // size => qword_count
    __ jump(RuntimeAddress(long_copy_entry));

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ movl(temp, length);
    __ addl(temp, src_pos); // src_pos + length
    __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ movl(temp, length);
    __ addl(temp, dst_pos); // dst_pos + length
    __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
    // Move with sign extension can be used since they are positive.
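
    // For reference, the two checks above in C form; the unsigned compare
    // (jcc above) makes a negative 32-bit sum fail as well:
    //
    //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto fail;
    //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto fail;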
    __ movslq(src_pos, src_pos);
    __ movslq(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  //
  // Generate generic array copy stubs
  //
  // Input:
  //   c_rarg0   - src oop
  //   c_rarg1   - src_pos (32-bits)
  //   c_rarg2   - dst oop
  //   c_rarg3   - dst_pos (32-bits)
  // not Win64
  //   c_rarg4   - element count (32-bits)
  // Win64
  //   rsp+40    - element count (32-bits)
  //
  // Output:
  //   rax ==  0   - success
  //   rax == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src     = c_rarg0; // source array oop
    const Register src_pos = c_rarg1; // source position
    const Register dst     = c_rarg2; // destination array oop
    const Register dst_pos = c_rarg3; // destination position
#ifndef _WIN64
    const Register length  = c_rarg4;
#else
    const Address  length(rsp, 6 * wordSize); // elements count is on stack on Win64
#endif

    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
    StubCodeMark mark(this, "StubRoutines", name);

    // Short-hop target to L_failed.  Makes for denser prologue code.
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    // if (src == NULL) return -1;
    __ testptr(src, src); // src oop
    size_t j1off = __ offset();
    __ jccb(Assembler::zero, L_failed_0);

    // if (src_pos < 0) return -1;
    __ testl(src_pos, src_pos); // src_pos (32-bits)
    __ jccb(Assembler::negative, L_failed_0);

    // if (dst == NULL) return -1;
    __ testptr(dst, dst); // dst oop
    __ jccb(Assembler::zero, L_failed_0);

    // if (dst_pos < 0) return -1;
    __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
    size_t j4off = __ offset();
    __ jccb(Assembler::negative, L_failed_0);

    // The first four tests are very dense code,
    // but not quite dense enough to put four
    // jumps in a 16-byte instruction fetch buffer.
    // That's good, because some branch predictors
    // do not like jumps so close together.
    // Make sure of this.
    guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");

    // registers used as temp
    const Register r11_length    = r11; // elements count to copy
    const Register r10_src_klass = r10; // array klass

    // if (length < 0) return -1;
    __ movl(r11_length, length); // length (elements count, 32-bits value)
    __ testl(r11_length, r11_length);
    __ jccb(Assembler::negative, L_failed_0);

    __ load_klass(r10_src_klass, src);
#ifdef ASSERT
    // assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ testptr(r10_src_klass, r10_src_klass);
      __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rax, dst);
      __ cmpq(rax, 0);
      __ jcc(Assembler::equal, L1); // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
    __ jcc(Assembler::equal, L_objArray);

    // if (src->klass() != dst->klass()) return -1;
    __ load_klass(rax, dst);
    __ cmpq(r10_src_klass, rax);
    __ jcc(Assembler::notEqual, L_failed);

    const Register rax_lh = rax; // layout helper
    __ movl(rax_lh, Address(r10_src_klass, lh_offset));

    // if (!src->is_Array()) return -1;
    __ cmpl(rax_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed);

    // At this point, it is known to be a typeArray (array_tag 0x3).
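    //
    // For reference, the layout helper decode performed further below
    // corresponds roughly to this C form (same shifts and masks as the
    // assembler uses, per the field diagram above):
    //
    //   int hsize  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int elsize = lh & Klass::_lh_log2_element_size_mask;  // log2(element size)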
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           r10, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register r10_offset = r10;    // array offset
    const Register rax_elsize = rax_lh; // element size

    __ movl(r10_offset, rax_lh);
    __ shrl(r10_offset, Klass::_lh_header_size_shift);
    __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
    __ addptr(src, r10_offset); // src array offset
    __ addptr(dst, r10_offset); // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");
    __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize

    // The next registers should be set before the jump to the corresponding stub.
    const Register from  = c_rarg0; // source array address
    const Register to    = c_rarg1; // destination array address
    const Register count = c_rarg2; // elements count

    // 'from', 'to' and 'count' registers should be set in this order
    // since they are the same as 'src', 'src_pos', 'dst'.

    __ BIND(L_copy_bytes);
    __ cmpl(rax_elsize, 0);
    __ jccb(Assembler::notEqual, L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::times_1, 0)); // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_1, 0)); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(byte_copy_entry));

    __ BIND(L_copy_shorts);
    __ cmpl(rax_elsize, LogBytesPerShort);
    __ jccb(Assembler::notEqual, L_copy_ints);
    __ lea(from, Address(src, src_pos, Address::times_2, 0)); // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_2, 0)); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_copy_ints);
    __ cmpl(rax_elsize, LogBytesPerInt);
    __ jccb(Assembler::notEqual, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::times_4, 0)); // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_4, 0)); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ cmpl(rax_elsize, LogBytesPerLong);
      __ jcc(Assembler::equal, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::times_8, 0)); // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::times_8, 0)); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ jump(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
    __ BIND(L_objArray);
    // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(rax, dst);
    __ cmpq(r10_src_klass, rax); // usual case is exact equality
    __ jcc(Assembler::notEqual, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                           r10, L_failed);

    __ lea(from, Address(src, src_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                         arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
    __ movl2ptr(count, r11_length); // length
    __ BIND(L_plain_copy);
    __ jump(RuntimeAddress(oop_copy_entry));

    __ BIND(L_checkcast_copy);
    // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ cmpl(Address(rax, lh_offset), objArray_lh);
      __ jcc(Assembler::notEqual, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                             rax, L_failed);

      const Register r11_dst_klass = r11;
      __ load_klass(r11_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, TIMES_OOP,
                           arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                           arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movl(count, length); // length (reloaded)
      Register sco_temp = c_rarg3; // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 r11_dst_klass, r10_src_klass);
      assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);
      generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
      __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      setup_arg_regs(4);
      __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
      __ jump(RuntimeAddress(checkcast_copy_entry));
    }

    __ BIND(L_failed);
    __ xorptr(rax, rax);
    __ notptr(rax); // return -1
    __ leave();     // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_data_cache_writeback() {
    const Register src = c_rarg0; // source address

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");

    address start = __ pc();
    __ enter();
    __ cache_wb(Address(src, 0));
    __ leave();
    __ ret(0);

    return start;
  }

  address generate_data_cache_writeback_sync() {
    const Register is_pre = c_rarg0; // pre or post sync

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");

    // pre wbsync is a no-op
    // post wbsync translates to an sfence

    Label skip;
    address start = __ pc();
    __ enter();
    __ cmpl(is_pre, 0);
    __ jcc(Assembler::notEqual, skip);
    __ cache_wbsync(false);
    __ bind(skip);
    __ leave();
    __ ret(0);

    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
                                                                           "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
                                                                           "jbyte_arraycopy");

    StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
                                                                            "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
                                                                            "jshort_arraycopy");

    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
                                                                              "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
                                                                              &entry_jint_arraycopy, "jint_arraycopy");

    StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
                                                                               "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
                                                                               &entry_jlong_arraycopy, "jlong_arraycopy");


    if (UseCompressedOops) {
      StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                    "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy                 = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                    &entry_oop_arraycopy, "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
                                                                                    "oop_disjoint_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
      StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_int_oop_copy(false, true, entry,
                                                                                    NULL, "oop_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
    } else {
      StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                     "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy                 = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                     &entry_oop_arraycopy, "oop_arraycopy");
      StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
                                                                                     "oop_disjoint_arraycopy_uninit",
                                                                                     /*dest_uninitialized*/true);
      StubRoutines::_oop_arraycopy_uninit          = generate_conjoint_long_oop_copy(false, true, entry,
                                                                                     NULL, "oop_arraycopy_uninit",
                                                                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
                                                            entry_jbyte_arraycopy,
                                                            entry_jshort_arraycopy,
                                                            entry_jint_arraycopy,
                                                            entry_jlong_arraycopy);
    StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                             entry_jbyte_arraycopy,
                                                             entry_jshort_arraycopy,
                                                             entry_jint_arraycopy,
                                                             entry_oop_arraycopy,
                                                             entry_jlong_arraycopy,
                                                             entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
    StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");

    // We don't generate specialized code for HeapWord-aligned source
    // arrays, so just use the code we've already generated
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
    StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;

    StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
    StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;

    StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;

    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
    StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;

    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_oop_arraycopy_uninit          = StubRoutines::_oop_arraycopy_uninit;
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64(0x0405060700010203, relocInfo::none);
    __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
    return start;
  }

  address generate_counter_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
    address start = __ pc();
    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
    __ emit_data64(0x0001020304050607, relocInfo::none);
    return start;
  }
  // Utility routine for loading a 128-bit key word in little-endian format;
  // can optionally specify that the shuffle mask is already in an xmm register.
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }

  // Utility routine for increasing a 128-bit counter (the iv in CTR mode)
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrq(reg, xmmdst, 0x0);
    __ addq(reg, inc_delta);
    __ pinsrq(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry
    __ pextrq(reg, xmmdst, 0x01); // Carry
    __ addq(reg, 0x01);
    __ pinsrq(xmmdst, reg, 0x01); // Carry end
    __ BIND(next_block);          // next instruction
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
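
    // The expanded key length selects the AES variant: 44/52/60 int words
    // correspond to AES-128/192/256 with 10/12/14 rounds respectively, which
    // is why the two cmpl(keylen, ...) checks below can branch to L_doLast
    // early for the shorter key schedules.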
    const XMMRegister xmm_temp1 = xmm2;
    const XMMRegister xmm_temp2 = xmm3;
    const XMMRegister xmm_temp3 = xmm4;
    const XMMRegister xmm_temp4 = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input

    // For encryption, the Java expanded key ordering is just what we need;
    // we don't know if the key is aligned, hence not using load-execute form

    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result); // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register keylen = rax;

    const XMMRegister xmm_result        = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
3267 const XMMRegister xmm_temp1 = xmm2; 3268 const XMMRegister xmm_temp2 = xmm3; 3269 const XMMRegister xmm_temp3 = xmm4; 3270 const XMMRegister xmm_temp4 = xmm5; 3271 3272 __ enter(); // required for proper stackwalking of RuntimeStub frame 3273 3274 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3275 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3276 3277 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3278 __ movdqu(xmm_result, Address(from, 0)); 3279 3280 // for decryption java expanded key ordering is rotated one position from what we want 3281 // so we start from 0x10 here and hit 0x00 last 3282 // we don't know if the key is aligned, hence not using load-execute form 3283 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3284 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3285 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3286 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3287 3288 __ pxor (xmm_result, xmm_temp1); 3289 __ aesdec(xmm_result, xmm_temp2); 3290 __ aesdec(xmm_result, xmm_temp3); 3291 __ aesdec(xmm_result, xmm_temp4); 3292 3293 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3294 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3295 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3296 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3297 3298 __ aesdec(xmm_result, xmm_temp1); 3299 __ aesdec(xmm_result, xmm_temp2); 3300 __ aesdec(xmm_result, xmm_temp3); 3301 __ aesdec(xmm_result, xmm_temp4); 3302 3303 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3304 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3305 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); 3306 3307 __ cmpl(keylen, 44); 3308 __ jccb(Assembler::equal, L_doLast); 3309 3310 __ aesdec(xmm_result, xmm_temp1); 3311 __ aesdec(xmm_result, xmm_temp2); 3312 3313 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3314 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3315 3316 __ cmpl(keylen, 52); 3317 __ jccb(Assembler::equal, L_doLast); 3318 3319 __ aesdec(xmm_result, xmm_temp1); 3320 __ aesdec(xmm_result, xmm_temp2); 3321 3322 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3323 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3324 3325 __ BIND(L_doLast); 3326 __ aesdec(xmm_result, xmm_temp1); 3327 __ aesdec(xmm_result, xmm_temp2); 3328 3329 // for decryption the aesdeclast operation is always on key+0x00 3330 __ aesdeclast(xmm_result, xmm_temp3); 3331 __ movdqu(Address(to, 0), xmm_result); // store the result 3332 __ xorptr(rax, rax); // return 0 3333 __ leave(); // required for proper stackwalking of RuntimeStub frame 3334 __ ret(0); 3335 3336 return start; 3337 } 3338 3339 3340 // Arguments: 3341 // 3342 // Inputs: 3343 // c_rarg0 - source byte array address 3344 // c_rarg1 - destination byte array address 3345 // c_rarg2 - K (key) in little endian int array 3346 // c_rarg3 - r vector byte array address 3347 // c_rarg4 - input length 3348 // 3349 // Output: 3350 // rax - input length 3351 // 3352 address generate_cipherBlockChaining_encryptAESCrypt() { 3353 assert(UseAES, "need AES instructions and misaligned SSE support"); 3354 __ align(CodeEntryAlignment); 3355 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3356 address start = __ pc(); 3357 3358 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 3359 const Register from = c_rarg0; // source array address 3360 const Register to = c_rarg1; 
// destination array address 3361 const Register key = c_rarg2; // key array address 3362 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3363 // and left with the results of the last encryption block 3364 #ifndef _WIN64 3365 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3366 #else 3367 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3368 const Register len_reg = r11; // pick the volatile windows register 3369 #endif 3370 const Register pos = rax; 3371 3372 // xmm register assignments for the loops below 3373 const XMMRegister xmm_result = xmm0; 3374 const XMMRegister xmm_temp = xmm1; 3375 // keys 0-10 preloaded into xmm2-xmm12 3376 const int XMM_REG_NUM_KEY_FIRST = 2; 3377 const int XMM_REG_NUM_KEY_LAST = 15; 3378 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3379 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); 3380 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); 3381 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); 3382 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); 3383 3384 __ enter(); // required for proper stackwalking of RuntimeStub frame 3385 3386 #ifdef _WIN64 3387 // on win64, fill len_reg from stack position 3388 __ movl(len_reg, len_mem); 3389 #else 3390 __ push(len_reg); // Save 3391 #endif 3392 3393 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 3394 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3395 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 3396 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { 3397 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3398 offset += 0x10; 3399 } 3400 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 3401 3402 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3403 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3404 __ cmpl(rax, 44); 3405 __ jcc(Assembler::notEqual, L_key_192_256); 3406 3407 // 128 bit code follows here 3408 __ movptr(pos, 0); 3409 __ align(OptoLoopAlignment); 3410 3411 __ BIND(L_loopTop_128); 3412 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3413 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3414 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3415 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { 3416 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3417 } 3418 __ aesenclast(xmm_result, xmm_key10); 3419 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3420 // no need to store r to memory until we exit 3421 __ addptr(pos, AESBlockSize); 3422 __ subptr(len_reg, AESBlockSize); 3423 __ jcc(Assembler::notEqual, L_loopTop_128); 3424 3425 __ BIND(L_exit); 3426 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 3427 3428 #ifdef _WIN64 3429 __ movl(rax, len_mem); 3430 #else 3431 __ pop(rax); // return length 3432 #endif 3433 __ leave(); // required for proper stackwalking of RuntimeStub frame 3434 __ ret(0); 3435 3436 __ BIND(L_key_192_256); 3437 // here rax = len in ints of AESCrypt.KLE 
array (52=192, or 60=256) 3438 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); 3439 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); 3440 __ cmpl(rax, 52); 3441 __ jcc(Assembler::notEqual, L_key_256); 3442 3443 // 192-bit code follows here (could be changed to use more xmm registers) 3444 __ movptr(pos, 0); 3445 __ align(OptoLoopAlignment); 3446 3447 __ BIND(L_loopTop_192); 3448 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3449 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3450 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3451 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { 3452 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3453 } 3454 __ aesenclast(xmm_result, xmm_key12); 3455 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3456 // no need to store r to memory until we exit 3457 __ addptr(pos, AESBlockSize); 3458 __ subptr(len_reg, AESBlockSize); 3459 __ jcc(Assembler::notEqual, L_loopTop_192); 3460 __ jmp(L_exit); 3461 3462 __ BIND(L_key_256); 3463 // 256-bit code follows here (could be changed to use more xmm registers) 3464 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); 3465 __ movptr(pos, 0); 3466 __ align(OptoLoopAlignment); 3467 3468 __ BIND(L_loopTop_256); 3469 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3470 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3471 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3472 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { 3473 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3474 } 3475 load_key(xmm_temp, key, 0xe0); 3476 __ aesenclast(xmm_result, xmm_temp); 3477 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3478 // no need to store r to memory until we exit 3479 __ addptr(pos, AESBlockSize); 3480 __ subptr(len_reg, AESBlockSize); 3481 __ jcc(Assembler::notEqual, L_loopTop_256); 3482 __ jmp(L_exit); 3483 3484 return start; 3485 } 3486 3487 // Safefetch stubs. 3488 void generate_safefetch(const char* name, int size, address* entry, 3489 address* fault_pc, address* continuation_pc) { 3490 // safefetch signatures: 3491 // int SafeFetch32(int* adr, int errValue); 3492 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3493 // 3494 // arguments: 3495 // c_rarg0 = adr 3496 // c_rarg1 = errValue 3497 // 3498 // result: 3499 // rax = *adr or errValue 3500 3501 StubCodeMark mark(this, "StubRoutines", name); 3502 3503 // Entry point, pc or function descriptor. 3504 *entry = __ pc(); 3505 3506 // Load *adr into c_rarg1, may fault.
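// (Usage sketch: C++ callers do  int v = SafeFetch32(adr, -1);  if the
// movl/movq below faults, the VM signal handler recognizes the pc recorded
// in *fault_pc and resumes at *continuation_pc, so the stub returns
// errValue instead of crashing.)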
3507 *fault_pc = __ pc(); 3508 switch (size) { 3509 case 4: 3510 // int32_t 3511 __ movl(c_rarg1, Address(c_rarg0, 0)); 3512 break; 3513 case 8: 3514 // int64_t 3515 __ movq(c_rarg1, Address(c_rarg0, 0)); 3516 break; 3517 default: 3518 ShouldNotReachHere(); 3519 } 3520 3521 // return errValue or *adr 3522 *continuation_pc = __ pc(); 3523 __ movq(rax, c_rarg1); 3524 __ ret(0); 3525 } 3526 3527 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 3528 // to hide instruction latency 3529 // 3530 // Arguments: 3531 // 3532 // Inputs: 3533 // c_rarg0 - source byte array address 3534 // c_rarg1 - destination byte array address 3535 // c_rarg2 - K (key) in little endian int array 3536 // c_rarg3 - r vector byte array address 3537 // c_rarg4 - input length 3538 // 3539 // Output: 3540 // rax - input length 3541 // 3542 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3543 assert(UseAES, "need AES instructions and misaligned SSE support"); 3544 __ align(CodeEntryAlignment); 3545 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3546 address start = __ pc(); 3547 3548 const Register from = c_rarg0; // source array address 3549 const Register to = c_rarg1; // destination array address 3550 const Register key = c_rarg2; // key array address 3551 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3552 // and left with the results of the last encryption block 3553 #ifndef _WIN64 3554 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3555 #else 3556 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3557 const Register len_reg = r11; // pick the volatile windows register 3558 #endif 3559 const Register pos = rax; 3560 3561 const int PARALLEL_FACTOR = 4; 3562 const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256 3563 3564 Label L_exit; 3565 Label L_singleBlock_loopTopHead[3]; // 128, 192, 256 3566 Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256 3567 Label L_singleBlock_loopTop[3]; // 128, 192, 256 3568 Label L_multiBlock_loopTopHead[3]; // 128, 192, 256 3569 Label L_multiBlock_loopTop[3]; // 128, 192, 256 3570 3571 // keys 0-10 preloaded into xmm5-xmm15 3572 const int XMM_REG_NUM_KEY_FIRST = 5; 3573 const int XMM_REG_NUM_KEY_LAST = 15; 3574 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3575 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3576 3577 __ enter(); // required for proper stackwalking of RuntimeStub frame 3578 3579 #ifdef _WIN64 3580 // on win64, fill len_reg from stack position 3581 __ movl(len_reg, len_mem); 3582 #else 3583 __ push(len_reg); // Save 3584 #endif 3585 __ push(rbx); 3586 // the java expanded key ordering is rotated one position from what we want 3587 // so we start from 0x10 here and hit 0x00 last 3588 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 3589 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3590 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 3591 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { 3592 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3593 offset += 0x10; 3594 } 3595 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); 3596 3597 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block 3598 3599 // registers holding the four results in 
the parallelized loop 3600 const XMMRegister xmm_result0 = xmm0; 3601 const XMMRegister xmm_result1 = xmm2; 3602 const XMMRegister xmm_result2 = xmm3; 3603 const XMMRegister xmm_result3 = xmm4; 3604 3605 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 3606 3607 __ xorptr(pos, pos); 3608 3609 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3610 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3611 __ cmpl(rbx, 52); 3612 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]); 3613 __ cmpl(rbx, 60); 3614 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]); 3615 3616 #define DoFour(opc, src_reg) \ 3617 __ opc(xmm_result0, src_reg); \ 3618 __ opc(xmm_result1, src_reg); \ 3619 __ opc(xmm_result2, src_reg); \ 3620 __ opc(xmm_result3, src_reg); \ 3621 3622 for (int k = 0; k < 3; ++k) { 3623 __ BIND(L_multiBlock_loopTopHead[k]); 3624 if (k != 0) { 3625 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3626 __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]); 3627 } 3628 if (k == 1) { 3629 __ subptr(rsp, 6 * wordSize); 3630 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3631 load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0 3632 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3633 load_key(xmm1, key, 0xc0); // 0xc0; 3634 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3635 } else if (k == 2) { 3636 __ subptr(rsp, 10 * wordSize); 3637 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3638 load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0 3639 __ movdqu(Address(rsp, 6 * wordSize), xmm15); 3640 load_key(xmm1, key, 0xe0); // 0xe0; 3641 __ movdqu(Address(rsp, 8 * wordSize), xmm1); 3642 load_key(xmm15, key, 0xb0); // 0xb0; 3643 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3644 load_key(xmm1, key, 0xc0); // 0xc0; 3645 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3646 } 3647 __ align(OptoLoopAlignment); 3648 __ BIND(L_multiBlock_loopTop[k]); 3649 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3650 __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]); 3651 3652 if (k != 0) { 3653 __ movdqu(xmm15, Address(rsp, 2 * wordSize)); 3654 __ movdqu(xmm1, Address(rsp, 4 * wordSize)); 3655 } 3656 3657 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers 3658 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3659 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3660 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 3661 3662 DoFour(pxor, xmm_key_first); 3663 if (k == 0) { 3664 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) { 3665 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3666 } 3667 DoFour(aesdeclast, xmm_key_last); 3668 } else if (k == 1) { 3669 for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) { 3670 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3671 } 3672 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3673 DoFour(aesdec, xmm1); // key : 0xc0 3674 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3675 DoFour(aesdeclast, xmm_key_last); 3676 } else if (k == 2) { 3677 for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) { 3678 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3679 } 3680 DoFour(aesdec, xmm1); // key : 0xc0 3681 __ movdqu(xmm15, Address(rsp, 6 * wordSize)); 3682 __ movdqu(xmm1, Address(rsp, 8 * wordSize)); 3683 DoFour(aesdec, xmm15); // key : 0xd0 3684 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. 3685 DoFour(aesdec, xmm1); // key : 0xe0 3686 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3687 DoFour(aesdeclast, xmm_key_last); 3688 } 3689 3690 // for each result, xor with the r vector of previous cipher block 3691 __ pxor(xmm_result0, xmm_prev_block_cipher); 3692 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 3693 __ pxor(xmm_result1, xmm_prev_block_cipher); 3694 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3695 __ pxor(xmm_result2, xmm_prev_block_cipher); 3696 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3697 __ pxor(xmm_result3, xmm_prev_block_cipher); 3698 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks 3699 if (k != 0) { 3700 __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher); 3701 } 3702 3703 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 3704 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 3705 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 3706 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 3707 3708 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); 3709 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); 3710 __ jmp(L_multiBlock_loopTop[k]); 3711 3712 // registers used in the non-parallelized loops 3713 // xmm register assignments for the loops below 3714 const XMMRegister xmm_result = xmm0; 3715 const XMMRegister xmm_prev_block_cipher_save = xmm2; 3716 const XMMRegister xmm_key11 = xmm3; 3717 const XMMRegister xmm_key12 = xmm4; 3718 const XMMRegister key_tmp = xmm4; 3719 3720 __ BIND(L_singleBlock_loopTopHead[k]); 3721 if (k == 1) { 3722 __ addptr(rsp, 6 * wordSize); 3723 } else if (k == 2) { 3724 __ addptr(rsp, 10 * wordSize); 3725 } 3726 __ cmpptr(len_reg, 0); // any blocks left?? 
3727 __ jcc(Assembler::equal, L_exit); 3728 __ BIND(L_singleBlock_loopTopHead2[k]); 3729 if (k == 1) { 3730 load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0 3731 load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0 3732 } 3733 if (k == 2) { 3734 load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0 3735 } 3736 __ align(OptoLoopAlignment); 3737 __ BIND(L_singleBlock_loopTop[k]); 3738 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3739 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3740 __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds 3741 for (int rnum = 1; rnum <= 9 ; rnum++) { 3742 __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3743 } 3744 if (k == 1) { 3745 __ aesdec(xmm_result, xmm_key11); 3746 __ aesdec(xmm_result, xmm_key12); 3747 } 3748 if (k == 2) { 3749 __ aesdec(xmm_result, xmm_key11); 3750 load_key(key_tmp, key, 0xc0); 3751 __ aesdec(xmm_result, key_tmp); 3752 load_key(key_tmp, key, 0xd0); 3753 __ aesdec(xmm_result, key_tmp); 3754 load_key(key_tmp, key, 0xe0); 3755 __ aesdec(xmm_result, key_tmp); 3756 } 3757 3758 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 3759 __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3760 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3761 // no need to store r to memory until we exit 3762 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3763 __ addptr(pos, AESBlockSize); 3764 __ subptr(len_reg, AESBlockSize); 3765 __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); 3766 if (k != 2) { 3767 __ jmp(L_exit); 3768 } 3769 } //for 128/192/256 3770 3771 __ BIND(L_exit); 3772 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 3773 __ pop(rbx); 3774 #ifdef _WIN64 3775 __ movl(rax, len_mem); 3776 #else 3777 __ pop(rax); // return length 3778 #endif 3779 __ leave(); // required for proper stackwalking of RuntimeStub frame 3780 __ ret(0); 3781 return start; 3782 } 3783 3784 address generate_electronicCodeBook_encryptAESCrypt() { 3785 __ align(CodeEntryAlignment); 3786 StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt"); 3787 address start = __ pc(); 3788 const Register from = c_rarg0; // source array address 3789 const Register to = c_rarg1; // destination array address 3790 const Register key = c_rarg2; // key array address 3791 const Register len = c_rarg3; // src len (must be multiple of blocksize 16) 3792 __ enter(); // required for proper stackwalking of RuntimeStub frame 3793 __ aesecb_encrypt(from, to, key, len); 3794 __ leave(); // required for proper stackwalking of RuntimeStub frame 3795 __ ret(0); 3796 return start; 3797 } 3798 3799 address generate_electronicCodeBook_decryptAESCrypt() { 3800 __ align(CodeEntryAlignment); 3801 StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt"); 3802 address start = __ pc(); 3803 const Register from = c_rarg0; // source array address 3804 const Register to = c_rarg1; // destination array address 3805 const Register key = c_rarg2; // key array address 3806 const Register len = c_rarg3; // src len (must be multiple of blocksize 16) 3807 __ enter(); // required for proper stackwalking of RuntimeStub frame 3808 __ aesecb_decrypt(from, to, key, len); 3809 __
leave(); // required for proper stackwalking of RuntimeStub frame 3810 __ ret(0); 3811 return start; 3812 } 3813 3814 address generate_upper_word_mask() { 3815 __ align(64); 3816 StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); 3817 address start = __ pc(); 3818 __ emit_data64(0x0000000000000000, relocInfo::none); 3819 __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); 3820 return start; 3821 } 3822 3823 address generate_shuffle_byte_flip_mask() { 3824 __ align(64); 3825 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); 3826 address start = __ pc(); 3827 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3828 __ emit_data64(0x0001020304050607, relocInfo::none); 3829 return start; 3830 } 3831 3832 // ofs and limit are used for multi-block byte arrays. 3833 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 3834 address generate_sha1_implCompress(bool multi_block, const char *name) { 3835 __ align(CodeEntryAlignment); 3836 StubCodeMark mark(this, "StubRoutines", name); 3837 address start = __ pc(); 3838 3839 Register buf = c_rarg0; 3840 Register state = c_rarg1; 3841 Register ofs = c_rarg2; 3842 Register limit = c_rarg3; 3843 3844 const XMMRegister abcd = xmm0; 3845 const XMMRegister e0 = xmm1; 3846 const XMMRegister e1 = xmm2; 3847 const XMMRegister msg0 = xmm3; 3848 3849 const XMMRegister msg1 = xmm4; 3850 const XMMRegister msg2 = xmm5; 3851 const XMMRegister msg3 = xmm6; 3852 const XMMRegister shuf_mask = xmm7; 3853 3854 __ enter(); 3855 3856 __ subptr(rsp, 4 * wordSize); 3857 3858 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, 3859 buf, state, ofs, limit, rsp, multi_block); 3860 3861 __ addptr(rsp, 4 * wordSize); 3862 3863 __ leave(); 3864 __ ret(0); 3865 return start; 3866 } 3867 3868 address generate_pshuffle_byte_flip_mask() { 3869 __ align(64); 3870 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); 3871 address start = __ pc(); 3872 __ emit_data64(0x0405060700010203, relocInfo::none); 3873 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 3874 3875 if (VM_Version::supports_avx2()) { 3876 __ emit_data64(0x0405060700010203, relocInfo::none); // second copy 3877 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 3878 // _SHUF_00BA 3879 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3880 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3881 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3882 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3883 // _SHUF_DC00 3884 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3885 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3886 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3887 __ emit_data64(0x0b0a090803020100, relocInfo::none); 3888 } 3889 3890 return start; 3891 } 3892 3893 // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
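// For example: pshufb computes dst[i] = src[mask[i]], so with the control
// qwords 0x0001020304050607 / 0x08090a0b0c0d0e0f emitted below, the source
// qword 0x0123456789abcdef becomes 0xefcdab8967452301, i.e. each 8-byte lane
// has its byte order reversed (the big-endian <-> little-endian flip that
// SHA-512 needs).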
3894 address generate_pshuffle_byte_flip_mask_sha512() { 3895 __ align(32); 3896 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512"); 3897 address start = __ pc(); 3898 if (VM_Version::supports_avx2()) { 3899 __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK 3900 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3901 __ emit_data64(0x1011121314151617, relocInfo::none); 3902 __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none); 3903 __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO 3904 __ emit_data64(0x0000000000000000, relocInfo::none); 3905 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3906 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 3907 } 3908 3909 return start; 3910 } 3911 3912 // ofs and limit are used for multi-block byte arrays. 3913 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 3914 address generate_sha256_implCompress(bool multi_block, const char *name) { 3915 assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), ""); 3916 __ align(CodeEntryAlignment); 3917 StubCodeMark mark(this, "StubRoutines", name); 3918 address start = __ pc(); 3919 3920 Register buf = c_rarg0; 3921 Register state = c_rarg1; 3922 Register ofs = c_rarg2; 3923 Register limit = c_rarg3; 3924 3925 const XMMRegister msg = xmm0; 3926 const XMMRegister state0 = xmm1; 3927 const XMMRegister state1 = xmm2; 3928 const XMMRegister msgtmp0 = xmm3; 3929 3930 const XMMRegister msgtmp1 = xmm4; 3931 const XMMRegister msgtmp2 = xmm5; 3932 const XMMRegister msgtmp3 = xmm6; 3933 const XMMRegister msgtmp4 = xmm7; 3934 3935 const XMMRegister shuf_mask = xmm8; 3936 3937 __ enter(); 3938 3939 __ subptr(rsp, 4 * wordSize); 3940 3941 if (VM_Version::supports_sha()) { 3942 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3943 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3944 } else if (VM_Version::supports_avx2()) { 3945 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3946 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3947 } 3948 __ addptr(rsp, 4 * wordSize); 3949 __ vzeroupper(); 3950 __ leave(); 3951 __ ret(0); 3952 return start; 3953 } 3954 3955 address generate_sha512_implCompress(bool multi_block, const char *name) { 3956 assert(VM_Version::supports_avx2(), ""); 3957 assert(VM_Version::supports_bmi2(), ""); 3958 __ align(CodeEntryAlignment); 3959 StubCodeMark mark(this, "StubRoutines", name); 3960 address start = __ pc(); 3961 3962 Register buf = c_rarg0; 3963 Register state = c_rarg1; 3964 Register ofs = c_rarg2; 3965 Register limit = c_rarg3; 3966 3967 const XMMRegister msg = xmm0; 3968 const XMMRegister state0 = xmm1; 3969 const XMMRegister state1 = xmm2; 3970 const XMMRegister msgtmp0 = xmm3; 3971 const XMMRegister msgtmp1 = xmm4; 3972 const XMMRegister msgtmp2 = xmm5; 3973 const XMMRegister msgtmp3 = xmm6; 3974 const XMMRegister msgtmp4 = xmm7; 3975 3976 const XMMRegister shuf_mask = xmm8; 3977 3978 __ enter(); 3979 3980 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 3981 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 3982 3983 __ vzeroupper(); 3984 __ leave(); 3985 __ ret(0); 3986 return start; 3987 } 3988 3989 // This mask is used for incrementing counter values (linc0, linc4, etc.)
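// Layout of the table emitted below, as byte offsets from its start (each
// entry is four 128-bit lanes wide, matching the in-line comments):
//   +0    lbswapmask - byte-swap shuffle mask, replicated to all lanes
//   +64   linc0      - per-lane counter increments {0, 1, 2, 3}
//   +128  linc4      - per-lane counter increments {4, 4, 4, 4}
//   +192  linc8      - per-lane counter increments {8, 8, 8, 8}
//   +256  linc32     - per-lane counter increments {32, 32, 32, 32}
//   +320  linc16     - per-lane counter increments {16, 16, 16, 16}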
3990 address counter_mask_addr() { 3991 __ align(64); 3992 StubCodeMark mark(this, "StubRoutines", "counter_mask_addr"); 3993 address start = __ pc(); 3994 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask 3995 __ emit_data64(0x0001020304050607, relocInfo::none); 3996 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3997 __ emit_data64(0x0001020304050607, relocInfo::none); 3998 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3999 __ emit_data64(0x0001020304050607, relocInfo::none); 4000 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 4001 __ emit_data64(0x0001020304050607, relocInfo::none); 4002 __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64 4003 __ emit_data64(0x0000000000000000, relocInfo::none); 4004 __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80 4005 __ emit_data64(0x0000000000000000, relocInfo::none); 4006 __ emit_data64(0x0000000000000002, relocInfo::none); 4007 __ emit_data64(0x0000000000000000, relocInfo::none); 4008 __ emit_data64(0x0000000000000003, relocInfo::none); 4009 __ emit_data64(0x0000000000000000, relocInfo::none); 4010 __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128 4011 __ emit_data64(0x0000000000000000, relocInfo::none); 4012 __ emit_data64(0x0000000000000004, relocInfo::none); 4013 __ emit_data64(0x0000000000000000, relocInfo::none); 4014 __ emit_data64(0x0000000000000004, relocInfo::none); 4015 __ emit_data64(0x0000000000000000, relocInfo::none); 4016 __ emit_data64(0x0000000000000004, relocInfo::none); 4017 __ emit_data64(0x0000000000000000, relocInfo::none); 4018 __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192 4019 __ emit_data64(0x0000000000000000, relocInfo::none); 4020 __ emit_data64(0x0000000000000008, relocInfo::none); 4021 __ emit_data64(0x0000000000000000, relocInfo::none); 4022 __ emit_data64(0x0000000000000008, relocInfo::none); 4023 __ emit_data64(0x0000000000000000, relocInfo::none); 4024 __ emit_data64(0x0000000000000008, relocInfo::none); 4025 __ emit_data64(0x0000000000000000, relocInfo::none); 4026 __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256 4027 __ emit_data64(0x0000000000000000, relocInfo::none); 4028 __ emit_data64(0x0000000000000020, relocInfo::none); 4029 __ emit_data64(0x0000000000000000, relocInfo::none); 4030 __ emit_data64(0x0000000000000020, relocInfo::none); 4031 __ emit_data64(0x0000000000000000, relocInfo::none); 4032 __ emit_data64(0x0000000000000020, relocInfo::none); 4033 __ emit_data64(0x0000000000000000, relocInfo::none); 4034 __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320 4035 __ emit_data64(0x0000000000000000, relocInfo::none); 4036 __ emit_data64(0x0000000000000010, relocInfo::none); 4037 __ emit_data64(0x0000000000000000, relocInfo::none); 4038 __ emit_data64(0x0000000000000010, relocInfo::none); 4039 __ emit_data64(0x0000000000000000, relocInfo::none); 4040 __ emit_data64(0x0000000000000010, relocInfo::none); 4041 __ emit_data64(0x0000000000000000, relocInfo::none); 4042 return start; 4043 } 4044 4045 // Vector AES Counter implementation 4046 address generate_counterMode_VectorAESCrypt() { 4047 __ align(CodeEntryAlignment); 4048 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 4049 address start = __ pc(); 4050 const Register from = c_rarg0; // source array address 4051 const Register to = c_rarg1; // destination array address 4052 const Register key = 
c_rarg2; // key array address r8 4053 const Register counter = c_rarg3; // counter byte array initialized from counter array address 4054 // and updated with the incremented counter in the end 4055 #ifndef _WIN64 4056 const Register len_reg = c_rarg4; 4057 const Register saved_encCounter_start = c_rarg5; 4058 const Register used_addr = r10; 4059 const Address used_mem(rbp, 2 * wordSize); 4060 const Register used = r11; 4061 #else 4062 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 4063 const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64 4064 const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64 4065 const Register len_reg = r10; // pick the first volatile windows register 4066 const Register saved_encCounter_start = r11; 4067 const Register used_addr = r13; 4068 const Register used = r14; 4069 #endif 4070 __ enter(); 4071 // Save state before entering routine 4072 __ push(r12); 4073 __ push(r13); 4074 __ push(r14); 4075 __ push(r15); 4076 #ifdef _WIN64 4077 // on win64, fill len_reg from stack position 4078 __ movl(len_reg, len_mem); 4079 __ movptr(saved_encCounter_start, saved_encCounter_mem); 4080 __ movptr(used_addr, used_mem); 4081 __ movl(used, Address(used_addr, 0)); 4082 #else 4083 __ push(len_reg); // Save 4084 __ movptr(used_addr, used_mem); 4085 __ movl(used, Address(used_addr, 0)); 4086 #endif 4087 __ push(rbx); 4088 __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start); 4089 // Restore state before leaving routine 4090 __ pop(rbx); 4091 #ifdef _WIN64 4092 __ movl(rax, len_mem); // return length 4093 #else 4094 __ pop(rax); // return length 4095 #endif 4096 __ pop(r15); 4097 __ pop(r14); 4098 __ pop(r13); 4099 __ pop(r12); 4100 4101 __ leave(); // required for proper stackwalking of RuntimeStub frame 4102 __ ret(0); 4103 return start; 4104 } 4105 4106 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time 4107 // to hide instruction latency 4108 // 4109 // Arguments: 4110 // 4111 // Inputs: 4112 // c_rarg0 - source byte array address 4113 // c_rarg1 - destination byte array address 4114 // c_rarg2 - K (key) in little endian int array 4115 // c_rarg3 - counter vector byte array address 4116 // Linux 4117 // c_rarg4 - input length 4118 // c_rarg5 - saved encryptedCounter start 4119 // rbp + 6 * wordSize - saved used length 4120 // Windows 4121 // rbp + 6 * wordSize - input length 4122 // rbp + 7 * wordSize - saved encryptedCounter start 4123 // rbp + 8 * wordSize - saved used length 4124 // 4125 // Output: 4126 // rax - input length 4127 // 4128 address generate_counterMode_AESCrypt_Parallel() { 4129 assert(UseAES, "need AES instructions and misaligned SSE support"); 4130 __ align(CodeEntryAlignment); 4131 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 4132 address start = __ pc(); 4133 const Register from = c_rarg0; // source array address 4134 const Register to = c_rarg1; // destination array address 4135 const Register key = c_rarg2; // key array address 4136 const Register counter = c_rarg3; // counter byte array initialized from counter array address 4137 // and updated with the incremented counter in the end 4138 #ifndef _WIN64 4139 const Register len_reg = c_rarg4; 4140 const Register saved_encCounter_start = c_rarg5; 4141 const Register used_addr = r10; 4142 const Address used_mem(rbp, 2 * wordSize); 4143 const Register used = r11; 4144 #else 4145 const Address len_mem(rbp, 6 * wordSize); // 
length is on stack on Win64 4146 const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64 4147 const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64 4148 const Register len_reg = r10; // pick the first volatile windows register 4149 const Register saved_encCounter_start = r11; 4150 const Register used_addr = r13; 4151 const Register used = r14; 4152 #endif 4153 const Register pos = rax; 4154 4155 const int PARALLEL_FACTOR = 6; 4156 const XMMRegister xmm_counter_shuf_mask = xmm0; 4157 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 4158 const XMMRegister xmm_curr_counter = xmm2; 4159 4160 const XMMRegister xmm_key_tmp0 = xmm3; 4161 const XMMRegister xmm_key_tmp1 = xmm4; 4162 4163 // registers holding the six results in the parallelized loop 4164 const XMMRegister xmm_result0 = xmm5; 4165 const XMMRegister xmm_result1 = xmm6; 4166 const XMMRegister xmm_result2 = xmm7; 4167 const XMMRegister xmm_result3 = xmm8; 4168 const XMMRegister xmm_result4 = xmm9; 4169 const XMMRegister xmm_result5 = xmm10; 4170 4171 const XMMRegister xmm_from0 = xmm11; 4172 const XMMRegister xmm_from1 = xmm12; 4173 const XMMRegister xmm_from2 = xmm13; 4174 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. 4175 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4; xmm_key_tmp0~1 are not needed while loading the input text 4176 const XMMRegister xmm_from5 = xmm4; 4177 4178 //for key_128, key_192, key_256 4179 const int rounds[3] = {10, 12, 14}; 4180 Label L_exit_preLoop, L_preLoop_start; 4181 Label L_multiBlock_loopTop[3]; 4182 Label L_singleBlockLoopTop[3]; 4183 Label L__incCounter[3][6]; //for 6 blocks 4184 Label L__incCounter_single[3]; //for single block, key128, key192, key256 4185 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; 4186 Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; 4187 4188 Label L_exit; 4189 4190 __ enter(); // required for proper stackwalking of RuntimeStub frame 4191 4192 #ifdef _WIN64 4193 // allocate spill slots for r13, r14 4194 enum { 4195 saved_r13_offset, 4196 saved_r14_offset 4197 }; 4198 __ subptr(rsp, 2 * wordSize); 4199 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 4200 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 4201 4202 // on win64, fill len_reg from stack position 4203 __ movl(len_reg, len_mem); 4204 __ movptr(saved_encCounter_start, saved_encCounter_mem); 4205 __ movptr(used_addr, used_mem); 4206 __ movl(used, Address(used_addr, 0)); 4207 #else 4208 __ push(len_reg); // Save 4209 __ movptr(used_addr, used_mem); 4210 __ movl(used, Address(used_addr, 0)); 4211 #endif 4212 4213 __ push(rbx); // Save RBX 4214 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter 4215 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch 4216 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled 4217 __ movptr(pos, 0); 4218 4219 // Use the partially used encrypted counter from last invocation 4220 __ BIND(L_preLoop_start); 4221 __ cmpptr(used, 16); 4222 __ jcc(Assembler::aboveEqual, L_exit_preLoop); 4223 __ cmpptr(len_reg, 0); 4224 __ jcc(Assembler::lessEqual, L_exit_preLoop); 4225 __ movb(rbx, Address(saved_encCounter_start, used)); 4226 __ xorb(rbx, Address(from, pos));
4227 __ movb(Address(to, pos), rbx); 4228 __ addptr(pos, 1); 4229 __ addptr(used, 1); 4230 __ subptr(len_reg, 1); 4231 4232 __ jmp(L_preLoop_start); 4233 4234 __ BIND(L_exit_preLoop); 4235 __ movl(Address(used_addr, 0), used); 4236 4237 // key length could be only {11, 13, 15} * 4 = {44, 52, 60} 4238 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch 4239 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4240 __ cmpl(rbx, 52); 4241 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); 4242 __ cmpl(rbx, 60); 4243 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); 4244 4245 #define CTR_DoSix(opc, src_reg) \ 4246 __ opc(xmm_result0, src_reg); \ 4247 __ opc(xmm_result1, src_reg); \ 4248 __ opc(xmm_result2, src_reg); \ 4249 __ opc(xmm_result3, src_reg); \ 4250 __ opc(xmm_result4, src_reg); \ 4251 __ opc(xmm_result5, src_reg); 4252 4253 // k == 0 : generate code for key_128 4254 // k == 1 : generate code for key_192 4255 // k == 2 : generate code for key_256 4256 for (int k = 0; k < 3; ++k) { 4257 //multi blocks starts here 4258 __ align(OptoLoopAlignment); 4259 __ BIND(L_multiBlock_loopTop[k]); 4260 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left 4261 __ jcc(Assembler::less, L_singleBlockLoopTop[k]); 4262 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4263 4264 //load, then increase counters 4265 CTR_DoSix(movdqa, xmm_curr_counter); 4266 inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); 4267 inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); 4268 inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); 4269 inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); 4270 inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); 4271 inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); 4272 CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR 4273 CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key 4274 4275 //load two ROUND_KEYs at a time 4276 for (int i = 1; i < rounds[k]; ) { 4277 load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); 4278 load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); 4279 CTR_DoSix(aesenc, xmm_key_tmp1); 4280 i++; 4281 if (i != rounds[k]) { 4282 CTR_DoSix(aesenc, xmm_key_tmp0); 4283 } else { 4284 CTR_DoSix(aesenclast, xmm_key_tmp0); 4285 } 4286 i++; 4287 } 4288 4289 // get next PARALLEL_FACTOR blocks into xmm_result registers 4290 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 4291 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 4292 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 4293 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 4294 __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); 4295 __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); 4296 4297 __ pxor(xmm_result0, xmm_from0); 4298 __ pxor(xmm_result1, xmm_from1); 4299 __ pxor(xmm_result2, xmm_from2); 4300 __ pxor(xmm_result3, xmm_from3); 4301 __ pxor(xmm_result4, xmm_from4); 4302 __ pxor(xmm_result5, xmm_from5); 4303 4304 // store 6 results into the next 96 bytes of output 4305 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4306 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 4307 __ movdqu(Address(to, pos,
Address::times_1, 2 * AESBlockSize), xmm_result2); 4308 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 4309 __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); 4310 __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); 4311 4312 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the 6 processed blocks 4313 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length 4314 __ jmp(L_multiBlock_loopTop[k]); 4315 4316 // singleBlock starts here 4317 __ align(OptoLoopAlignment); 4318 __ BIND(L_singleBlockLoopTop[k]); 4319 __ cmpptr(len_reg, 0); 4320 __ jcc(Assembler::lessEqual, L_exit); 4321 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4322 __ movdqa(xmm_result0, xmm_curr_counter); 4323 inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); 4324 __ pshufb(xmm_result0, xmm_counter_shuf_mask); 4325 __ pxor(xmm_result0, xmm_key_tmp0); 4326 for (int i = 1; i < rounds[k]; i++) { 4327 load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); 4328 __ aesenc(xmm_result0, xmm_key_tmp0); 4329 } 4330 load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); 4331 __ aesenclast(xmm_result0, xmm_key_tmp0); 4332 __ cmpptr(len_reg, AESBlockSize); 4333 __ jcc(Assembler::less, L_processTail_insr[k]); 4334 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 4335 __ pxor(xmm_result0, xmm_from0); 4336 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4337 __ addptr(pos, AESBlockSize); 4338 __ subptr(len_reg, AESBlockSize); 4339 __ jmp(L_singleBlockLoopTop[k]); 4340 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array 4341 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register 4342 __ testptr(len_reg, 8); 4343 __ jcc(Assembler::zero, L_processTail_4_insr[k]); 4344 __ subptr(pos,8); 4345 __ pinsrq(xmm_from0, Address(from, pos), 0); 4346 __ BIND(L_processTail_4_insr[k]); 4347 __ testptr(len_reg, 4); 4348 __ jcc(Assembler::zero, L_processTail_2_insr[k]); 4349 __ subptr(pos,4); 4350 __ pslldq(xmm_from0, 4); 4351 __ pinsrd(xmm_from0, Address(from, pos), 0); 4352 __ BIND(L_processTail_2_insr[k]); 4353 __ testptr(len_reg, 2); 4354 __ jcc(Assembler::zero, L_processTail_1_insr[k]); 4355 __ subptr(pos, 2); 4356 __ pslldq(xmm_from0, 2); 4357 __ pinsrw(xmm_from0, Address(from, pos), 0); 4358 __ BIND(L_processTail_1_insr[k]); 4359 __ testptr(len_reg, 1); 4360 __ jcc(Assembler::zero, L_processTail_exit_insr[k]); 4361 __ subptr(pos, 1); 4362 __ pslldq(xmm_from0, 1); 4363 __ pinsrb(xmm_from0, Address(from, pos), 0); 4364 __ BIND(L_processTail_exit_insr[k]); 4365 4366 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext bytes. 4367 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation. 4368 4369 __ testptr(len_reg, 8); 4370 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest.
array 4371 __ pextrq(Address(to, pos), xmm_result0, 0); 4372 __ psrldq(xmm_result0, 8); 4373 __ addptr(pos, 8); 4374 __ BIND(L_processTail_4_extr[k]); 4375 __ testptr(len_reg, 4); 4376 __ jcc(Assembler::zero, L_processTail_2_extr[k]); 4377 __ pextrd(Address(to, pos), xmm_result0, 0); 4378 __ psrldq(xmm_result0, 4); 4379 __ addptr(pos, 4); 4380 __ BIND(L_processTail_2_extr[k]); 4381 __ testptr(len_reg, 2); 4382 __ jcc(Assembler::zero, L_processTail_1_extr[k]); 4383 __ pextrw(Address(to, pos), xmm_result0, 0); 4384 __ psrldq(xmm_result0, 2); 4385 __ addptr(pos, 2); 4386 __ BIND(L_processTail_1_extr[k]); 4387 __ testptr(len_reg, 1); 4388 __ jcc(Assembler::zero, L_processTail_exit_extr[k]); 4389 __ pextrb(Address(to, pos), xmm_result0, 0); 4390 4391 __ BIND(L_processTail_exit_extr[k]); 4392 __ movl(Address(used_addr, 0), len_reg); 4393 __ jmp(L_exit); 4394 4395 } 4396 4397 __ BIND(L_exit); 4398 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 4399 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back 4400 __ pop(rbx); // pop the saved RBX. 4401 #ifdef _WIN64 4402 __ movl(rax, len_mem); 4403 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 4404 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 4405 __ addptr(rsp, 2 * wordSize); 4406 #else 4407 __ pop(rax); // return 'len' 4408 #endif 4409 __ leave(); // required for proper stackwalking of RuntimeStub frame 4410 __ ret(0); 4411 return start; 4412 } 4413 4414 void roundDec(XMMRegister xmm_reg) { 4415 __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); 4416 __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); 4417 __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); 4418 __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); 4419 __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); 4420 __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); 4421 __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); 4422 __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); 4423 } 4424 4425 void roundDeclast(XMMRegister xmm_reg) { 4426 __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); 4427 __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); 4428 __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); 4429 __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); 4430 __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); 4431 __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); 4432 __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); 4433 __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); 4434 } 4435 4436 void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) { 4437 __ movdqu(xmmdst, Address(key, offset)); 4438 if (xmm_shuf_mask != NULL) { 4439 __ pshufb(xmmdst, xmm_shuf_mask); 4440 } else { 4441 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 4442 } 4443 __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); 4444 4445 } 4446 4447 address generate_cipherBlockChaining_decryptVectorAESCrypt() { 4448 assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support"); 4449 __ align(CodeEntryAlignment); 4450 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 4451 address start = __ pc(); 4452 4453 const Register from = c_rarg0; // source array address 4454 const Register to = c_rarg1; // destination array address 4455 const Register key = c_rarg2; // key array address 4456 const Register rvec = 
c_rarg3; // r byte array initialized from initvector array address 4457 // and left with the results of the last encryption block 4458 #ifndef _WIN64 4459 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4460 #else 4461 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 4462 const Register len_reg = r11; // pick the volatile windows register 4463 #endif 4464 4465 Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop, 4466 Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit; 4467 4468 __ enter(); 4469 4470 #ifdef _WIN64 4471 // on win64, fill len_reg from stack position 4472 __ movl(len_reg, len_mem); 4473 #else 4474 __ push(len_reg); // Save 4475 #endif 4476 __ push(rbx); 4477 __ vzeroupper(); 4478 4479 // Temporary variable declaration for swapping key bytes 4480 const XMMRegister xmm_key_shuf_mask = xmm1; 4481 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 4482 4483 // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds 4484 const Register rounds = rbx; 4485 __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4486 4487 const XMMRegister IV = xmm0; 4488 // Load IV and broadcast value to 512-bits 4489 __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit); 4490 4491 // Temporary variables for storing round keys 4492 const XMMRegister RK0 = xmm30; 4493 const XMMRegister RK1 = xmm9; 4494 const XMMRegister RK2 = xmm18; 4495 const XMMRegister RK3 = xmm19; 4496 const XMMRegister RK4 = xmm20; 4497 const XMMRegister RK5 = xmm21; 4498 const XMMRegister RK6 = xmm22; 4499 const XMMRegister RK7 = xmm23; 4500 const XMMRegister RK8 = xmm24; 4501 const XMMRegister RK9 = xmm25; 4502 const XMMRegister RK10 = xmm26; 4503 4504 // Load and shuffle key 4505 // the java expanded key ordering is rotated one position from what we want 4506 // so we start from 1*16 here and hit 0*16 last 4507 ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask); 4508 ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask); 4509 ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask); 4510 ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask); 4511 ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask); 4512 ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask); 4513 ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask); 4514 ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask); 4515 ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask); 4516 ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask); 4517 ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask); 4518 4519 // Variables for storing source cipher text 4520 const XMMRegister S0 = xmm10; 4521 const XMMRegister S1 = xmm11; 4522 const XMMRegister S2 = xmm12; 4523 const XMMRegister S3 = xmm13; 4524 const XMMRegister S4 = xmm14; 4525 const XMMRegister S5 = xmm15; 4526 const XMMRegister S6 = xmm16; 4527 const XMMRegister S7 = xmm17; 4528 4529 // Variables for storing decrypted text 4530 const XMMRegister B0 = xmm1; 4531 const XMMRegister B1 = xmm2; 4532 const XMMRegister B2 = xmm3; 4533 const XMMRegister B3 = xmm4; 4534 const XMMRegister B4 = xmm5; 4535 const XMMRegister B5 = xmm6; 4536 const XMMRegister B6 = xmm7; 4537 const XMMRegister B7 = xmm8; 4538 4539 __ cmpl(rounds, 44); 4540 __ jcc(Assembler::greater, KEY_192); 4541 __ jmp(Loop); 4542 4543 __ BIND(KEY_192); 4544 const XMMRegister RK11 = xmm27; 4545 const XMMRegister RK12 = xmm28; 4546 ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask); 
4547 ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask); 4548 4549 __ cmpl(rounds, 52); 4550 __ jcc(Assembler::greater, KEY_256); 4551 __ jmp(Loop); 4552 4553 __ BIND(KEY_256); 4554 const XMMRegister RK13 = xmm29; 4555 const XMMRegister RK14 = xmm31; 4556 ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask); 4557 ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask); 4558 4559 __ BIND(Loop); 4560 __ cmpl(len_reg, 512); 4561 __ jcc(Assembler::below, Lcbc_dec_rem); 4562 __ BIND(Loop1); 4563 __ subl(len_reg, 512); 4564 __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit); 4565 __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit); 4566 __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit); 4567 __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit); 4568 __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit); 4569 __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit); 4570 __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit); 4571 __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit); 4572 __ leaq(from, Address(from, 8 * 64)); 4573 4574 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); 4575 __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit); 4576 __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit); 4577 __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit); 4578 __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit); 4579 __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit); 4580 __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit); 4581 __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit); 4582 4583 __ evalignq(IV, S0, IV, 0x06); 4584 __ evalignq(S0, S1, S0, 0x06); 4585 __ evalignq(S1, S2, S1, 0x06); 4586 __ evalignq(S2, S3, S2, 0x06); 4587 __ evalignq(S3, S4, S3, 0x06); 4588 __ evalignq(S4, S5, S4, 0x06); 4589 __ evalignq(S5, S6, S5, 0x06); 4590 __ evalignq(S6, S7, S6, 0x06); 4591 4592 roundDec(RK2); 4593 roundDec(RK3); 4594 roundDec(RK4); 4595 roundDec(RK5); 4596 roundDec(RK6); 4597 roundDec(RK7); 4598 roundDec(RK8); 4599 roundDec(RK9); 4600 roundDec(RK10); 4601 4602 __ cmpl(rounds, 44); 4603 __ jcc(Assembler::belowEqual, L_128); 4604 roundDec(RK11); 4605 roundDec(RK12); 4606 4607 __ cmpl(rounds, 52); 4608 __ jcc(Assembler::belowEqual, L_192); 4609 roundDec(RK13); 4610 roundDec(RK14); 4611 4612 __ BIND(L_256); 4613 roundDeclast(RK0); 4614 __ jmp(Loop2); 4615 4616 __ BIND(L_128); 4617 roundDeclast(RK0); 4618 __ jmp(Loop2); 4619 4620 __ BIND(L_192); 4621 roundDeclast(RK0); 4622 4623 __ BIND(Loop2); 4624 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); 4625 __ evpxorq(B1, B1, S0, Assembler::AVX_512bit); 4626 __ evpxorq(B2, B2, S1, Assembler::AVX_512bit); 4627 __ evpxorq(B3, B3, S2, Assembler::AVX_512bit); 4628 __ evpxorq(B4, B4, S3, Assembler::AVX_512bit); 4629 __ evpxorq(B5, B5, S4, Assembler::AVX_512bit); 4630 __ evpxorq(B6, B6, S5, Assembler::AVX_512bit); 4631 __ evpxorq(B7, B7, S6, Assembler::AVX_512bit); 4632 __ evmovdquq(IV, S7, Assembler::AVX_512bit); 4633 4634 __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit); 4635 __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit); 4636 __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit); 4637 __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit); 4638 __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit); 4639 __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit); 4640 __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit); 4641 __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit); 4642 __ leaq(to, Address(to, 8 * 64)); 4643 __ jmp(Loop); 4644 4645 __ BIND(Lcbc_dec_rem); 
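// Fewer than 512 bytes remain, so decrypt one 16-byte block per iteration.
// IV still holds four 128-bit lanes (the last four ciphertext blocks, or the
// broadcast rvec on direct entry); the evshufi64x2 below with imm8 0x03
// moves lane 3 - the most recent previous-block value - into lane 0, where
// the 128-bit remainder loop expects it.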
4646 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit); 4647 4648 __ BIND(Lcbc_dec_rem_loop); 4649 __ subl(len_reg, 16); 4650 __ jcc(Assembler::carrySet, Lcbc_dec_ret); 4651 4652 __ movdqu(S0, Address(from, 0)); 4653 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); 4654 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit); 4655 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit); 4656 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit); 4657 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit); 4658 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit); 4659 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit); 4660 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit); 4661 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit); 4662 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit); 4663 __ cmpl(rounds, 44); 4664 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); 4665 4666 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit); 4667 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit); 4668 __ cmpl(rounds, 52); 4669 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); 4670 4671 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit); 4672 __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit); 4673 4674 __ BIND(Lcbc_dec_rem_last); 4675 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit); 4676 4677 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); 4678 __ evmovdquq(IV, S0, Assembler::AVX_512bit); 4679 __ movdqu(Address(to, 0), B0); 4680 __ leaq(from, Address(from, 16)); 4681 __ leaq(to, Address(to, 16)); 4682 __ jmp(Lcbc_dec_rem_loop); 4683 4684 __ BIND(Lcbc_dec_ret); 4685 __ movdqu(Address(rvec, 0), IV); 4686 4687 // Zero out the round keys 4688 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit); 4689 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit); 4690 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit); 4691 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit); 4692 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit); 4693 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit); 4694 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit); 4695 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit); 4696 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit); 4697 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit); 4698 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit); 4699 __ cmpl(rounds, 44); 4700 __ jcc(Assembler::belowEqual, Lcbc_exit); 4701 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit); 4702 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit); 4703 __ cmpl(rounds, 52); 4704 __ jcc(Assembler::belowEqual, Lcbc_exit); 4705 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit); 4706 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit); 4707 4708 __ BIND(Lcbc_exit); 4709 __ pop(rbx); 4710 #ifdef _WIN64 4711 __ movl(rax, len_mem); 4712 #else 4713 __ pop(rax); // return length 4714 #endif 4715 __ leave(); // required for proper stackwalking of RuntimeStub frame 4716 __ ret(0); 4717 return start; 4718 } 4719 4720 // Polynomial x^128+x^127+x^126+x^121+1 4721 address ghash_polynomial_addr() { 4722 __ align(CodeEntryAlignment); 4723 StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr"); 4724 address start = __ pc(); 4725 __ emit_data64(0x0000000000000001, relocInfo::none); 4726 __ emit_data64(0xc200000000000000, relocInfo::none); 4727 return start; 4728 } 4729 4730 address ghash_shufflemask_addr() { 4731 __ align(CodeEntryAlignment); 4732 StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr"); 4733 address start = __ pc(); 4734 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); 4735 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); 4736 return start; 4737 } 4738 4739 // Ghash single 
and multi-block operations using AVX instructions 4740 address generate_avx_ghash_processBlocks() { 4741 __ align(CodeEntryAlignment); 4742 4743 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4744 address start = __ pc(); 4745 4746 // arguments 4747 const Register state = c_rarg0; 4748 const Register htbl = c_rarg1; 4749 const Register data = c_rarg2; 4750 const Register blocks = c_rarg3; 4751 __ enter(); 4752 // Save state before entering routine 4753 __ avx_ghash(state, htbl, data, blocks); 4754 __ leave(); // required for proper stackwalking of RuntimeStub frame 4755 __ ret(0); 4756 return start; 4757 } 4758 4759 // byte swap x86 long 4760 address generate_ghash_long_swap_mask() { 4761 __ align(CodeEntryAlignment); 4762 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); 4763 address start = __ pc(); 4764 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); 4765 __ emit_data64(0x0706050403020100, relocInfo::none ); 4766 return start; 4767 } 4768 4769 // byte swap x86 byte array 4770 address generate_ghash_byte_swap_mask() { 4771 __ align(CodeEntryAlignment); 4772 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); 4773 address start = __ pc(); 4774 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); 4775 __ emit_data64(0x0001020304050607, relocInfo::none ); 4776 return start; 4777 } 4778 4779 /* Single and multi-block ghash operations */ 4780 address generate_ghash_processBlocks() { 4781 __ align(CodeEntryAlignment); 4782 Label L_ghash_loop, L_exit; 4783 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4784 address start = __ pc(); 4785 4786 const Register state = c_rarg0; 4787 const Register subkeyH = c_rarg1; 4788 const Register data = c_rarg2; 4789 const Register blocks = c_rarg3; 4790 4791 const XMMRegister xmm_temp0 = xmm0; 4792 const XMMRegister xmm_temp1 = xmm1; 4793 const XMMRegister xmm_temp2 = xmm2; 4794 const XMMRegister xmm_temp3 = xmm3; 4795 const XMMRegister xmm_temp4 = xmm4; 4796 const XMMRegister xmm_temp5 = xmm5; 4797 const XMMRegister xmm_temp6 = xmm6; 4798 const XMMRegister xmm_temp7 = xmm7; 4799 const XMMRegister xmm_temp8 = xmm8; 4800 const XMMRegister xmm_temp9 = xmm9; 4801 const XMMRegister xmm_temp10 = xmm10; 4802 4803 __ enter(); 4804 4805 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); 4806 4807 __ movdqu(xmm_temp0, Address(state, 0)); 4808 __ pshufb(xmm_temp0, xmm_temp10); 4809 4810 4811 __ BIND(L_ghash_loop); 4812 __ movdqu(xmm_temp2, Address(data, 0)); 4813 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); 4814 4815 __ movdqu(xmm_temp1, Address(subkeyH, 0)); 4816 __ pshufb(xmm_temp1, xmm_temp10); 4817 4818 __ pxor(xmm_temp0, xmm_temp2); 4819 4820 // 4821 // Multiply with the hash key 4822 // 4823 __ movdqu(xmm_temp3, xmm_temp0); 4824 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 4825 __ movdqu(xmm_temp4, xmm_temp0); 4826 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 4827 4828 __ movdqu(xmm_temp5, xmm_temp0); 4829 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 4830 __ movdqu(xmm_temp6, xmm_temp0); 4831 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 4832 4833 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 4834 4835 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 4836 __ psrldq(xmm_temp4, 8); // shift xmm4 by 64 bits to the right 4837 __ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left 4838 __ pxor(xmm_temp3, xmm_temp5); 4839 __
pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result 4840 // of the carry-less multiplication of 4841 // xmm0 by xmm1. 4842 4843 // We shift the result of the multiplication by one bit position 4844 // to the left to compensate for the fact that the bits are reversed. 4845 __ movdqu(xmm_temp7, xmm_temp3); 4846 __ movdqu(xmm_temp8, xmm_temp6); 4847 __ pslld(xmm_temp3, 1); 4848 __ pslld(xmm_temp6, 1); 4849 __ psrld(xmm_temp7, 31); 4850 __ psrld(xmm_temp8, 31); 4851 __ movdqu(xmm_temp9, xmm_temp7); 4852 __ pslldq(xmm_temp8, 4); 4853 __ pslldq(xmm_temp7, 4); 4854 __ psrldq(xmm_temp9, 12); 4855 __ por(xmm_temp3, xmm_temp7); 4856 __ por(xmm_temp6, xmm_temp8); 4857 __ por(xmm_temp6, xmm_temp9); 4858 4859 // 4860 // First phase of the reduction 4861 // 4862 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts 4863 // independently. 4864 __ movdqu(xmm_temp7, xmm_temp3); 4865 __ movdqu(xmm_temp8, xmm_temp3); 4866 __ movdqu(xmm_temp9, xmm_temp3); 4867 __ pslld(xmm_temp7, 31); // packed shift left by 31 4868 __ pslld(xmm_temp8, 30); // packed shift left by 30 4869 __ pslld(xmm_temp9, 25); // packed shift left by 25 4870 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions 4871 __ pxor(xmm_temp7, xmm_temp9); 4872 __ movdqu(xmm_temp8, xmm_temp7); 4873 __ pslldq(xmm_temp7, 12); 4874 __ psrldq(xmm_temp8, 4); 4875 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete 4876 4877 // 4878 // Second phase of the reduction 4879 // 4880 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these 4881 // shift operations. 4882 __ movdqu(xmm_temp2, xmm_temp3); 4883 __ movdqu(xmm_temp4, xmm_temp3); 4884 __ movdqu(xmm_temp5, xmm_temp3); 4885 __ psrld(xmm_temp2, 1); // packed shift right by 1 4886 __ psrld(xmm_temp4, 2); // packed shift right by 2 4887 __ psrld(xmm_temp5, 7); // packed shift right by 7 4888 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions 4889 __ pxor(xmm_temp2, xmm_temp5); 4890 __ pxor(xmm_temp2, xmm_temp8); 4891 __ pxor(xmm_temp3, xmm_temp2); 4892 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 4893 4894 __ decrement(blocks); 4895 __ jcc(Assembler::zero, L_exit); 4896 __ movdqu(xmm_temp0, xmm_temp6); 4897 __ addptr(data, 16); 4898 __ jmp(L_ghash_loop); 4899 4900 __ BIND(L_exit); 4901 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result 4902 __ movdqu(Address(state, 0), xmm_temp6); // store the result 4903 __ leave(); 4904 __ ret(0); 4905 return start; 4906 } 4907 4908 // base64 character set 4909 address base64_charset_addr() { 4910 __ align(CodeEntryAlignment); 4911 StubCodeMark mark(this, "StubRoutines", "base64_charset"); 4912 address start = __ pc(); 4913 __ emit_data64(0x0000004200000041, relocInfo::none); 4914 __ emit_data64(0x0000004400000043, relocInfo::none); 4915 __ emit_data64(0x0000004600000045, relocInfo::none); 4916 __ emit_data64(0x0000004800000047, relocInfo::none); 4917 __ emit_data64(0x0000004a00000049, relocInfo::none); 4918 __ emit_data64(0x0000004c0000004b, relocInfo::none); 4919 __ emit_data64(0x0000004e0000004d, relocInfo::none); 4920 __ emit_data64(0x000000500000004f, relocInfo::none); 4921 __ emit_data64(0x0000005200000051, relocInfo::none); 4922 __ emit_data64(0x0000005400000053, relocInfo::none); 4923 __ emit_data64(0x0000005600000055, relocInfo::none); 4924 __ emit_data64(0x0000005800000057, relocInfo::none); 4925 __ emit_data64(0x0000005a00000059, relocInfo::none); 4926 __ emit_data64(0x0000006200000061, relocInfo::none); 4927 __
emit_data64(0x0000006400000063, relocInfo::none); 4928 __ emit_data64(0x0000006600000065, relocInfo::none); 4929 __ emit_data64(0x0000006800000067, relocInfo::none); 4930 __ emit_data64(0x0000006a00000069, relocInfo::none); 4931 __ emit_data64(0x0000006c0000006b, relocInfo::none); 4932 __ emit_data64(0x0000006e0000006d, relocInfo::none); 4933 __ emit_data64(0x000000700000006f, relocInfo::none); 4934 __ emit_data64(0x0000007200000071, relocInfo::none); 4935 __ emit_data64(0x0000007400000073, relocInfo::none); 4936 __ emit_data64(0x0000007600000075, relocInfo::none); 4937 __ emit_data64(0x0000007800000077, relocInfo::none); 4938 __ emit_data64(0x0000007a00000079, relocInfo::none); 4939 __ emit_data64(0x0000003100000030, relocInfo::none); 4940 __ emit_data64(0x0000003300000032, relocInfo::none); 4941 __ emit_data64(0x0000003500000034, relocInfo::none); 4942 __ emit_data64(0x0000003700000036, relocInfo::none); 4943 __ emit_data64(0x0000003900000038, relocInfo::none); 4944 __ emit_data64(0x0000002f0000002b, relocInfo::none); 4945 return start; 4946 } 4947 4948 //base64 url character set 4949 address base64url_charset_addr() { 4950 __ align(CodeEntryAlignment); 4951 StubCodeMark mark(this, "StubRoutines", "base64url_charset"); 4952 address start = __ pc(); 4953 __ emit_data64(0x0000004200000041, relocInfo::none); 4954 __ emit_data64(0x0000004400000043, relocInfo::none); 4955 __ emit_data64(0x0000004600000045, relocInfo::none); 4956 __ emit_data64(0x0000004800000047, relocInfo::none); 4957 __ emit_data64(0x0000004a00000049, relocInfo::none); 4958 __ emit_data64(0x0000004c0000004b, relocInfo::none); 4959 __ emit_data64(0x0000004e0000004d, relocInfo::none); 4960 __ emit_data64(0x000000500000004f, relocInfo::none); 4961 __ emit_data64(0x0000005200000051, relocInfo::none); 4962 __ emit_data64(0x0000005400000053, relocInfo::none); 4963 __ emit_data64(0x0000005600000055, relocInfo::none); 4964 __ emit_data64(0x0000005800000057, relocInfo::none); 4965 __ emit_data64(0x0000005a00000059, relocInfo::none); 4966 __ emit_data64(0x0000006200000061, relocInfo::none); 4967 __ emit_data64(0x0000006400000063, relocInfo::none); 4968 __ emit_data64(0x0000006600000065, relocInfo::none); 4969 __ emit_data64(0x0000006800000067, relocInfo::none); 4970 __ emit_data64(0x0000006a00000069, relocInfo::none); 4971 __ emit_data64(0x0000006c0000006b, relocInfo::none); 4972 __ emit_data64(0x0000006e0000006d, relocInfo::none); 4973 __ emit_data64(0x000000700000006f, relocInfo::none); 4974 __ emit_data64(0x0000007200000071, relocInfo::none); 4975 __ emit_data64(0x0000007400000073, relocInfo::none); 4976 __ emit_data64(0x0000007600000075, relocInfo::none); 4977 __ emit_data64(0x0000007800000077, relocInfo::none); 4978 __ emit_data64(0x0000007a00000079, relocInfo::none); 4979 __ emit_data64(0x0000003100000030, relocInfo::none); 4980 __ emit_data64(0x0000003300000032, relocInfo::none); 4981 __ emit_data64(0x0000003500000034, relocInfo::none); 4982 __ emit_data64(0x0000003700000036, relocInfo::none); 4983 __ emit_data64(0x0000003900000038, relocInfo::none); 4984 __ emit_data64(0x0000005f0000002d, relocInfo::none); 4985 4986 return start; 4987 } 4988 4989 address base64_bswap_mask_addr() { 4990 __ align(CodeEntryAlignment); 4991 StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64"); 4992 address start = __ pc(); 4993 __ emit_data64(0x0504038002010080, relocInfo::none); 4994 __ emit_data64(0x0b0a098008070680, relocInfo::none); 4995 __ emit_data64(0x0908078006050480, relocInfo::none); 4996 __ emit_data64(0x0f0e0d800c0b0a80, 
relocInfo::none); 4997 __ emit_data64(0x0605048003020180, relocInfo::none); 4998 __ emit_data64(0x0c0b0a8009080780, relocInfo::none); 4999 __ emit_data64(0x0504038002010080, relocInfo::none); 5000 __ emit_data64(0x0b0a098008070680, relocInfo::none); 5001 5002 return start; 5003 } 5004 5005 address base64_right_shift_mask_addr() { 5006 __ align(CodeEntryAlignment); 5007 StubCodeMark mark(this, "StubRoutines", "right_shift_mask"); 5008 address start = __ pc(); 5009 __ emit_data64(0x0006000400020000, relocInfo::none); 5010 __ emit_data64(0x0006000400020000, relocInfo::none); 5011 __ emit_data64(0x0006000400020000, relocInfo::none); 5012 __ emit_data64(0x0006000400020000, relocInfo::none); 5013 __ emit_data64(0x0006000400020000, relocInfo::none); 5014 __ emit_data64(0x0006000400020000, relocInfo::none); 5015 __ emit_data64(0x0006000400020000, relocInfo::none); 5016 __ emit_data64(0x0006000400020000, relocInfo::none); 5017 5018 return start; 5019 } 5020 5021 address base64_left_shift_mask_addr() { 5022 __ align(CodeEntryAlignment); 5023 StubCodeMark mark(this, "StubRoutines", "left_shift_mask"); 5024 address start = __ pc(); 5025 __ emit_data64(0x0000000200040000, relocInfo::none); 5026 __ emit_data64(0x0000000200040000, relocInfo::none); 5027 __ emit_data64(0x0000000200040000, relocInfo::none); 5028 __ emit_data64(0x0000000200040000, relocInfo::none); 5029 __ emit_data64(0x0000000200040000, relocInfo::none); 5030 __ emit_data64(0x0000000200040000, relocInfo::none); 5031 __ emit_data64(0x0000000200040000, relocInfo::none); 5032 __ emit_data64(0x0000000200040000, relocInfo::none); 5033 5034 return start; 5035 } 5036 5037 address base64_and_mask_addr() { 5038 __ align(CodeEntryAlignment); 5039 StubCodeMark mark(this, "StubRoutines", "and_mask"); 5040 address start = __ pc(); 5041 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5042 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5043 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5044 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5045 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5046 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5047 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5048 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5049 return start; 5050 } 5051 5052 address base64_gather_mask_addr() { 5053 __ align(CodeEntryAlignment); 5054 StubCodeMark mark(this, "StubRoutines", "gather_mask"); 5055 address start = __ pc(); 5056 __ emit_data64(0xffffffffffffffff, relocInfo::none); 5057 return start; 5058 } 5059 5060 // Code for generating Base64 encoding. 
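// A worked example of the underlying math (illustration only, this is not
// emitted code): each group of 3 input bytes is split into 4 six-bit indices
// into the 64-entry character tables above. For the input "Man":
//
//   bits = 0x4D << 16 | 0x61 << 8 | 0x6E;      // 0x4D616E
//   b64[(bits >>> 18) & 0x3f]  ->  19  ->  'T'
//   b64[(bits >>> 12) & 0x3f]  ->  22  ->  'W'
//   b64[(bits >>>  6) & 0x3f]  ->   5  ->  'F'
//   b64[ bits         & 0x3f]  ->  46  ->  'u'
//
// so "Man" encodes to "TWFu". The vector paths below perform the same split
// with variable shifts and masks, 24 input bytes per 512-bit register.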
5061 // Intrinsic function prototype in Base64.java: 5062 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) { 5063 address generate_base64_encodeBlock() { 5064 __ align(CodeEntryAlignment); 5065 StubCodeMark mark(this, "StubRoutines", "implEncode"); 5066 address start = __ pc(); 5067 __ enter(); 5068 5069 // Save callee-saved registers before using them 5070 __ push(r12); 5071 __ push(r13); 5072 __ push(r14); 5073 __ push(r15); 5074 5075 // arguments 5076 const Register source = c_rarg0; // Source Array 5077 const Register start_offset = c_rarg1; // start offset 5078 const Register end_offset = c_rarg2; // end offset 5079 const Register dest = c_rarg3; // destination array 5080 5081 #ifndef _WIN64 5082 const Register dp = c_rarg4; // Position for writing to dest array 5083 const Register isURL = c_rarg5; // Base64 or URL character set 5084 #else 5085 const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64 5086 const Address isURL_mem(rbp, 7 * wordSize); 5087 const Register isURL = r10; // pick a volatile Windows register 5088 const Register dp = r12; 5089 __ movl(dp, dp_mem); 5090 __ movl(isURL, isURL_mem); 5091 #endif 5092 5093 const Register length = r14; 5094 Label L_process80, L_process32, L_process3, L_exit, L_processdata; 5095 5096 // calculate length from offsets 5097 __ movl(length, end_offset); 5098 __ subl(length, start_offset); 5099 __ cmpl(length, 0); 5100 __ jcc(Assembler::lessEqual, L_exit); 5101 5102 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr())); 5103 // check whether the base64 charset (isURL=0) or the base64 URL charset (isURL=1) needs to be loaded 5104 __ cmpl(isURL, 0); 5105 __ jcc(Assembler::equal, L_processdata); 5106 __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr())); 5107 5108 // load masks required for encoding data 5109 __ BIND(L_processdata); 5110 __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr())); 5111 // Set 64 bits of K register. 5112 __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit); 5113 __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13); 5114 __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13); 5115 __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13); 5116 __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13); 5117 5118 // Vector Base64 implementation, producing 96 bytes of encoded data 5119 __ BIND(L_process80); 5120 __ cmpl(length, 80); 5121 __ jcc(Assembler::below, L_process32); 5122 __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit); 5123 __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit); 5124 __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit); 5125 5126 // permute the input data in such a manner that we have continuity of the source 5127 __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit); 5128 __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit); 5129 __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit); 5130 5131 // shuffle the input, grouping 3 bytes of data and adding 0 as the 4th byte.
5132 // we can deal with 12 bytes at a time in a 128-bit register 5133 __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit); 5134 __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit); 5135 __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit); 5136 5137 // convert byte to word. Each 128-bit register will have 6 bytes for processing 5138 __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit); 5139 __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit); 5140 __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit); 5141 5142 // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values 5143 __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit); 5144 __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit); 5145 __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit); 5146 5147 __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit); 5148 __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit); 5149 __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit); 5150 5151 __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit); 5152 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); 5153 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); 5154 5155 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5156 __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit); 5157 __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit); 5158 5159 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); 5160 __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit); 5161 __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit); 5162 5163 // Get the final 4*6 bits base64 encoding 5164 __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit); 5165 __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit); 5166 __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit); 5167 5168 // Shift 5169 __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5170 __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit); 5171 __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit); 5172 5173 // look up each 6-bit value in the base64 character set to fetch the encoding 5174 // we convert word to dword because gather instructions need dword indices for the lookup 5175 __ vextracti64x4(xmm6, xmm3, 0); 5176 __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit); 5177 __ vextracti64x4(xmm6, xmm3, 1); 5178 __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit); 5179 5180 __ vextracti64x4(xmm6, xmm4, 0); 5181 __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit); 5182 __ vextracti64x4(xmm6, xmm4, 1); 5183 __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit); 5184 5185 __ vextracti64x4(xmm4, xmm5, 0); 5186 __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit); 5187 5188 __ vextracti64x4(xmm4, xmm5, 1); 5189 __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit); 5190 5191 __ kmovql(k2, k3); 5192 __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit); 5193 __ kmovql(k2, k3); 5194 __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit); 5195 __ kmovql(k2, k3); 5196 __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit); 5197 __ kmovql(k2, k3); 5198 __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit); 5199 __ kmovql(k2, k3); 5200 __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); 5201 __ kmovql(k2, k3); 5202 __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit); 5203 5204 // Down-convert dword to byte.
Final output is 16*6 = 96 bytes long 5205 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit); 5206 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit); 5207 __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit); 5208 __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit); 5209 __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit); 5210 __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit); 5211 5212 __ addq(dest, 96); 5213 __ addq(source, 72); 5214 __ subq(length, 72); 5215 __ jmp(L_process80); 5216 5217 // Vector Base64 implementation generating 32 bytes of encoded data 5218 __ BIND(L_process32); 5219 __ cmpl(length, 32); 5220 __ jcc(Assembler::below, L_process3); 5221 __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit); 5222 __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit); 5223 __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit); 5224 __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit); 5225 __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit); 5226 __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit); 5227 5228 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); 5229 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5230 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); 5231 __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit); 5232 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); 5233 __ vextracti64x4(xmm9, xmm1, 0); 5234 __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit); 5235 __ vextracti64x4(xmm9, xmm1, 1); 5236 __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit); 5237 __ kmovql(k2, k3); 5238 __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); 5239 __ kmovql(k2, k3); 5240 __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit); 5241 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit); 5242 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit); 5243 __ subq(length, 24); 5244 __ addq(dest, 32); 5245 __ addq(source, 24); 5246 __ jmp(L_process32); 5247 5248 // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data 5249 /* This code corresponds to the scalar version of the following snippet in Base64.java 5250 ** int bits = (src[sp0++] & 0xff) << 16 | (src[sp0++] & 0xff) << 8 | (src[sp0++] & 0xff); 5251 ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f]; 5252 ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f]; 5253 ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f]; 5254 ** dst[dp0++] = (byte)base64[bits & 0x3f]; */ 5255 __ BIND(L_process3); 5256 __ cmpl(length, 3); 5257 __ jcc(Assembler::below, L_exit); 5258 // Read 1 byte at a time 5259 __ movzbl(rax, Address(source, start_offset)); 5260 __ shll(rax, 0x10); 5261 __ movl(r15, rax); 5262 __ movzbl(rax, Address(source, start_offset, Address::times_1, 1)); 5263 __ shll(rax, 0x8); 5264 __ movzwl(rax, rax); 5265 __ orl(r15, rax); 5266 __ movzbl(rax, Address(source, start_offset, Address::times_1, 2)); 5267 __ orl(rax, r15); 5268 // Save the 3 bytes read in r15 5269 __ movl(r15, rax); 5270 __ shrl(rax, 0x12); 5271 __ andl(rax, 0x3f); 5272 // rax contains the index, r11 contains the base64 lookup table 5273 __ movb(rax, Address(r11, rax, Address::times_4)); 5274 // Write the encoded byte to the destination 5275 __ movb(Address(dest, dp, Address::times_1, 0), rax); 5276 __ movl(rax, r15); 5277 __ shrl(rax, 0xc); 5278 __
andl(rax, 0x3f); 5279 __ movb(rax, Address(r11, rax, Address::times_4)); 5280 __ movb(Address(dest, dp, Address::times_1, 1), rax); 5281 __ movl(rax, r15); 5282 __ shrl(rax, 0x6); 5283 __ andl(rax, 0x3f); 5284 __ movb(rax, Address(r11, rax, Address::times_4)); 5285 __ movb(Address(dest, dp, Address::times_1, 2), rax); 5286 __ movl(rax, r15); 5287 __ andl(rax, 0x3f); 5288 __ movb(rax, Address(r11, rax, Address::times_4)); 5289 __ movb(Address(dest, dp, Address::times_1, 3), rax); 5290 __ subl(length, 3); 5291 __ addq(dest, 4); 5292 __ addq(source, 3); 5293 __ jmp(L_process3); 5294 __ BIND(L_exit); 5295 __ pop(r15); 5296 __ pop(r14); 5297 __ pop(r13); 5298 __ pop(r12); 5299 __ leave(); 5300 __ ret(0); 5301 return start; 5302 } 5303 5304 /** 5305 * Arguments: 5306 * 5307 * Inputs: 5308 * c_rarg0 - int crc 5309 * c_rarg1 - byte* buf 5310 * c_rarg2 - int length 5311 * 5312 * Output: 5313 * rax - int crc result 5314 */ 5315 address generate_updateBytesCRC32() { 5316 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions"); 5317 5318 __ align(CodeEntryAlignment); 5319 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 5320 5321 address start = __ pc(); 5322 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5323 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5324 // rscratch1: r10 5325 const Register crc = c_rarg0; // crc 5326 const Register buf = c_rarg1; // source java byte array address 5327 const Register len = c_rarg2; // length 5328 const Register table = c_rarg3; // crc_table address (reuse register) 5329 const Register tmp = r11; 5330 assert_different_registers(crc, buf, len, table, tmp, rax); 5331 5332 BLOCK_COMMENT("Entry:"); 5333 __ enter(); // required for proper stackwalking of RuntimeStub frame 5334 5335 __ kernel_crc32(crc, buf, len, table, tmp); 5336 5337 __ movl(rax, crc); 5338 __ vzeroupper(); 5339 __ leave(); // required for proper stackwalking of RuntimeStub frame 5340 __ ret(0); 5341 5342 return start; 5343 } 5344 5345 /** 5346 * Arguments: 5347 * 5348 * Inputs: 5349 * c_rarg0 - int crc 5350 * c_rarg1 - byte* buf 5351 * c_rarg2 - long length 5352 * c_rarg3 - table_start - optional (present only when doing a library_call, 5353 * not used by x86 algorithm) 5354 * 5355 * Output: 5356 * rax - int crc result 5357 */ 5358 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) { 5359 assert(UseCRC32CIntrinsics, "need SSE4_2"); 5360 __ align(CodeEntryAlignment); 5361 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 5362 address start = __ pc(); 5363 //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs 5364 //Windows RCX RDX R8 R9 none none XMM0..XMM3 5365 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 5366 const Register crc = c_rarg0; // crc 5367 const Register buf = c_rarg1; // source java byte array address 5368 const Register len = c_rarg2; // length 5369 const Register a = rax; 5370 const Register j = r9; 5371 const Register k = r10; 5372 const Register l = r11; 5373 #ifdef _WIN64 5374 const Register y = rdi; 5375 const Register z = rsi; 5376 #else 5377 const Register y = rcx; 5378 const Register z = r8; 5379 #endif 5380 assert_different_registers(crc, buf, len, a, j, k, l, y, z); 5381 5382 BLOCK_COMMENT("Entry:"); 5383 __ enter(); // required for proper stackwalking of RuntimeStub frame 5384 #ifdef _WIN64 5385
__ pop(z); 5396 __ pop(y); 5397 #endif 5398 __ vzeroupper(); 5399 __ leave(); // required for proper stackwalking of RuntimeStub frame 5400 __ ret(0); 5401 5402 return start; 5403 } 5404 5405 /** 5406 * Arguments: 5407 * 5408 * Input: 5409 * c_rarg0 - x address 5410 * c_rarg1 - x length 5411 * c_rarg2 - y address 5412 * c_rarg3 - y length 5413 * not Win64 5414 * c_rarg4 - z address 5415 * c_rarg5 - z length 5416 * Win64 5417 * rsp+40 - z address 5418 * rsp+48 - z length 5419 */ 5420 address generate_multiplyToLen() { 5421 __ align(CodeEntryAlignment); 5422 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 5423 5424 address start = __ pc(); 5425 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5426 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5427 const Register x = rdi; 5428 const Register xlen = rax; 5429 const Register y = rsi; 5430 const Register ylen = rcx; 5431 const Register z = r8; 5432 const Register zlen = r11; 5433 5434 // Next registers will be saved on stack in multiply_to_len(). 5435 const Register tmp1 = r12; 5436 const Register tmp2 = r13; 5437 const Register tmp3 = r14; 5438 const Register tmp4 = r15; 5439 const Register tmp5 = rbx; 5440 5441 BLOCK_COMMENT("Entry:"); 5442 __ enter(); // required for proper stackwalking of RuntimeStub frame 5443 5444 #ifndef _WIN64 5445 __ movptr(zlen, r9); // Save r9 in r11 - zlen 5446 #endif 5447 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx 5448 // ylen => rcx, z => r8, zlen => r11 5449 // r9 and r10 may be used to save non-volatile registers 5450 #ifdef _WIN64 5451 // last 2 arguments (#4, #5) are on stack on Win64 5452 __ movptr(z, Address(rsp, 6 * wordSize)); 5453 __ movptr(zlen, Address(rsp, 7 * wordSize)); 5454 #endif 5455 5456 __ movptr(xlen, rsi); 5457 __ movptr(y, rdx); 5458 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5); 5459 5460 restore_arg_regs(); 5461 5462 __ leave(); // required for proper stackwalking of RuntimeStub frame 5463 __ ret(0); 5464 5465 return start; 5466 } 5467 5468 /** 5469 * Arguments: 5470 * 5471 * Input: 5472 * c_rarg0 - obja address 5473 * c_rarg1 - objb address 5474 * c_rarg2 - length length 5475 * c_rarg3 - scale log2_array_indxscale 5476 * 5477 * Output: 5478 * rax - int >= mismatched index, < 0 bitwise complement of tail 5479 */ 5480 address generate_vectorizedMismatch() { 5481 __ align(CodeEntryAlignment); 5482 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch"); 5483 address start = __ pc(); 5484 5485 BLOCK_COMMENT("Entry:"); 5486 __ enter(); 5487 5488 #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5489 const Register scale = c_rarg0; //rcx, will exchange with r9 5490 const Register objb = c_rarg1; //rdx 5491 const Register length = c_rarg2; //r8 5492 const Register obja = c_rarg3; //r9 5493 __ xchgq(obja, scale); //now obja and scale contain the correct contents 5494 5495 const Register tmp1 = r10; 5496 const Register tmp2 = r11; 5497 #endif 5498 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5499 const Register obja = c_rarg0; //U:rdi 5500 const Register objb = c_rarg1; //U:rsi 5501 const Register length = c_rarg2; //U:rdx 5502 const Register scale = c_rarg3; //U:rcx 5503 const Register tmp1 = r8; 5504 const Register tmp2 = r9; 5505 #endif 5506 const Register result = rax; //return value 5507 const XMMRegister vec0 = xmm0; 5508 const XMMRegister vec1 = xmm1; 5509 const XMMRegister vec2 = xmm2; 5510 5511 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); 5512 5513 __ vzeroupper(); 5514 __ leave(); 5515 __ ret(0); 5516 5517 return start; 5518 } 5519 5520 /** 5521 * Arguments: 5522 * 5523 * Input: 5524 * c_rarg0 - x address 5525 * c_rarg1 - x length 5526 * c_rarg2 - z address 5527 * c_rarg3 - z length 5528 * 5529 */ 5530 address generate_squareToLen() { 5531 5532 __ align(CodeEntryAlignment); 5533 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 5534 5535 address start = __ pc(); 5536 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5537 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...) 5538 const Register x = rdi; 5539 const Register len = rsi; 5540 const Register z = r8; 5541 const Register zlen = rcx; 5542 5543 const Register tmp1 = r12; 5544 const Register tmp2 = r13; 5545 const Register tmp3 = r14; 5546 const Register tmp4 = r15; 5547 const Register tmp5 = rbx; 5548 5549 BLOCK_COMMENT("Entry:"); 5550 __ enter(); // required for proper stackwalking of RuntimeStub frame 5551 5552 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx 5553 // zlen => rcx 5554 // r9 and r10 may be used to save non-volatile registers 5555 __ movptr(r8, rdx); 5556 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5557 5558 restore_arg_regs(); 5559 5560 __ leave(); // required for proper stackwalking of RuntimeStub frame 5561 __ ret(0); 5562 5563 return start; 5564 } 5565 5566 address generate_method_entry_barrier() { 5567 __ align(CodeEntryAlignment); 5568 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5569 5570 Label deoptimize_label; 5571 5572 address start = __ pc(); 5573 5574 __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing 5575 5576 BLOCK_COMMENT("Entry:"); 5577 __ enter(); // save rbp 5578 5579 // save c_rarg0, because we want to use that value. 5580 // We could do without it but then we depend on the number of slots used by pusha 5581 __ push(c_rarg0); 5582 5583 __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address 5584 5585 __ pusha(); 5586 5587 // The method may have floats as arguments, and we must spill them before calling 5588 // the VM runtime.
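// A sketch of the spill area set up below (assuming, as asserted next, the 8
// Java float argument registers xmm0-xmm7 at 16 bytes each):
//
//   rsp + 0    [ xmm0 ]   <- written last
//   rsp + 16   [ xmm1 ]
//   ...
//   rsp + 112  [ xmm7 ]   <- written first
//
// i.e. 8 * 16 = 128 bytes are reserved before the call_VM_leaf and restored
// and released afterwards.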
5589 assert(Argument::n_float_register_parameters_j == 8, "Assumption"); 5590 const int xmm_size = wordSize * 2; 5591 const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j; 5592 __ subptr(rsp, xmm_spill_size); 5593 __ movdqu(Address(rsp, xmm_size * 7), xmm7); 5594 __ movdqu(Address(rsp, xmm_size * 6), xmm6); 5595 __ movdqu(Address(rsp, xmm_size * 5), xmm5); 5596 __ movdqu(Address(rsp, xmm_size * 4), xmm4); 5597 __ movdqu(Address(rsp, xmm_size * 3), xmm3); 5598 __ movdqu(Address(rsp, xmm_size * 2), xmm2); 5599 __ movdqu(Address(rsp, xmm_size * 1), xmm1); 5600 __ movdqu(Address(rsp, xmm_size * 0), xmm0); 5601 5602 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1); 5603 5604 __ movdqu(xmm0, Address(rsp, xmm_size * 0)); 5605 __ movdqu(xmm1, Address(rsp, xmm_size * 1)); 5606 __ movdqu(xmm2, Address(rsp, xmm_size * 2)); 5607 __ movdqu(xmm3, Address(rsp, xmm_size * 3)); 5608 __ movdqu(xmm4, Address(rsp, xmm_size * 4)); 5609 __ movdqu(xmm5, Address(rsp, xmm_size * 5)); 5610 __ movdqu(xmm6, Address(rsp, xmm_size * 6)); 5611 __ movdqu(xmm7, Address(rsp, xmm_size * 7)); 5612 __ addptr(rsp, xmm_spill_size); 5613 5614 __ cmpl(rax, 1); // 1 means deoptimize 5615 __ jcc(Assembler::equal, deoptimize_label); 5616 5617 __ popa(); 5618 __ pop(c_rarg0); 5619 5620 __ leave(); 5621 5622 __ addptr(rsp, 1 * wordSize); // cookie 5623 __ ret(0); 5624 5625 5626 __ BIND(deoptimize_label); 5627 5628 __ popa(); 5629 __ pop(c_rarg0); 5630 5631 __ leave(); 5632 5633 // This can be taken out, but is good for verification purposes. Getting a SIGSEGV 5634 // here while still having a correct stack is valuable. 5635 __ testptr(rsp, Address(rsp, 0)); 5636 5637 __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier 5638 __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point 5639 5640 return start; 5641 } 5642 5643 /** 5644 * Arguments: 5645 * 5646 * Input: 5647 * c_rarg0 - out address 5648 * c_rarg1 - in address 5649 * c_rarg2 - offset 5650 * c_rarg3 - len 5651 * not Win64 5652 * c_rarg4 - k 5653 * Win64 5654 * rsp+40 - k 5655 */ 5656 address generate_mulAdd() { 5657 __ align(CodeEntryAlignment); 5658 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 5659 5660 address start = __ pc(); 5661 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5662 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5663 const Register out = rdi; 5664 const Register in = rsi; 5665 const Register offset = r11; 5666 const Register len = rcx; 5667 const Register k = r8; 5668 5669 // Next registers will be saved on stack in mul_add().
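// For reference, the operation being intrinsified is (roughly) the scalar
// loop of java.math.BigInteger.implMulAdd -- a sketch for orientation, not
// the emitted code:
//
//   // out[offset], out[offset-1], ..., out[offset-len+1] accumulate
//   // in[len-1 .. 0] * k plus a running carry, 32 bits at a time.
//   long kLong = k & 0xffffffffL;
//   long carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & 0xffffffffL) * kLong
//                  + (out[offset] & 0xffffffffL) + carry;
//     out[offset--] = (int) product;
//     carry = product >>> 32;
//   }
//   return (int) carry;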
5670 const Register tmp1 = r12; 5671 const Register tmp2 = r13; 5672 const Register tmp3 = r14; 5673 const Register tmp4 = r15; 5674 const Register tmp5 = rbx; 5675 5676 BLOCK_COMMENT("Entry:"); 5677 __ enter(); // required for proper stackwalking of RuntimeStub frame 5678 5679 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx 5680 // len => rcx, k => r8 5681 // r9 and r10 may be used to save non-volatile registers 5682 #ifdef _WIN64 5683 // last argument is on stack on Win64 5684 __ movl(k, Address(rsp, 6 * wordSize)); 5685 #endif 5686 __ movptr(r11, rdx); // move offset in rdx to offset(r11) 5687 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5688 5689 restore_arg_regs(); 5690 5691 __ leave(); // required for proper stackwalking of RuntimeStub frame 5692 __ ret(0); 5693 5694 return start; 5695 } 5696 5697 address generate_libmExp() { 5698 StubCodeMark mark(this, "StubRoutines", "libmExp"); 5699 5700 address start = __ pc(); 5701 5702 const XMMRegister x0 = xmm0; 5703 const XMMRegister x1 = xmm1; 5704 const XMMRegister x2 = xmm2; 5705 const XMMRegister x3 = xmm3; 5706 5707 const XMMRegister x4 = xmm4; 5708 const XMMRegister x5 = xmm5; 5709 const XMMRegister x6 = xmm6; 5710 const XMMRegister x7 = xmm7; 5711 5712 const Register tmp = r11; 5713 5714 BLOCK_COMMENT("Entry:"); 5715 __ enter(); // required for proper stackwalking of RuntimeStub frame 5716 5717 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 5718 5719 __ leave(); // required for proper stackwalking of RuntimeStub frame 5720 __ ret(0); 5721 5722 return start; 5723 5724 } 5725 5726 address generate_libmLog() { 5727 StubCodeMark mark(this, "StubRoutines", "libmLog"); 5728 5729 address start = __ pc(); 5730 5731 const XMMRegister x0 = xmm0; 5732 const XMMRegister x1 = xmm1; 5733 const XMMRegister x2 = xmm2; 5734 const XMMRegister x3 = xmm3; 5735 5736 const XMMRegister x4 = xmm4; 5737 const XMMRegister x5 = xmm5; 5738 const XMMRegister x6 = xmm6; 5739 const XMMRegister x7 = xmm7; 5740 5741 const Register tmp1 = r11; 5742 const Register tmp2 = r8; 5743 5744 BLOCK_COMMENT("Entry:"); 5745 __ enter(); // required for proper stackwalking of RuntimeStub frame 5746 5747 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2); 5748 5749 __ leave(); // required for proper stackwalking of RuntimeStub frame 5750 __ ret(0); 5751 5752 return start; 5753 5754 } 5755 5756 address generate_libmLog10() { 5757 StubCodeMark mark(this, "StubRoutines", "libmLog10"); 5758 5759 address start = __ pc(); 5760 5761 const XMMRegister x0 = xmm0; 5762 const XMMRegister x1 = xmm1; 5763 const XMMRegister x2 = xmm2; 5764 const XMMRegister x3 = xmm3; 5765 5766 const XMMRegister x4 = xmm4; 5767 const XMMRegister x5 = xmm5; 5768 const XMMRegister x6 = xmm6; 5769 const XMMRegister x7 = xmm7; 5770 5771 const Register tmp = r11; 5772 5773 BLOCK_COMMENT("Entry:"); 5774 __ enter(); // required for proper stackwalking of RuntimeStub frame 5775 5776 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 5777 5778 __ leave(); // required for proper stackwalking of RuntimeStub frame 5779 __ ret(0); 5780 5781 return start; 5782 5783 } 5784 5785 address generate_libmPow() { 5786 StubCodeMark mark(this, "StubRoutines", "libmPow"); 5787 5788 address start = __ pc(); 5789 5790 const XMMRegister x0 = xmm0; 5791 const XMMRegister x1 = xmm1; 5792 const XMMRegister x2 = xmm2; 5793 const XMMRegister x3 = xmm3; 5794 5795 const XMMRegister x4 = xmm4; 5796 const XMMRegister x5 = xmm5; 5797 const XMMRegister 
x6 = xmm6; 5798 const XMMRegister x7 = xmm7; 5799 5800 const Register tmp1 = r8; 5801 const Register tmp2 = r9; 5802 const Register tmp3 = r10; 5803 const Register tmp4 = r11; 5804 5805 BLOCK_COMMENT("Entry:"); 5806 __ enter(); // required for proper stackwalking of RuntimeStub frame 5807 5808 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5809 5810 __ leave(); // required for proper stackwalking of RuntimeStub frame 5811 __ ret(0); 5812 5813 return start; 5814 5815 } 5816 5817 address generate_libmSin() { 5818 StubCodeMark mark(this, "StubRoutines", "libmSin"); 5819 5820 address start = __ pc(); 5821 5822 const XMMRegister x0 = xmm0; 5823 const XMMRegister x1 = xmm1; 5824 const XMMRegister x2 = xmm2; 5825 const XMMRegister x3 = xmm3; 5826 5827 const XMMRegister x4 = xmm4; 5828 const XMMRegister x5 = xmm5; 5829 const XMMRegister x6 = xmm6; 5830 const XMMRegister x7 = xmm7; 5831 5832 const Register tmp1 = r8; 5833 const Register tmp2 = r9; 5834 const Register tmp3 = r10; 5835 const Register tmp4 = r11; 5836 5837 BLOCK_COMMENT("Entry:"); 5838 __ enter(); // required for proper stackwalking of RuntimeStub frame 5839 5840 #ifdef _WIN64 5841 __ push(rsi); 5842 __ push(rdi); 5843 #endif 5844 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5845 5846 #ifdef _WIN64 5847 __ pop(rdi); 5848 __ pop(rsi); 5849 #endif 5850 5851 __ leave(); // required for proper stackwalking of RuntimeStub frame 5852 __ ret(0); 5853 5854 return start; 5855 5856 } 5857 5858 address generate_libmCos() { 5859 StubCodeMark mark(this, "StubRoutines", "libmCos"); 5860 5861 address start = __ pc(); 5862 5863 const XMMRegister x0 = xmm0; 5864 const XMMRegister x1 = xmm1; 5865 const XMMRegister x2 = xmm2; 5866 const XMMRegister x3 = xmm3; 5867 5868 const XMMRegister x4 = xmm4; 5869 const XMMRegister x5 = xmm5; 5870 const XMMRegister x6 = xmm6; 5871 const XMMRegister x7 = xmm7; 5872 5873 const Register tmp1 = r8; 5874 const Register tmp2 = r9; 5875 const Register tmp3 = r10; 5876 const Register tmp4 = r11; 5877 5878 BLOCK_COMMENT("Entry:"); 5879 __ enter(); // required for proper stackwalking of RuntimeStub frame 5880 5881 #ifdef _WIN64 5882 __ push(rsi); 5883 __ push(rdi); 5884 #endif 5885 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5886 5887 #ifdef _WIN64 5888 __ pop(rdi); 5889 __ pop(rsi); 5890 #endif 5891 5892 __ leave(); // required for proper stackwalking of RuntimeStub frame 5893 __ ret(0); 5894 5895 return start; 5896 5897 } 5898 5899 address generate_libmTan() { 5900 StubCodeMark mark(this, "StubRoutines", "libmTan"); 5901 5902 address start = __ pc(); 5903 5904 const XMMRegister x0 = xmm0; 5905 const XMMRegister x1 = xmm1; 5906 const XMMRegister x2 = xmm2; 5907 const XMMRegister x3 = xmm3; 5908 5909 const XMMRegister x4 = xmm4; 5910 const XMMRegister x5 = xmm5; 5911 const XMMRegister x6 = xmm6; 5912 const XMMRegister x7 = xmm7; 5913 5914 const Register tmp1 = r8; 5915 const Register tmp2 = r9; 5916 const Register tmp3 = r10; 5917 const Register tmp4 = r11; 5918 5919 BLOCK_COMMENT("Entry:"); 5920 __ enter(); // required for proper stackwalking of RuntimeStub frame 5921 5922 #ifdef _WIN64 5923 __ push(rsi); 5924 __ push(rdi); 5925 #endif 5926 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5927 5928 #ifdef _WIN64 5929 __ pop(rdi); 5930 __ pop(rsi); 5931 #endif 5932 5933 __ leave(); // required for proper stackwalking of RuntimeStub frame 5934 __ ret(0); 5935 5936 return 
start; 5937 5938 } 5939 5940 #undef __ 5941 #define __ masm-> 5942 5943 // Continuation point for throwing of implicit exceptions that are 5944 // not handled in the current activation. Fabricates an exception 5945 // oop and initiates normal exception dispatching in this 5946 // frame. Since we need to preserve callee-saved values (currently 5947 // only for C2, but done for C1 as well) we need a callee-saved oop 5948 // map and therefore have to make these stubs into RuntimeStubs 5949 // rather than BufferBlobs. If the compiler needs all registers to 5950 // be preserved between the fault point and the exception handler 5951 // then it must assume responsibility for that in 5952 // AbstractCompiler::continuation_for_implicit_null_exception or 5953 // continuation_for_implicit_division_by_zero_exception. All other 5954 // implicit exceptions (e.g., NullPointerException or 5955 // AbstractMethodError on entry) are either at call sites or 5956 // otherwise assume that stack unwinding will be initiated, so 5957 // caller saved registers were assumed volatile in the compiler. 5958 address generate_throw_exception(const char* name, 5959 address runtime_entry, 5960 Register arg1 = noreg, 5961 Register arg2 = noreg) { 5962 // Information about frame layout at time of blocking runtime call. 5963 // Note that we only have to preserve callee-saved registers since 5964 // the compilers are responsible for supplying a continuation point 5965 // if they expect all registers to be preserved. 5966 enum layout { 5967 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 5968 rbp_off2, 5969 return_off, 5970 return_off2, 5971 framesize // inclusive of return address 5972 }; 5973 5974 int insts_size = 512; 5975 int locs_size = 64; 5976 5977 CodeBuffer code(name, insts_size, locs_size); 5978 OopMapSet* oop_maps = new OopMapSet(); 5979 MacroAssembler* masm = new MacroAssembler(&code); 5980 5981 address start = __ pc(); 5982 5983 // This is an inlined and slightly modified version of call_VM 5984 // which has the ability to fetch the return PC out of 5985 // thread-local storage and also sets up last_Java_sp slightly 5986 // differently than the real call_VM 5987 5988 __ enter(); // required for proper stackwalking of RuntimeStub frame 5989 5990 assert(is_even(framesize/2), "sp not 16-byte aligned"); 5991 5992 // return address and rbp are already in place 5993 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 5994 5995 int frame_complete = __ pc() - start; 5996 5997 // Set up last_Java_sp and last_Java_fp 5998 address the_pc = __ pc(); 5999 __ set_last_Java_frame(rsp, rbp, the_pc); 6000 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 6001 6002 // Call runtime 6003 if (arg1 != noreg) { 6004 assert(arg2 != c_rarg1, "clobbered"); 6005 __ movptr(c_rarg1, arg1); 6006 } 6007 if (arg2 != noreg) { 6008 __ movptr(c_rarg2, arg2); 6009 } 6010 __ movptr(c_rarg0, r15_thread); 6011 BLOCK_COMMENT("call runtime_entry"); 6012 __ call(RuntimeAddress(runtime_entry)); 6013 6014 // Generate oop map 6015 OopMap* map = new OopMap(framesize, 0); 6016 6017 oop_maps->add_gc_map(the_pc - start, map); 6018 6019 __ reset_last_Java_frame(true); 6020 6021 __ leave(); // required for proper stackwalking of RuntimeStub frame 6022 6023 // check for pending exceptions 6024 #ifdef ASSERT 6025 Label L; 6026 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), 6027 (int32_t) NULL_WORD); 6028 __ jcc(Assembler::notEqual, L); 6029 __ should_not_reach_here(); 6030 __ bind(L); 6031 #endif // ASSERT 6032 __ 
jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 6033 6034 6035 // codeBlob framesize is in words (not VMRegImpl::slot_size) 6036 RuntimeStub* stub = 6037 RuntimeStub::new_runtime_stub(name, 6038 &code, 6039 frame_complete, 6040 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6041 oop_maps, false); 6042 return stub->entry_point(); 6043 } 6044 6045 void create_control_words() { 6046 // Round to nearest, 53-bit mode, exceptions masked 6047 StubRoutines::_fpu_cntrl_wrd_std = 0x027F; 6048 // Round to zero, 53-bit mode, exceptions masked 6049 StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F; 6050 // Round to nearest, 24-bit mode, exceptions masked 6051 StubRoutines::_fpu_cntrl_wrd_24 = 0x007F; 6052 // Round to nearest, 64-bit mode, exceptions masked 6053 StubRoutines::_mxcsr_std = 0x1F80; 6054 // Note: the following two constants are 80-bit values 6055 // layout is critical for correct loading by FPU. 6056 // Bias for strict fp multiply/divide 6057 StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000 6058 StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000; 6059 StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff; 6060 // Un-Bias for strict fp multiply/divide 6061 StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000 6062 StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000; 6063 StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff; 6064 } 6065 6066 // Initialization 6067 void generate_initial() { 6068 // Generates all stubs and initializes the entry points 6069 6070 // These platform-specific settings are needed by generate_call_stub() 6071 create_control_words(); 6072 6073 // entry points that exist on all platforms. Note: This is code 6074 // that could be shared among different platforms - however the 6075 // benefit seems to be smaller than the disadvantage of having a 6076 // much more complicated generator structure. See also the comment in 6077 // stubRoutines.hpp. 6078 6079 StubRoutines::_forward_exception_entry = generate_forward_exception(); 6080 6081 StubRoutines::_call_stub_entry = 6082 generate_call_stub(StubRoutines::_call_stub_return_address); 6083 6084 // is referenced by megamorphic call 6085 StubRoutines::_catch_exception_entry = generate_catch_exception(); 6086 6087 // atomic calls 6088 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 6089 StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long(); 6090 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 6091 StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte(); 6092 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 6093 StubRoutines::_atomic_add_entry = generate_atomic_add(); 6094 StubRoutines::_atomic_add_long_entry = generate_atomic_add_long(); 6095 StubRoutines::_fence_entry = generate_orderaccess_fence(); 6096 6097 // platform dependent 6098 StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp(); 6099 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp(); 6100 6101 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr(); 6102 6103 // Build this early so it's available for the interpreter.
6104 StubRoutines::_throw_StackOverflowError_entry = 6105 generate_throw_exception("StackOverflowError throw_exception", 6106 CAST_FROM_FN_PTR(address, 6107 SharedRuntime:: 6108 throw_StackOverflowError)); 6109 StubRoutines::_throw_delayed_StackOverflowError_entry = 6110 generate_throw_exception("delayed StackOverflowError throw_exception", 6111 CAST_FROM_FN_PTR(address, 6112 SharedRuntime:: 6113 throw_delayed_StackOverflowError)); 6114 if (UseCRC32Intrinsics) { 6115 // set the table address before generating stubs that use it 6116 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; 6117 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 6118 } 6119 6120 if (UseCRC32CIntrinsics) { 6121 bool supports_clmul = VM_Version::supports_clmul(); 6122 StubRoutines::x86::generate_CRC32C_table(supports_clmul); 6123 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; 6124 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); 6125 } 6126 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) { 6127 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) || 6128 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) || 6129 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 6130 StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF; 6131 StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2; 6132 StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4; 6133 StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable; 6134 StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2; 6135 StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3; 6136 StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1; 6137 StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE; 6138 StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4; 6139 StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV; 6140 StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK; 6141 StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1; 6142 StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3; 6143 StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO; 6144 } 6145 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) { 6146 StubRoutines::_dexp = generate_libmExp(); 6147 } 6148 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 6149 StubRoutines::_dlog = generate_libmLog(); 6150 } 6151 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) { 6152 StubRoutines::_dlog10 = generate_libmLog10(); 6153 } 6154 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) { 6155 StubRoutines::_dpow = generate_libmPow(); 6156 } 6157 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 6158 StubRoutines::_dsin = generate_libmSin(); 6159 } 6160 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 6161 StubRoutines::_dcos = generate_libmCos(); 6162 } 6163 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 6164 StubRoutines::_dtan = generate_libmTan(); 6165 } 6166 } 6167 } 6168 6169 void generate_all() { 6170 // Generates all stubs and initializes the entry points 6171 6172 // These entry points require SharedInfo::stack0 to be set up in 6173 // non-core builds and need to be relocatable, so they each 6174 // fabricate a RuntimeStub internally.
6175 StubRoutines::_throw_AbstractMethodError_entry = 6176 generate_throw_exception("AbstractMethodError throw_exception", 6177 CAST_FROM_FN_PTR(address, 6178 SharedRuntime:: 6179 throw_AbstractMethodError)); 6180 6181 StubRoutines::_throw_IncompatibleClassChangeError_entry = 6182 generate_throw_exception("IncompatibleClassChangeError throw_exception", 6183 CAST_FROM_FN_PTR(address, 6184 SharedRuntime:: 6185 throw_IncompatibleClassChangeError)); 6186 6187 StubRoutines::_throw_NullPointerException_at_call_entry = 6188 generate_throw_exception("NullPointerException at call throw_exception", 6189 CAST_FROM_FN_PTR(address, 6190 SharedRuntime:: 6191 throw_NullPointerException_at_call)); 6192 6193 // entry points that are platform specific 6194 StubRoutines::x86::_f2i_fixup = generate_f2i_fixup(); 6195 StubRoutines::x86::_f2l_fixup = generate_f2l_fixup(); 6196 StubRoutines::x86::_d2i_fixup = generate_d2i_fixup(); 6197 StubRoutines::x86::_d2l_fixup = generate_d2l_fixup(); 6198 6199 StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF); 6200 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 6201 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); 6202 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); 6203 StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF); 6204 StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000); 6205 StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF); 6206 StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000); 6207 StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff); 6208 StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); 6209 StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000); 6210 6211 // support for verify_oop (must happen after universe_init) 6212 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 6213 6214 // data cache line writeback 6215 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 6216 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 6217 6218 // arraycopy stubs used by compilers 6219 generate_arraycopy_stubs(); 6220 6221 // don't bother generating these AES intrinsic stubs unless global flag is set 6222 if (UseAESIntrinsics) { 6223 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others 6224 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6225 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6226 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 6227 if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) { 6228 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt(); 6229 StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt(); 6230 StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt(); 6231 } 
    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
        StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
        StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
      } else {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
      }
    }
    if (UseAESCTRIntrinsics) {
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
        StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
        StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
      } else {
        StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
        StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
      }
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
      // Duplicate each 128-bit row of the k256 round-constant table into both
      // halves of a 256-bit row of k256_W, the layout the AVX2 SHA-256 code
      // expects for its 256-bit loads.
      char* dst = (char*)StubRoutines::x86::_k256_W;
      char* src = (char*)StubRoutines::x86::_k256;
      for (int ii = 0; ii < 16; ++ii) {
        memcpy(dst + 32 * ii, src + 16 * ii, 16);
        memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
      }
      StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      if (VM_Version::supports_avx()) {
        StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
        StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
        StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
      } else {
        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      }
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::x86::_and_mask = base64_and_mask_addr();
      StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
      StubRoutines::x86::_base64_charset = base64_charset_addr();
      StubRoutines::x86::_url_charset = base64url_charset_addr();
      StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
      StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
      StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    // Safefetch stubs.
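    //
    // SafeFetch32/SafeFetchN load a value from an address that may be
    // unmapped, without crashing the VM: if the load at the recorded fault pc
    // traps, the signal handler resumes the stub at its continuation pc,
    // which returns the caller-supplied error value instead of the loaded
    // one. Callers go through the inline wrappers in stubRoutines.hpp, e.g.
    //
    //   int v = SafeFetch32(addr, -1);   // yields -1 if addr is unreadable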
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#ifndef _WINDOWS
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif // !_WINDOWS
#endif // COMPILER2

    if (UseVectorizedMismatchIntrinsic) {
      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 16
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}
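
// StubGenerator_generate is invoked twice during startup (see
// stubRoutines.cpp): first with all == false, before universe init, to
// produce the stubs the interpreter needs right away (generate_initial()),
// and again with all == true for everything above (generate_all()).
// Roughly (sketch only, buffer handling simplified):
//
//   CodeBuffer buffer1(StubRoutines::_code1);
//   StubGenerator_generate(&buffer1, false);  // early stubs
//   ... universe init ...
//   CodeBuffer buffer2(StubRoutines::_code2);
//   StubGenerator_generate(&buffer2, true);   // remaining stubs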