1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_x86.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/continuation.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 // Declaration and definition of StubGenerator (no .hpp file).
  50 // For a more detailed description of the stub routine structure
  51 // see the comment in stubRoutines.hpp
  52 
  53 #define __ _masm->
  54 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  55 #define a__ ((Assembler*)_masm)->
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) __ block_comment(str)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     // This can destroy rscratch1 if counter is far from the code cache
  76     __ incrementl(ExternalAddress((address)&counter));
  77   }
  78 #define inc_counter_np(counter) \
  79   BLOCK_COMMENT("inc_counter " #counter); \
  80   inc_counter_np_(counter);
  81 #endif
  82 
  83   // Call stubs are used to call Java from C
  84   //
  85   // Linux Arguments:
  86   //    c_rarg0:   call wrapper address                   address
  87   //    c_rarg1:   result                                 address
  88   //    c_rarg2:   result type                            BasicType
  89   //    c_rarg3:   method                                 Method*
  90   //    c_rarg4:   (interpreter) entry point              address
  91   //    c_rarg5:   parameters                             intptr_t*
  92   //    16(rbp): parameter size (in words)              int
  93   //    24(rbp): thread                                 Thread*
  94   //
  95   //     [ return_from_Java     ] <--- rsp
  96   //     [ argument word n      ]
  97   //      ...
  98   // -12 [ argument word 1      ]
  99   // -11 [ saved r15            ] <--- rsp_after_call
 100   // -10 [ saved r14            ]
 101   //  -9 [ saved r13            ]
 102   //  -8 [ saved r12            ]
 103   //  -7 [ saved rbx            ]
 104   //  -6 [ call wrapper         ]
 105   //  -5 [ result               ]
 106   //  -4 [ result type          ]
 107   //  -3 [ method               ]
 108   //  -2 [ entry point          ]
 109   //  -1 [ parameters           ]
 110   //   0 [ saved rbp            ] <--- rbp
 111   //   1 [ return address       ]
 112   //   2 [ parameter size       ]
 113   //   3 [ thread               ]
 114   //
 115   // Windows Arguments:
 116   //    c_rarg0:   call wrapper address                   address
 117   //    c_rarg1:   result                                 address
 118   //    c_rarg2:   result type                            BasicType
 119   //    c_rarg3:   method                                 Method*
 120   //    48(rbp): (interpreter) entry point              address
 121   //    56(rbp): parameters                             intptr_t*
 122   //    64(rbp): parameter size (in words)              int
 123   //    72(rbp): thread                                 Thread*
 124   //
 125   //     [ return_from_Java     ] <--- rsp
 126   //     [ argument word n      ]
 127   //      ...
 128   // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
 130   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 131   // -27 [ saved xmm15          ]
 132   //     [ saved xmm7-xmm14     ]
 133   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 134   //  -7 [ saved r15            ]
 135   //  -6 [ saved r14            ]
 136   //  -5 [ saved r13            ]
 137   //  -4 [ saved r12            ]
 138   //  -3 [ saved rdi            ]
 139   //  -2 [ saved rsi            ]
 140   //  -1 [ saved rbx            ]
 141   //   0 [ saved rbp            ] <--- rbp
 142   //   1 [ return address       ]
 143   //   2 [ call wrapper         ]
 144   //   3 [ result               ]
 145   //   4 [ result type          ]
 146   //   5 [ method               ]
 147   //   6 [ entry point          ]
 148   //   7 [ parameters           ]
 149   //   8 [ parameter size       ]
 150   //   9 [ thread               ]
 151   //
  //    Windows reserves the caller's stack space for arguments 1-4.
 153   //    We spill c_rarg0-c_rarg3 to this space.
 154 
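  // The C side reaches this stub through the CallStub function pointer declared
  // in stubRoutines.hpp; roughly the following signature (parameter names here
  // are illustrative and may differ slightly from the actual declaration):
  //
  //   typedef void (*CallStub)(address   link,               // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
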
 155   // Call stub stack layout word offsets from rbp
 156   enum call_stub_layout {
 157 #ifdef _WIN64
 158     xmm_save_first     = 6,  // save from xmm6
 159     xmm_save_last      = 31, // to xmm31
 160     xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 162     r15_off            = -7,
 163     r14_off            = -6,
 164     r13_off            = -5,
 165     r12_off            = -4,
 166     rdi_off            = -3,
 167     rsi_off            = -2,
 168     rbx_off            = -1,
 169     rbp_off            =  0,
 170     retaddr_off        =  1,
 171     call_wrapper_off   =  2,
 172     result_off         =  3,
 173     result_type_off    =  4,
 174     method_off         =  5,
 175     entry_point_off    =  6,
 176     parameters_off     =  7,
 177     parameter_size_off =  8,
 178     thread_off         =  9
 179 #else
 180     rsp_after_call_off = -12,
 181     mxcsr_off          = rsp_after_call_off,
 182     r15_off            = -11,
 183     r14_off            = -10,
 184     r13_off            = -9,
 185     r12_off            = -8,
 186     rbx_off            = -7,
 187     call_wrapper_off   = -6,
 188     result_off         = -5,
 189     result_type_off    = -4,
 190     method_off         = -3,
 191     entry_point_off    = -2,
 192     parameters_off     = -1,
 193     rbp_off            =  0,
 194     retaddr_off        =  1,
 195     parameter_size_off =  2,
 196     thread_off         =  3
 197 #endif
 198   };
 199 
 200 #ifdef _WIN64
 201   Address xmm_save(int reg) {
 202     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 203     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 204   }
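  // For example, with wordSize == 8 the mapping above places
  //   xmm_save(6)  at rbp -  9 * 8 = rbp -  72   (xmm_save_base)
  //   xmm_save(7)  at rbp - 11 * 8 = rbp -  88
  //   xmm_save(31) at rbp - 59 * 8 = rbp - 472   (rsp_after_call_off)
  // i.e. each XMM register gets two word-sized save slots, matching the
  // stack layout comment above.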
 205 #endif
 206 
 207   address generate_call_stub(address& return_address) {
 208     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 209            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 210            "adjust this code");
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     // same as in generate_catch_exception()!
 215     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 216 
 217     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 218     const Address result        (rbp, result_off         * wordSize);
 219     const Address result_type   (rbp, result_type_off    * wordSize);
 220     const Address method        (rbp, method_off         * wordSize);
 221     const Address entry_point   (rbp, entry_point_off    * wordSize);
 222     const Address parameters    (rbp, parameters_off     * wordSize);
 223     const Address parameter_size(rbp, parameter_size_off * wordSize);
 224 
 225     // same as in generate_catch_exception()!
 226     const Address thread        (rbp, thread_off         * wordSize);
 227 
 228     const Address r15_save(rbp, r15_off * wordSize);
 229     const Address r14_save(rbp, r14_off * wordSize);
 230     const Address r13_save(rbp, r13_off * wordSize);
 231     const Address r12_save(rbp, r12_off * wordSize);
 232     const Address rbx_save(rbp, rbx_off * wordSize);
 233 
 234     // stub code
 235     __ enter();
 236     __ subptr(rsp, -rsp_after_call_off * wordSize);
 237 
 238     // save register parameters
 239 #ifndef _WIN64
 240     __ movptr(parameters,   c_rarg5); // parameters
 241     __ movptr(entry_point,  c_rarg4); // entry_point
 242 #endif
 243 
 244     __ movptr(method,       c_rarg3); // method
 245     __ movl(result_type,  c_rarg2);   // result type
 246     __ movptr(result,       c_rarg1); // result
 247     __ movptr(call_wrapper, c_rarg0); // call wrapper
 248 
 249     // save regs belonging to calling function
 250     __ movptr(rbx_save, rbx);
 251     __ movptr(r12_save, r12);
 252     __ movptr(r13_save, r13);
 253     __ movptr(r14_save, r14);
 254     __ movptr(r15_save, r15);
 255     if (UseAVX > 2) {
 256       __ movl(rbx, 0xffff);
 257       __ kmovwl(k1, rbx);
 258     }
 259 #ifdef _WIN64
 260     int last_reg = 15;
 261     if (UseAVX > 2) {
 262       last_reg = 31;
 263     }
 264     if (VM_Version::supports_evex()) {
 265       for (int i = xmm_save_first; i <= last_reg; i++) {
 266         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 267       }
 268     } else {
 269       for (int i = xmm_save_first; i <= last_reg; i++) {
 270         __ movdqu(xmm_save(i), as_XMMRegister(i));
 271       }
 272     }
 273 
 274     const Address rdi_save(rbp, rdi_off * wordSize);
 275     const Address rsi_save(rbp, rsi_off * wordSize);
 276 
 277     __ movptr(rsi_save, rsi);
 278     __ movptr(rdi_save, rdi);
 279 #else
 280     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 281     {
 282       Label skip_ldmx;
 283       __ stmxcsr(mxcsr_save);
 284       __ movl(rax, mxcsr_save);
 285       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 286       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 287       __ cmp32(rax, mxcsr_std);
 288       __ jcc(Assembler::equal, skip_ldmx);
 289       __ ldmxcsr(mxcsr_std);
 290       __ bind(skip_ldmx);
 291     }
 292 #endif
 293 
 294     // Load up thread register
 295     __ movptr(r15_thread, thread);
 296     __ reinit_heapbase();
 297 
 298 #ifdef ASSERT
 299     // make sure we have no pending exceptions
 300     {
 301       Label L;
 302       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 303       __ jcc(Assembler::equal, L);
 304       __ stop("StubRoutines::call_stub: entered with pending exception");
 305       __ bind(L);
 306     }
 307 #endif
 308 
 309     // pass parameters if any
 310     BLOCK_COMMENT("pass parameters if any");
 311     Label parameters_done;
 312     __ movl(c_rarg3, parameter_size);
 313     __ testl(c_rarg3, c_rarg3);
 314     __ jcc(Assembler::zero, parameters_done);
 315 
 316     Label loop;
 317     __ movptr(c_rarg2, parameters);       // parameter pointer
 318     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 319     __ BIND(loop);
 320     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 321     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 322     __ decrementl(c_rarg1);             // decrement counter
 323     __ push(rax);                       // pass parameter
 324     __ jcc(Assembler::notZero, loop);
 325 
 326     // call Java function
 327     __ BIND(parameters_done);
 328     __ movptr(rbx, method);             // get Method*
 329     __ movptr(c_rarg1, entry_point);    // get entry_point
 330     __ mov(r13, rsp);                   // set sender sp
 331     BLOCK_COMMENT("call Java function");
 332     __ call(c_rarg1);
 333 
 334     BLOCK_COMMENT("call_stub_return_address:");
 335     return_address = __ pc();
 336 
 337     // store result depending on type (everything that is not
 338     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 339     __ movptr(c_rarg0, result);
 340     Label is_long, is_float, is_double, exit;
 341     __ movl(c_rarg1, result_type);
 342     __ cmpl(c_rarg1, T_OBJECT);
 343     __ jcc(Assembler::equal, is_long);
 344     __ cmpl(c_rarg1, T_LONG);
 345     __ jcc(Assembler::equal, is_long);
 346     __ cmpl(c_rarg1, T_FLOAT);
 347     __ jcc(Assembler::equal, is_float);
 348     __ cmpl(c_rarg1, T_DOUBLE);
 349     __ jcc(Assembler::equal, is_double);
 350 
 351     // handle T_INT case
 352     __ movl(Address(c_rarg0, 0), rax);
 353 
 354     __ BIND(exit);
 355 
 356     // pop parameters
 357     __ lea(rsp, rsp_after_call);
 358 
 359 #ifdef ASSERT
 360     // verify that threads correspond
 361     {
      Label L1, L2, L3;
 363       __ cmpptr(r15_thread, thread);
 364       __ jcc(Assembler::equal, L1);
 365       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 366       __ bind(L1);
 367       __ get_thread(rbx);
 368       __ cmpptr(r15_thread, thread);
 369       __ jcc(Assembler::equal, L2);
 370       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 371       __ bind(L2);
 372       __ cmpptr(r15_thread, rbx);
 373       __ jcc(Assembler::equal, L3);
 374       __ stop("StubRoutines::call_stub: threads must correspond");
 375       __ bind(L3);
 376     }
 377 #endif
 378 
 379     // restore regs belonging to calling function
 380 #ifdef _WIN64
 381     // emit the restores for xmm regs
 382     if (VM_Version::supports_evex()) {
 383       for (int i = xmm_save_first; i <= last_reg; i++) {
 384         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 385       }
 386     } else {
 387       for (int i = xmm_save_first; i <= last_reg; i++) {
 388         __ movdqu(as_XMMRegister(i), xmm_save(i));
 389       }
 390     }
 391 #endif
 392     __ movptr(r15, r15_save);
 393     __ movptr(r14, r14_save);
 394     __ movptr(r13, r13_save);
 395     __ movptr(r12, r12_save);
 396     __ movptr(rbx, rbx_save);
 397 
 398 #ifdef _WIN64
 399     __ movptr(rdi, rdi_save);
 400     __ movptr(rsi, rsi_save);
 401 #else
 402     __ ldmxcsr(mxcsr_save);
 403 #endif
 404 
 405     // restore rsp
 406     __ addptr(rsp, -rsp_after_call_off * wordSize);
 407 
 408     // return
 409     __ vzeroupper();
 410     __ pop(rbp);
 411     __ ret(0);
 412 
 413     // handle return types different from T_INT
 414     __ BIND(is_long);
 415     __ movq(Address(c_rarg0, 0), rax);
 416     __ jmp(exit);
 417 
 418     __ BIND(is_float);
 419     __ movflt(Address(c_rarg0, 0), xmm0);
 420     __ jmp(exit);
 421 
 422     __ BIND(is_double);
 423     __ movdbl(Address(c_rarg0, 0), xmm0);
 424     __ jmp(exit);
 425 
 426     return start;
 427   }
 428 
 429   // Return point for a Java call if there's an exception thrown in
 430   // Java code.  The exception is caught and transformed into a
 431   // pending exception stored in JavaThread that can be tested from
 432   // within the VM.
 433   //
 434   // Note: Usually the parameters are removed by the callee. In case
 435   // of an exception crossing an activation frame boundary, that is
 436   // not the case if the callee is compiled code => need to setup the
 437   // rsp.
 438   //
 439   // rax: exception oop
 440 
 441   address generate_catch_exception() {
 442     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 443     address start = __ pc();
 444 
 445     // same as in generate_call_stub():
 446     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 447     const Address thread        (rbp, thread_off         * wordSize);
 448 
 449 #ifdef ASSERT
 450     // verify that threads correspond
 451     {
 452       Label L1, L2, L3;
 453       __ cmpptr(r15_thread, thread);
 454       __ jcc(Assembler::equal, L1);
 455       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 456       __ bind(L1);
 457       __ get_thread(rbx);
 458       __ cmpptr(r15_thread, thread);
 459       __ jcc(Assembler::equal, L2);
 460       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 461       __ bind(L2);
 462       __ cmpptr(r15_thread, rbx);
 463       __ jcc(Assembler::equal, L3);
 464       __ stop("StubRoutines::catch_exception: threads must correspond");
 465       __ bind(L3);
 466     }
 467 #endif
 468 
 469     // set pending exception
 470     __ verify_oop(rax);
 471 
 472     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 473     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 474     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 475     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 476 
 477     // complete return to VM
 478     assert(StubRoutines::_call_stub_return_address != NULL,
 479            "_call_stub_return_address must have been generated before");
 480     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 481 
 482     return start;
 483   }
 484 
 485   // Continuation point for runtime calls returning with a pending
 486   // exception.  The pending exception check happened in the runtime
 487   // or native call stub.  The pending exception in Thread is
 488   // converted into a Java-level exception.
 489   //
 490   // Contract with Java-level exception handlers:
 491   // rax: exception
 492   // rdx: throwing pc
 493   //
 494   // NOTE: At entry of this stub, exception-pc must be on stack !!
 495 
 496   address generate_forward_exception() {
 497     StubCodeMark mark(this, "StubRoutines", "forward exception");
 498     address start = __ pc();
 499 
 500     // Upon entry, the sp points to the return address returning into
 501     // Java (interpreted or compiled) code; i.e., the return address
 502     // becomes the throwing pc.
 503     //
 504     // Arguments pushed before the runtime call are still on the stack
 505     // but the exception handler will reset the stack pointer ->
 506     // ignore them.  A potential result in registers can be ignored as
 507     // well.
 508 
 509 #ifdef ASSERT
 510     // make sure this code is only executed if there is a pending exception
 511     {
 512       Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 514       __ jcc(Assembler::notEqual, L);
 515       __ stop("StubRoutines::forward exception: no pending exception (1)");
 516       __ bind(L);
 517     }
 518 #endif
 519 
 520     // compute exception handler into rbx
 521     __ movptr(c_rarg0, Address(rsp, 0));
 522     BLOCK_COMMENT("call exception_handler_for_return_address");
 523     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 524                          SharedRuntime::exception_handler_for_return_address),
 525                     r15_thread, c_rarg0);
 526     __ mov(rbx, rax);
 527 
 528     // setup rax & rdx, remove return address & clear pending exception
 529     __ pop(rdx);
 530     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 531     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ testptr(rax, rax);
 538       __ jcc(Assembler::notEqual, L);
 539       __ stop("StubRoutines::forward exception: no pending exception (2)");
 540       __ bind(L);
 541     }
 542 #endif
 543 
 544     // continue at exception handler (return address removed)
 545     // rax: exception
 546     // rbx: exception handler
 547     // rdx: throwing pc
 548     __ verify_oop(rax);
 549     __ jmp(rbx);
 550 
 551     return start;
 552   }
 553 
 554   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 555   //
 556   // Arguments :
 557   //    c_rarg0: exchange_value
  //    c_rarg1: dest
 559   //
 560   // Result:
 561   //    *dest <- ex, return (orig *dest)
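  //
  // Performed atomically by the xchg instruction; roughly the following
  // (illustrative C, not part of the build):
  //
  //    jint old = *dest;
  //    *dest    = exchange_value;
  //    return old;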
 562   address generate_atomic_xchg() {
 563     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 564     address start = __ pc();
 565 
    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyway
 567     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 568     __ ret(0);
 569 
 570     return start;
 571   }
 572 
 573   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 574   //
 575   // Arguments :
 576   //    c_rarg0: exchange_value
 577   //    c_rarg1: dest
 578   //
 579   // Result:
 580   //    *dest <- ex, return (orig *dest)
 581   address generate_atomic_xchg_long() {
 582     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 583     address start = __ pc();
 584 
    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyway
 586     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 587     __ ret(0);
 588 
 589     return start;
 590   }
 591 
 592   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 593   //                                         jint compare_value)
 594   //
 595   // Arguments :
 596   //    c_rarg0: exchange_value
 597   //    c_rarg1: dest
 598   //    c_rarg2: compare_value
 599   //
 600   // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
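  //
  // Note: lock cmpxchg compares rax with *dest, which is why compare_value is
  // loaded into rax first; on failure the instruction leaves the current value
  // of *dest in rax, which is exactly the required return value.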
 606   address generate_atomic_cmpxchg() {
 607     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 608     address start = __ pc();
 609 
 610     __ movl(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 612     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 613     __ ret(0);
 614 
 615     return start;
 616   }
 617 
 618   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 619   //                                           int8_t compare_value)
 620   //
 621   // Arguments :
 622   //    c_rarg0: exchange_value
 623   //    c_rarg1: dest
 624   //    c_rarg2: compare_value
 625   //
 626   // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
 632   address generate_atomic_cmpxchg_byte() {
 633     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 634     address start = __ pc();
 635 
 636     __ movsbq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 638     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 639     __ ret(0);
 640 
 641     return start;
 642   }
 643 
 644   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 645   //                                            volatile int64_t* dest,
 646   //                                            int64_t compare_value)
 647   // Arguments :
 648   //    c_rarg0: exchange_value
 649   //    c_rarg1: dest
 650   //    c_rarg2: compare_value
 651   //
 652   // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
 658   address generate_atomic_cmpxchg_long() {
 659     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 660     address start = __ pc();
 661 
 662     __ movq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 664     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 665     __ ret(0);
 666 
 667     return start;
 668   }
 669 
 670   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 671   //
 672   // Arguments :
 673   //    c_rarg0: add_value
 674   //    c_rarg1: dest
 675   //
 676   // Result:
 677   //    *dest += add_value
 678   //    return *dest;
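  //
  // Roughly (illustrative only): lock xadd leaves the previous value of *dest
  // in rax, so add_value is added once more to produce the new value:
  //
  //    jint old = *dest;          // xaddl also stores old + add_value into *dest
  //    return old + add_value;    // addl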
 679   address generate_atomic_add() {
 680     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 681     address start = __ pc();
 682 
 683     __ movl(rax, c_rarg0);
    if (os::is_MP()) __ lock();
 685     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 686     __ addl(rax, c_rarg0);
 687     __ ret(0);
 688 
 689     return start;
 690   }
 691 
 692   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 693   //
 694   // Arguments :
 695   //    c_rarg0: add_value
 696   //    c_rarg1: dest
 697   //
 698   // Result:
 699   //    *dest += add_value
 700   //    return *dest;
 701   address generate_atomic_add_long() {
 702     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 703     address start = __ pc();
 704 
    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyway
    if (os::is_MP()) __ lock();
 707     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 708     __ addptr(rax, c_rarg0);
 709     __ ret(0);
 710 
 711     return start;
 712   }
 713 
 714   // Support for intptr_t OrderAccess::fence()
 715   //
 716   // Arguments :
 717   //
 718   // Result:
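  //
  // On x86-64 a StoreLoad barrier is typically emitted as a locked add of zero
  // to a word on the stack (or an mfence); see MacroAssembler::membar.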
 719   address generate_orderaccess_fence() {
 720     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 721     address start = __ pc();
 722     __ membar(Assembler::StoreLoad);
 723     __ ret(0);
 724 
 725     return start;
 726   }
 727 
 728   // Support for intptr_t get_previous_fp()
 729   //
 730   // This routine is used to find the previous frame pointer for the
 731   // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
 734   address generate_get_previous_fp() {
 735     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 736     const Address old_fp(rbp, 0);
 737     const Address older_fp(rax, 0);
 738     address start = __ pc();
 739 
 740     __ enter();
    __ movptr(rax, old_fp); // caller's fp
 742     __ movptr(rax, older_fp); // the frame for ps()
 743     __ pop(rbp);
 744     __ ret(0);
 745 
 746     return start;
 747   }
 748 
 749   // Support for intptr_t get_previous_sp()
 750   //
 751   // This routine is used to find the previous stack pointer for the
 752   // caller.
 753   address generate_get_previous_sp() {
 754     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 755     address start = __ pc();
 756 
 757     __ movptr(rax, rsp);
 758     __ addptr(rax, 8); // return address is at the top of the stack.
 759     __ ret(0);
 760 
 761     return start;
 762   }
 763 
 764   //----------------------------------------------------------------------------------------------------
 765   // Support for void verify_mxcsr()
 766   //
 767   // This routine is used with -Xcheck:jni to verify that native
 768   // JNI code does not return to Java code without restoring the
 769   // MXCSR register to our expected state.
 770 
 771   address generate_verify_mxcsr() {
 772     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 773     address start = __ pc();
 774 
 775     const Address mxcsr_save(rsp, 0);
 776 
 777     if (CheckJNICalls) {
 778       Label ok_ret;
 779       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 780       __ push(rax);
 781       __ subptr(rsp, wordSize);      // allocate a temp location
 782       __ stmxcsr(mxcsr_save);
 783       __ movl(rax, mxcsr_save);
 784       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 785       __ cmp32(rax, mxcsr_std);
 786       __ jcc(Assembler::equal, ok_ret);
 787 
 788       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 789 
 790       __ ldmxcsr(mxcsr_std);
 791 
 792       __ bind(ok_ret);
 793       __ addptr(rsp, wordSize);
 794       __ pop(rax);
 795     }
 796 
 797     __ ret(0);
 798 
 799     return start;
 800   }
 801 
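  // The f2i/f2l/d2i/d2l fixup stubs below patch up the result of a hardware
  // float-to-integer conversion (cvttss2si / cvttsd2si) when it produced the
  // "integer indefinite" value, restoring Java semantics.  Roughly, for the
  // f2i case (illustrative C only, not part of the build):
  //
  //    jint f2i_fixup(float x) {
  //      if (isnan(x)) return 0;                      // NaN       -> 0
  //      return (x > 0.0f) ? max_jint : min_jint;     // overflow  -> saturate
  //    }
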
 802   address generate_f2i_fixup() {
 803     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 804     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 805 
 806     address start = __ pc();
 807 
 808     Label L;
 809 
 810     __ push(rax);
 811     __ push(c_rarg3);
 812     __ push(c_rarg2);
 813     __ push(c_rarg1);
 814 
 815     __ movl(rax, 0x7f800000);
 816     __ xorl(c_rarg3, c_rarg3);
 817     __ movl(c_rarg2, inout);
 818     __ movl(c_rarg1, c_rarg2);
 819     __ andl(c_rarg1, 0x7fffffff);
 820     __ cmpl(rax, c_rarg1); // NaN? -> 0
 821     __ jcc(Assembler::negative, L);
 822     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 823     __ movl(c_rarg3, 0x80000000);
 824     __ movl(rax, 0x7fffffff);
 825     __ cmovl(Assembler::positive, c_rarg3, rax);
 826 
 827     __ bind(L);
 828     __ movptr(inout, c_rarg3);
 829 
 830     __ pop(c_rarg1);
 831     __ pop(c_rarg2);
 832     __ pop(c_rarg3);
 833     __ pop(rax);
 834 
 835     __ ret(0);
 836 
 837     return start;
 838   }
 839 
 840   address generate_f2l_fixup() {
 841     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 842     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 843     address start = __ pc();
 844 
 845     Label L;
 846 
 847     __ push(rax);
 848     __ push(c_rarg3);
 849     __ push(c_rarg2);
 850     __ push(c_rarg1);
 851 
 852     __ movl(rax, 0x7f800000);
 853     __ xorl(c_rarg3, c_rarg3);
 854     __ movl(c_rarg2, inout);
 855     __ movl(c_rarg1, c_rarg2);
 856     __ andl(c_rarg1, 0x7fffffff);
 857     __ cmpl(rax, c_rarg1); // NaN? -> 0
 858     __ jcc(Assembler::negative, L);
 859     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 860     __ mov64(c_rarg3, 0x8000000000000000);
 861     __ mov64(rax, 0x7fffffffffffffff);
 862     __ cmov(Assembler::positive, c_rarg3, rax);
 863 
 864     __ bind(L);
 865     __ movptr(inout, c_rarg3);
 866 
 867     __ pop(c_rarg1);
 868     __ pop(c_rarg2);
 869     __ pop(c_rarg3);
 870     __ pop(rax);
 871 
 872     __ ret(0);
 873 
 874     return start;
 875   }
 876 
 877   address generate_d2i_fixup() {
 878     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 879     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 880 
 881     address start = __ pc();
 882 
 883     Label L;
 884 
 885     __ push(rax);
 886     __ push(c_rarg3);
 887     __ push(c_rarg2);
 888     __ push(c_rarg1);
 889     __ push(c_rarg0);
 890 
 891     __ movl(rax, 0x7ff00000);
 892     __ movq(c_rarg2, inout);
 893     __ movl(c_rarg3, c_rarg2);
 894     __ mov(c_rarg1, c_rarg2);
 895     __ mov(c_rarg0, c_rarg2);
 896     __ negl(c_rarg3);
 897     __ shrptr(c_rarg1, 0x20);
 898     __ orl(c_rarg3, c_rarg2);
 899     __ andl(c_rarg1, 0x7fffffff);
 900     __ xorl(c_rarg2, c_rarg2);
 901     __ shrl(c_rarg3, 0x1f);
 902     __ orl(c_rarg1, c_rarg3);
 903     __ cmpl(rax, c_rarg1);
 904     __ jcc(Assembler::negative, L); // NaN -> 0
 905     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 906     __ movl(c_rarg2, 0x80000000);
 907     __ movl(rax, 0x7fffffff);
 908     __ cmov(Assembler::positive, c_rarg2, rax);
 909 
 910     __ bind(L);
 911     __ movptr(inout, c_rarg2);
 912 
 913     __ pop(c_rarg0);
 914     __ pop(c_rarg1);
 915     __ pop(c_rarg2);
 916     __ pop(c_rarg3);
 917     __ pop(rax);
 918 
 919     __ ret(0);
 920 
 921     return start;
 922   }
 923 
 924   address generate_d2l_fixup() {
 925     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 926     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 927 
 928     address start = __ pc();
 929 
 930     Label L;
 931 
 932     __ push(rax);
 933     __ push(c_rarg3);
 934     __ push(c_rarg2);
 935     __ push(c_rarg1);
 936     __ push(c_rarg0);
 937 
 938     __ movl(rax, 0x7ff00000);
 939     __ movq(c_rarg2, inout);
 940     __ movl(c_rarg3, c_rarg2);
 941     __ mov(c_rarg1, c_rarg2);
 942     __ mov(c_rarg0, c_rarg2);
 943     __ negl(c_rarg3);
 944     __ shrptr(c_rarg1, 0x20);
 945     __ orl(c_rarg3, c_rarg2);
 946     __ andl(c_rarg1, 0x7fffffff);
 947     __ xorl(c_rarg2, c_rarg2);
 948     __ shrl(c_rarg3, 0x1f);
 949     __ orl(c_rarg1, c_rarg3);
 950     __ cmpl(rax, c_rarg1);
 951     __ jcc(Assembler::negative, L); // NaN -> 0
 952     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 953     __ mov64(c_rarg2, 0x8000000000000000);
 954     __ mov64(rax, 0x7fffffffffffffff);
 955     __ cmovq(Assembler::positive, c_rarg2, rax);
 956 
 957     __ bind(L);
 958     __ movq(inout, c_rarg2);
 959 
 960     __ pop(c_rarg0);
 961     __ pop(c_rarg1);
 962     __ pop(c_rarg2);
 963     __ pop(c_rarg3);
 964     __ pop(rax);
 965 
 966     __ ret(0);
 967 
 968     return start;
 969   }
 970 
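  // Emits a 16-byte constant (the given 64-bit mask twice) that SSE code can use
  // as a memory operand; typically these are the float/double sign-mask and
  // sign-flip constants, e.g. and-ing with the sign mask implements abs and
  // xor-ing with the sign flip negates.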
 971   address generate_fp_mask(const char *stub_name, int64_t mask) {
 972     __ align(CodeEntryAlignment);
 973     StubCodeMark mark(this, "StubRoutines", stub_name);
 974     address start = __ pc();
 975 
 976     __ emit_data64( mask, relocInfo::none );
 977     __ emit_data64( mask, relocInfo::none );
 978 
 979     return start;
 980   }
 981 
 982   // Non-destructive plausibility checks for oops
 983   //
 984   // Arguments:
 985   //    all args on stack!
 986   //
 987   // Stack after saving c_rarg3:
 988   //    [tos + 0]: saved c_rarg3
 989   //    [tos + 1]: saved c_rarg2
 990   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 991   //    [tos + 3]: saved flags
 992   //    [tos + 4]: return address
 993   //  * [tos + 5]: error message (char*)
 994   //  * [tos + 6]: object to verify (oop)
 995   //  * [tos + 7]: saved rax - saved by caller and bashed
 996   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 997   //  * = popped on exit
 998   address generate_verify_oop() {
 999     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1000     address start = __ pc();
1001 
1002     Label exit, error;
1003 
1004     __ pushf();
1005     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1006 
1007     __ push(r12);
1008 
1009     // save c_rarg2 and c_rarg3
1010     __ push(c_rarg2);
1011     __ push(c_rarg3);
1012 
1013     enum {
1014            // After previous pushes.
1015            oop_to_verify = 6 * wordSize,
1016            saved_rax     = 7 * wordSize,
1017            saved_r10     = 8 * wordSize,
1018 
1019            // Before the call to MacroAssembler::debug(), see below.
1020            return_addr   = 16 * wordSize,
1021            error_msg     = 17 * wordSize
1022     };
1023 
1024     // get object
1025     __ movptr(rax, Address(rsp, oop_to_verify));
1026 
1027     // make sure object is 'reasonable'
1028     __ testptr(rax, rax);
1029     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1030     // Check if the oop is in the right area of memory
1031     __ movptr(c_rarg2, rax);
1032     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1033     __ andptr(c_rarg2, c_rarg3);
1034     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1035     __ cmpptr(c_rarg2, c_rarg3);
1036     __ jcc(Assembler::notZero, error);
1037 
1038     // set r12 to heapbase for load_klass()
1039     __ reinit_heapbase();
1040 
    // make sure klass is 'reasonable' (i.e., not zero).
1042     __ load_klass(rax, rax);  // get klass
1043     __ testptr(rax, rax);
1044     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1045 
1046     // return if everything seems ok
1047     __ bind(exit);
1048     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1049     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1050     __ pop(c_rarg3);                             // restore c_rarg3
1051     __ pop(c_rarg2);                             // restore c_rarg2
1052     __ pop(r12);                                 // restore r12
1053     __ popf();                                   // restore flags
1054     __ ret(4 * wordSize);                        // pop caller saved stuff
1055 
1056     // handle errors
1057     __ bind(error);
1058     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1059     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1060     __ pop(c_rarg3);                             // get saved c_rarg3 back
1061     __ pop(c_rarg2);                             // get saved c_rarg2 back
1062     __ pop(r12);                                 // get saved r12 back
1063     __ popf();                                   // get saved flags off stack --
1064                                                  // will be ignored
1065 
1066     __ pusha();                                  // push registers
                                                 // (rip is
                                                 // already pushed)
1069     // debug(char* msg, int64_t pc, int64_t regs[])
1070     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1071     // pushed all the registers, so now the stack looks like:
1072     //     [tos +  0] 16 saved registers
1073     //     [tos + 16] return address
1074     //   * [tos + 17] error message (char*)
1075     //   * [tos + 18] object to verify (oop)
1076     //   * [tos + 19] saved rax - saved by caller and bashed
1077     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1078     //   * = popped on exit
1079 
1080     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1081     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1082     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1083     __ mov(r12, rsp);                               // remember rsp
1084     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1085     __ andptr(rsp, -16);                            // align stack as required by ABI
1086     BLOCK_COMMENT("call MacroAssembler::debug");
1087     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1088     __ mov(rsp, r12);                               // restore rsp
1089     __ popa();                                      // pop registers (includes r12)
1090     __ ret(4 * wordSize);                           // pop caller saved stuff
1091 
1092     return start;
1093   }
1094 
1095   //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
1101   //    Rtmp  -  scratch
1102   //
1103   void assert_clean_int(Register Rint, Register Rtmp) {
1104 #ifdef ASSERT
1105     Label L;
1106     assert_different_registers(Rtmp, Rint);
1107     __ movslq(Rtmp, Rint);
1108     __ cmpq(Rtmp, Rint);
1109     __ jcc(Assembler::equal, L);
1110     __ stop("high 32-bits of int value are not 0");
1111     __ bind(L);
1112 #endif
1113   }
1114 
1115   //  Generate overlap test for array copy stubs
1116   //
1117   //  Input:
1118   //     c_rarg0 - from
1119   //     c_rarg1 - to
1120   //     c_rarg2 - element count
1121   //
1122   //  Output:
  //     rax   - &from[element count] (first address after the source array)
1124   //
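  //  The test amounts to the following (illustrative, with elem_size == (1 << sf)):
  //
  //    if (to <= from || to >= from + count * elem_size) {
  //      // a forward copy cannot clobber unread source data -> use the disjoint stub
  //    }
  //
  //  Both comparisons are unsigned.
  //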
1125   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1126     assert(no_overlap_target != NULL, "must be generated");
1127     array_overlap_test(no_overlap_target, NULL, sf);
1128   }
1129   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1130     array_overlap_test(NULL, &L_no_overlap, sf);
1131   }
1132   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1133     const Register from     = c_rarg0;
1134     const Register to       = c_rarg1;
1135     const Register count    = c_rarg2;
1136     const Register end_from = rax;
1137 
1138     __ cmpptr(to, from);
1139     __ lea(end_from, Address(from, count, sf, 0));
1140     if (NOLp == NULL) {
1141       ExternalAddress no_overlap(no_overlap_target);
1142       __ jump_cc(Assembler::belowEqual, no_overlap);
1143       __ cmpptr(to, end_from);
1144       __ jump_cc(Assembler::aboveEqual, no_overlap);
1145     } else {
1146       __ jcc(Assembler::belowEqual, (*NOLp));
1147       __ cmpptr(to, end_from);
1148       __ jcc(Assembler::aboveEqual, (*NOLp));
1149     }
1150   }
1151 
1152   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1153   //
1154   // Outputs:
1155   //    rdi - rcx
1156   //    rsi - rdx
1157   //    rdx - r8
1158   //    rcx - r9
1159   //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, where the
  // latter are non-volatile; r9 and r10 should therefore not be used by the caller.
1162   //
1163   void setup_arg_regs(int nargs = 3) {
1164     const Register saved_rdi = r9;
1165     const Register saved_rsi = r10;
1166     assert(nargs == 3 || nargs == 4, "else fix");
1167 #ifdef _WIN64
1168     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1169            "unexpected argument registers");
1170     if (nargs >= 4)
1171       __ mov(rax, r9);  // r9 is also saved_rdi
1172     __ movptr(saved_rdi, rdi);
1173     __ movptr(saved_rsi, rsi);
1174     __ mov(rdi, rcx); // c_rarg0
1175     __ mov(rsi, rdx); // c_rarg1
1176     __ mov(rdx, r8);  // c_rarg2
1177     if (nargs >= 4)
1178       __ mov(rcx, rax); // c_rarg3 (via rax)
1179 #else
1180     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1181            "unexpected argument registers");
1182 #endif
1183   }
1184 
1185   void restore_arg_regs() {
1186     const Register saved_rdi = r9;
1187     const Register saved_rsi = r10;
1188 #ifdef _WIN64
1189     __ movptr(rdi, saved_rdi);
1190     __ movptr(rsi, saved_rsi);
1191 #endif
1192   }
1193 
1194 
1195   // Copy big chunks forward
1196   //
1197   // Inputs:
  //   end_from     - source array end address
  //   end_to       - destination array end address
  //   qword_count  - 64-bit element count, negative
1201   //   to           - scratch
1202   //   L_copy_bytes - entry label
1203   //   L_copy_8_bytes  - exit  label
1204   //
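  // The callers pass end pointers plus a negative qword_count, so the loop steps
  // the count up towards zero while Address(end_from, qword_count, times_8, disp)
  // walks forward through the data.  Roughly, for the 64-byte wide path
  // (illustrative only):
  //
  //    while ((qword_count += 8) <= 0) {
  //      // copy 64 bytes at [end_from + qword_count*8 - 56, end_from + qword_count*8 + 8)
  //    }
  //    // then copy a trailing 32-byte block and any remaining qwords
  //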
1205   void copy_bytes_forward(Register end_from, Register end_to,
1206                              Register qword_count, Register to,
1207                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1208     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1209     Label L_loop;
1210     __ align(OptoLoopAlignment);
1211     if (UseUnalignedLoadStores) {
1212       Label L_end;
1213       if (UseAVX > 2) {
1214         __ movl(to, 0xffff);
1215         __ kmovwl(k1, to);
1216       }
      // Copy 64 bytes per iteration
1218       __ BIND(L_loop);
1219       if (UseAVX > 2) {
1220         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1221         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1222       } else if (UseAVX == 2) {
1223         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1224         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1225         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1226         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1227       } else {
1228         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1229         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1230         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1231         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1232         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1233         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1234         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1235         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1236       }
1237       __ BIND(L_copy_bytes);
1238       __ addptr(qword_count, 8);
1239       __ jcc(Assembler::lessEqual, L_loop);
1240       __ subptr(qword_count, 4);  // sub(8) and add(4)
1241       __ jccb(Assembler::greater, L_end);
1242       // Copy trailing 32 bytes
1243       if (UseAVX >= 2) {
1244         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1245         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1246       } else {
1247         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1248         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1249         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1250         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1251       }
1252       __ addptr(qword_count, 4);
1253       __ BIND(L_end);
1254       if (UseAVX >= 2) {
1255         // clean upper bits of YMM registers
1256         __ vpxor(xmm0, xmm0);
1257         __ vpxor(xmm1, xmm1);
1258       }
1259     } else {
      // Copy 32 bytes per iteration
1261       __ BIND(L_loop);
1262       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1263       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1264       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1265       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1266       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1267       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1268       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1269       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1270 
1271       __ BIND(L_copy_bytes);
1272       __ addptr(qword_count, 4);
1273       __ jcc(Assembler::lessEqual, L_loop);
1274     }
1275     __ subptr(qword_count, 4);
1276     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1277   }
1278 
1279   // Copy big chunks backward
1280   //
1281   // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
1285   //   to           - scratch
1286   //   L_copy_bytes - entry label
1287   //   L_copy_8_bytes  - exit  label
1288   //
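  // Mirror image of copy_bytes_forward: qword_count is positive here and is
  // stepped down towards zero, so Address(from, qword_count, times_8, disp)
  // walks backwards from the end of the arrays towards their start.
  //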
1289   void copy_bytes_backward(Register from, Register dest,
1290                               Register qword_count, Register to,
1291                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1292     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1293     Label L_loop;
1294     __ align(OptoLoopAlignment);
1295     if (UseUnalignedLoadStores) {
1296       Label L_end;
1297       if (UseAVX > 2) {
1298         __ movl(to, 0xffff);
1299         __ kmovwl(k1, to);
1300       }
      // Copy 64 bytes per iteration
1302       __ BIND(L_loop);
1303       if (UseAVX > 2) {
1304         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1305         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1306       } else if (UseAVX == 2) {
1307         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1308         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1309         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1310         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1311       } else {
1312         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1313         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1314         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1315         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1316         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1317         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1318         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1319         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1320       }
1321       __ BIND(L_copy_bytes);
1322       __ subptr(qword_count, 8);
1323       __ jcc(Assembler::greaterEqual, L_loop);
1324 
1325       __ addptr(qword_count, 4);  // add(8) and sub(4)
1326       __ jccb(Assembler::less, L_end);
1327       // Copy trailing 32 bytes
1328       if (UseAVX >= 2) {
1329         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1330         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1331       } else {
1332         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1333         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1334         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1335         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1336       }
1337       __ subptr(qword_count, 4);
1338       __ BIND(L_end);
1339       if (UseAVX >= 2) {
1340         // clean upper bits of YMM registers
1341         __ vpxor(xmm0, xmm0);
1342         __ vpxor(xmm1, xmm1);
1343       }
1344     } else {
      // Copy 32 bytes per iteration
1346       __ BIND(L_loop);
1347       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1348       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1349       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1350       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1351       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1352       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1353       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1354       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1355 
1356       __ BIND(L_copy_bytes);
1357       __ subptr(qword_count, 4);
1358       __ jcc(Assembler::greaterEqual, L_loop);
1359     }
1360     __ addptr(qword_count, 4);
1361     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1362   }
1363 
1364 
1365   // Arguments:
1366   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1367   //             ignored
1368   //   name    - stub name string
1369   //
1370   // Inputs:
1371   //   c_rarg0   - source array address
1372   //   c_rarg1   - destination array address
1373   //   c_rarg2   - element count, treated as ssize_t, can be zero
1374   //
1375   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1376   // we let the hardware handle it.  The one to eight bytes within words,
1377   // dwords or qwords that span cache line boundaries will still be loaded
1378   // and stored atomically.
1379   //
1380   // Side Effects:
1381   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1382   //   used by generate_conjoint_byte_copy().
1383   //
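  //   For example, a 15-byte copy is performed as one 8-byte qword move followed
  //   by the trailing 4-, 2- and 1-byte moves selected by bits 2, 1 and 0 of
  //   byte_count (15 = 8 + 4 + 2 + 1).
  //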
1384   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1385     __ align(CodeEntryAlignment);
1386     StubCodeMark mark(this, "StubRoutines", name);
1387     address start = __ pc();
1388 
1389     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1390     Label L_copy_byte, L_exit;
1391     const Register from        = rdi;  // source array address
1392     const Register to          = rsi;  // destination array address
1393     const Register count       = rdx;  // elements count
1394     const Register byte_count  = rcx;
1395     const Register qword_count = count;
1396     const Register end_from    = from; // source array end address
1397     const Register end_to      = to;   // destination array end address
1398     // End pointers are inclusive, and if count is not zero they point
1399     // to the last unit copied:  end_to[0] := end_from[0]
1400 
1401     __ enter(); // required for proper stackwalking of RuntimeStub frame
1402     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1403 
1404     if (entry != NULL) {
1405       *entry = __ pc();
1406        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1407       BLOCK_COMMENT("Entry:");
1408     }
1409 
1410     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1411                       // r9 and r10 may be used to save non-volatile registers
1412 
1413     // 'from', 'to' and 'count' are now valid
1414     __ movptr(byte_count, count);
1415     __ shrptr(count, 3); // count => qword_count
1416 
1417     // Copy from low to high addresses.  Use 'to' as scratch.
1418     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1419     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1420     __ negptr(qword_count); // make the count negative
1421     __ jmp(L_copy_bytes);
1422 
1423     // Copy trailing qwords
1424   __ BIND(L_copy_8_bytes);
1425     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1426     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1427     __ increment(qword_count);
1428     __ jcc(Assembler::notZero, L_copy_8_bytes);
1429 
1430     // Check for and copy trailing dword
1431   __ BIND(L_copy_4_bytes);
1432     __ testl(byte_count, 4);
1433     __ jccb(Assembler::zero, L_copy_2_bytes);
1434     __ movl(rax, Address(end_from, 8));
1435     __ movl(Address(end_to, 8), rax);
1436 
1437     __ addptr(end_from, 4);
1438     __ addptr(end_to, 4);
1439 
1440     // Check for and copy trailing word
1441   __ BIND(L_copy_2_bytes);
1442     __ testl(byte_count, 2);
1443     __ jccb(Assembler::zero, L_copy_byte);
1444     __ movw(rax, Address(end_from, 8));
1445     __ movw(Address(end_to, 8), rax);
1446 
1447     __ addptr(end_from, 2);
1448     __ addptr(end_to, 2);
1449 
1450     // Check for and copy trailing byte
1451   __ BIND(L_copy_byte);
1452     __ testl(byte_count, 1);
1453     __ jccb(Assembler::zero, L_exit);
1454     __ movb(rax, Address(end_from, 8));
1455     __ movb(Address(end_to, 8), rax);
1456 
1457   __ BIND(L_exit);
1458     restore_arg_regs();
1459     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1460     __ xorptr(rax, rax); // return 0
1461     __ vzeroupper();
1462     __ leave(); // required for proper stackwalking of RuntimeStub frame
1463     __ ret(0);
1464 
    // Copy in multi-byte chunks
1466     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1467     __ jmp(L_copy_4_bytes);
1468 
1469     return start;
1470   }
1471 
1472   // Arguments:
1473   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1474   //             ignored
1475   //   name    - stub name string
1476   //
1477   // Inputs:
1478   //   c_rarg0   - source array address
1479   //   c_rarg1   - destination array address
1480   //   c_rarg2   - element count, treated as ssize_t, can be zero
1481   //
1482   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1483   // we let the hardware handle it.  The one to eight bytes within words,
1484   // dwords or qwords that span cache line boundaries will still be loaded
1485   // and stored atomically.
1486   //
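  //   Since the copy runs from high addresses to low, the trailing byte, word and
  //   dword (selected by bits 0, 1 and 2 of byte_count) are moved first, and the
  //   qword loop then takes over; compare generate_disjoint_byte_copy() above,
  //   which handles the tail last.
  //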
1487   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1488                                       address* entry, const char *name) {
1489     __ align(CodeEntryAlignment);
1490     StubCodeMark mark(this, "StubRoutines", name);
1491     address start = __ pc();
1492 
1493     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1494     const Register from        = rdi;  // source array address
1495     const Register to          = rsi;  // destination array address
1496     const Register count       = rdx;  // elements count
1497     const Register byte_count  = rcx;
1498     const Register qword_count = count;
1499 
1500     __ enter(); // required for proper stackwalking of RuntimeStub frame
1501     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1502 
1503     if (entry != NULL) {
1504       *entry = __ pc();
1505       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1506       BLOCK_COMMENT("Entry:");
1507     }
1508 
1509     array_overlap_test(nooverlap_target, Address::times_1);
1510     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1511                       // r9 and r10 may be used to save non-volatile registers
1512 
1513     // 'from', 'to' and 'count' are now valid
1514     __ movptr(byte_count, count);
1515     __ shrptr(count, 3);   // count => qword_count
1516 
1517     // Copy from high to low addresses.
1518 
1519     // Check for and copy trailing byte
1520     __ testl(byte_count, 1);
1521     __ jcc(Assembler::zero, L_copy_2_bytes);
1522     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1523     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1524     __ decrement(byte_count); // Adjust for possible trailing word
1525 
1526     // Check for and copy trailing word
1527   __ BIND(L_copy_2_bytes);
1528     __ testl(byte_count, 2);
1529     __ jcc(Assembler::zero, L_copy_4_bytes);
1530     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1531     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1532 
1533     // Check for and copy trailing dword
1534   __ BIND(L_copy_4_bytes);
1535     __ testl(byte_count, 4);
1536     __ jcc(Assembler::zero, L_copy_bytes);
1537     __ movl(rax, Address(from, qword_count, Address::times_8));
1538     __ movl(Address(to, qword_count, Address::times_8), rax);
1539     __ jmp(L_copy_bytes);
1540 
1541     // Copy trailing qwords
1542   __ BIND(L_copy_8_bytes);
1543     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1544     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1545     __ decrement(qword_count);
1546     __ jcc(Assembler::notZero, L_copy_8_bytes);
1547 
1548     restore_arg_regs();
1549     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1550     __ xorptr(rax, rax); // return 0
1551     __ vzeroupper();
1552     __ leave(); // required for proper stackwalking of RuntimeStub frame
1553     __ ret(0);
1554 
1555     // Copy in multi-byte chunks
1556     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1557 
1558     restore_arg_regs();
1559     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1560     __ xorptr(rax, rax); // return 0
1561     __ vzeroupper();
1562     __ leave(); // required for proper stackwalking of RuntimeStub frame
1563     __ ret(0);
1564 
1565     return start;
1566   }
1567 
1568   // Arguments:
1569   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1570   //             ignored
1571   //   name    - stub name string
1572   //
1573   // Inputs:
1574   //   c_rarg0   - source array address
1575   //   c_rarg1   - destination array address
1576   //   c_rarg2   - element count, treated as ssize_t, can be zero
1577   //
1578   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1579   // let the hardware handle it.  The two or four words within dwords
1580   // or qwords that span cache line boundaries will still be loaded
1581   // and stored atomically.
1582   //
1583   // Side Effects:
1584   //   disjoint_short_copy_entry is set to the no-overlap entry point
1585   //   used by generate_conjoint_short_copy().
1586   //
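       // Illustrative sketch (comment only): with 'count' jshort elements the
       // stub behaves roughly like
       //
       //   qword_count = count >> 2;                // four shorts per qword
       //   copy qword_count 8-byte units forward;
       //   if (count & 2) copy one trailing dword;  // two shorts
       //   if (count & 1) copy one trailing word;   // one short
       //   return 0;
       //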
1587   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1588     __ align(CodeEntryAlignment);
1589     StubCodeMark mark(this, "StubRoutines", name);
1590     address start = __ pc();
1591 
1592     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1593     const Register from        = rdi;  // source array address
1594     const Register to          = rsi;  // destination array address
1595     const Register count       = rdx;  // elements count
1596     const Register word_count  = rcx;
1597     const Register qword_count = count;
1598     const Register end_from    = from; // source array end address
1599     const Register end_to      = to;   // destination array end address
1600     // End pointers are inclusive, and if count is not zero they point
1601     // to the last unit copied:  end_to[0] := end_from[0]
1602 
1603     __ enter(); // required for proper stackwalking of RuntimeStub frame
1604     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1605 
1606     if (entry != NULL) {
1607       *entry = __ pc();
1608       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1609       BLOCK_COMMENT("Entry:");
1610     }
1611 
1612     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1613                       // r9 and r10 may be used to save non-volatile registers
1614 
1615     // 'from', 'to' and 'count' are now valid
1616     __ movptr(word_count, count);
1617     __ shrptr(count, 2); // count => qword_count
1618 
1619     // Copy from low to high addresses.  Use 'to' as scratch.
1620     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1621     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1622     __ negptr(qword_count);
1623     __ jmp(L_copy_bytes);
1624 
1625     // Copy trailing qwords
1626   __ BIND(L_copy_8_bytes);
1627     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1628     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1629     __ increment(qword_count);
1630     __ jcc(Assembler::notZero, L_copy_8_bytes);
1631 
1632     // Original 'dest' is trashed, so we can't use it as a
1633     // base register for a possible trailing word copy
1634 
1635     // Check for and copy trailing dword
1636   __ BIND(L_copy_4_bytes);
1637     __ testl(word_count, 2);
1638     __ jccb(Assembler::zero, L_copy_2_bytes);
1639     __ movl(rax, Address(end_from, 8));
1640     __ movl(Address(end_to, 8), rax);
1641 
1642     __ addptr(end_from, 4);
1643     __ addptr(end_to, 4);
1644 
1645     // Check for and copy trailing word
1646   __ BIND(L_copy_2_bytes);
1647     __ testl(word_count, 1);
1648     __ jccb(Assembler::zero, L_exit);
1649     __ movw(rax, Address(end_from, 8));
1650     __ movw(Address(end_to, 8), rax);
1651 
1652   __ BIND(L_exit);
1653     restore_arg_regs();
1654     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1655     __ xorptr(rax, rax); // return 0
1656     __ vzeroupper();
1657     __ leave(); // required for proper stackwalking of RuntimeStub frame
1658     __ ret(0);
1659 
1660     // Copy in multi-byte chunks
1661     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1662     __ jmp(L_copy_4_bytes);
1663 
1664     return start;
1665   }
1666 
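       // Fill stub.  The fill loop itself is emitted by MacroAssembler::generate_fill;
       // this wrapper only sets up and tears down the RuntimeStub frame.
       //
       // Arguments:
       //   t       - element basic type
       //   aligned - alignment hint passed through to MacroAssembler::generate_fill
       //   name    - stub name string
       //
       // Inputs:
       //   c_rarg0   - destination array address
       //   c_rarg1   - fill value
       //   c_rarg2   - element count
       //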
1667   address generate_fill(BasicType t, bool aligned, const char *name) {
1668     __ align(CodeEntryAlignment);
1669     StubCodeMark mark(this, "StubRoutines", name);
1670     address start = __ pc();
1671 
1672     BLOCK_COMMENT("Entry:");
1673 
1674     const Register to       = c_rarg0;  // destination array address
1675     const Register value    = c_rarg1;  // value
1676     const Register count    = c_rarg2;  // elements count
1677 
1678     __ enter(); // required for proper stackwalking of RuntimeStub frame
1679 
1680     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1681 
1682     __ vzeroupper();
1683     __ leave(); // required for proper stackwalking of RuntimeStub frame
1684     __ ret(0);
1685     return start;
1686   }
1687 
1688   // Arguments:
1689   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1690   //             ignored
1691   //   name    - stub name string
1692   //
1693   // Inputs:
1694   //   c_rarg0   - source array address
1695   //   c_rarg1   - destination array address
1696   //   c_rarg2   - element count, treated as ssize_t, can be zero
1697   //
1698   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1699   // let the hardware handle it.  The two or four words within dwords
1700   // or qwords that span cache line boundaries will still be loaded
1701   // and stored atomically.
1702   //
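       // Rough sketch of the backward copy emitted below (comment only):
       //
       //   if (count & 1) copy the last short;
       //   if (count & 2) copy the trailing dword (two shorts) just below the qwords;
       //   copy the remaining (count >> 2) qwords from high to low addresses;
       //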
1703   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1704                                        address *entry, const char *name) {
1705     __ align(CodeEntryAlignment);
1706     StubCodeMark mark(this, "StubRoutines", name);
1707     address start = __ pc();
1708 
1709     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1710     const Register from        = rdi;  // source array address
1711     const Register to          = rsi;  // destination array address
1712     const Register count       = rdx;  // elements count
1713     const Register word_count  = rcx;
1714     const Register qword_count = count;
1715 
1716     __ enter(); // required for proper stackwalking of RuntimeStub frame
1717     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1718 
1719     if (entry != NULL) {
1720       *entry = __ pc();
1721       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1722       BLOCK_COMMENT("Entry:");
1723     }
1724 
1725     array_overlap_test(nooverlap_target, Address::times_2);
1726     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1727                       // r9 and r10 may be used to save non-volatile registers
1728 
1729     // 'from', 'to' and 'count' are now valid
1730     __ movptr(word_count, count);
1731     __ shrptr(count, 2); // count => qword_count
1732 
1733     // Copy from high to low addresses.  Use 'to' as scratch.
1734 
1735     // Check for and copy trailing word
1736     __ testl(word_count, 1);
1737     __ jccb(Assembler::zero, L_copy_4_bytes);
1738     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1739     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1740 
1741     // Check for and copy trailing dword
1742   __ BIND(L_copy_4_bytes);
1743     __ testl(word_count, 2);
1744     __ jcc(Assembler::zero, L_copy_bytes);
1745     __ movl(rax, Address(from, qword_count, Address::times_8));
1746     __ movl(Address(to, qword_count, Address::times_8), rax);
1747     __ jmp(L_copy_bytes);
1748 
1749     // Copy trailing qwords
1750   __ BIND(L_copy_8_bytes);
1751     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1752     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1753     __ decrement(qword_count);
1754     __ jcc(Assembler::notZero, L_copy_8_bytes);
1755 
1756     restore_arg_regs();
1757     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1758     __ xorptr(rax, rax); // return 0
1759     __ vzeroupper();
1760     __ leave(); // required for proper stackwalking of RuntimeStub frame
1761     __ ret(0);
1762 
1763     // Copy in multi-byte chunks
1764     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1765 
1766     restore_arg_regs();
1767     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1768     __ xorptr(rax, rax); // return 0
1769     __ vzeroupper();
1770     __ leave(); // required for proper stackwalking of RuntimeStub frame
1771     __ ret(0);
1772 
1773     return start;
1774   }
1775 
1776   // Arguments:
1777   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1778   //             ignored
1779   //   is_oop  - true => oop array, so generate store check code
1780   //   name    - stub name string
1781   //
1782   // Inputs:
1783   //   c_rarg0   - source array address
1784   //   c_rarg1   - destination array address
1785   //   c_rarg2   - element count, treated as ssize_t, can be zero
1786   //
1787   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1788   // the hardware handle it.  The two dwords within qwords that span
1789   // cache line boundaries will still be loaded and stored atomically.
1790   //
1791   // Side Effects:
1792   //   disjoint_int_copy_entry is set to the no-overlap entry point
1793   //   used by generate_conjoint_int_oop_copy().
1794   //
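       // Rough shape of the stub, as a comment-only sketch:
       //
       //   GC barrier prologue (BarrierSetAssembler::arraycopy_prologue);
       //   qword_count = count >> 1;                // two 4-byte elements per qword
       //   copy qword_count 8-byte units forward;
       //   if (count & 1) copy one trailing dword;
       //   GC barrier epilogue; return 0;
       //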
1795   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1796                                          const char *name, bool dest_uninitialized = false) {
1797     __ align(CodeEntryAlignment);
1798     StubCodeMark mark(this, "StubRoutines", name);
1799     address start = __ pc();
1800 
1801     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1802     const Register from        = rdi;  // source array address
1803     const Register to          = rsi;  // destination array address
1804     const Register count       = rdx;  // elements count
1805     const Register dword_count = rcx;
1806     const Register qword_count = count;
1807     const Register end_from    = from; // source array end address
1808     const Register end_to      = to;   // destination array end address
1809     // End pointers are inclusive, and if count is not zero they point
1810     // to the last unit copied:  end_to[0] := end_from[0]
1811 
1812     __ enter(); // required for proper stackwalking of RuntimeStub frame
1813     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1814 
1815     if (entry != NULL) {
1816       *entry = __ pc();
1817       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1818       BLOCK_COMMENT("Entry:");
1819     }
1820 
1821     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1822                       // r9 and r10 may be used to save non-volatile registers
1823 
1824     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
1825     if (dest_uninitialized) {
1826       decorators |= AS_DEST_NOT_INITIALIZED;
1827     }
1828     if (aligned) {
1829       decorators |= ARRAYCOPY_ALIGNED;
1830     }
1831 
1832     BasicType type = is_oop ? T_OBJECT : T_INT;
1833     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1834     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1835 
1836     // 'from', 'to' and 'count' are now valid
1837     __ movptr(dword_count, count);
1838     __ shrptr(count, 1); // count => qword_count
1839 
1840     // Copy from low to high addresses.  Use 'to' as scratch.
1841     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1842     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1843     __ negptr(qword_count);
1844     __ jmp(L_copy_bytes);
1845 
1846     // Copy trailing qwords
1847   __ BIND(L_copy_8_bytes);
1848     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1849     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1850     __ increment(qword_count);
1851     __ jcc(Assembler::notZero, L_copy_8_bytes);
1852 
1853     // Check for and copy trailing dword
1854   __ BIND(L_copy_4_bytes);
1855     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1856     __ jccb(Assembler::zero, L_exit);
1857     __ movl(rax, Address(end_from, 8));
1858     __ movl(Address(end_to, 8), rax);
1859 
1860   __ BIND(L_exit);
1861     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1862     restore_arg_regs();
1863     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1864     __ vzeroupper();
1865     __ xorptr(rax, rax); // return 0
1866     __ leave(); // required for proper stackwalking of RuntimeStub frame
1867     __ ret(0);
1868 
1869     // Copy in multi-byte chunks
1870     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1871     __ jmp(L_copy_4_bytes);
1872 
1873     return start;
1874   }
1875 
1876   // Arguments:
1877   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1878   //             ignored
1879   //   is_oop  - true => oop array, so generate store check code
1880   //   name    - stub name string
1881   //
1882   // Inputs:
1883   //   c_rarg0   - source array address
1884   //   c_rarg1   - destination array address
1885   //   c_rarg2   - element count, treated as ssize_t, can be zero
1886   //
1887   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1888   // the hardware handle it.  The two dwords within qwords that span
1889   // cache line boundaries will still be loaded and stored atomically.
1890   //
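       // Rough shape of the stub, as a comment-only sketch; the copy runs
       // backwards because the ranges may overlap:
       //
       //   GC barrier prologue;
       //   if (count & 1) copy the last dword;
       //   copy the remaining (count >> 1) qwords from high to low addresses;
       //   GC barrier epilogue; return 0;
       //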
1891   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1892                                          address *entry, const char *name,
1893                                          bool dest_uninitialized = false) {
1894     __ align(CodeEntryAlignment);
1895     StubCodeMark mark(this, "StubRoutines", name);
1896     address start = __ pc();
1897 
1898     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1899     const Register from        = rdi;  // source array address
1900     const Register to          = rsi;  // destination array address
1901     const Register count       = rdx;  // elements count
1902     const Register dword_count = rcx;
1903     const Register qword_count = count;
1904 
1905     __ enter(); // required for proper stackwalking of RuntimeStub frame
1906     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1907 
1908     if (entry != NULL) {
1909       *entry = __ pc();
1910       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1911       BLOCK_COMMENT("Entry:");
1912     }
1913 
1914     array_overlap_test(nooverlap_target, Address::times_4);
1915     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1916                       // r9 and r10 may be used to save non-volatile registers
1917 
1918     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
1919     if (dest_uninitialized) {
1920       decorators |= AS_DEST_NOT_INITIALIZED;
1921     }
1922     if (aligned) {
1923       decorators |= ARRAYCOPY_ALIGNED;
1924     }
1925 
1926     BasicType type = is_oop ? T_OBJECT : T_INT;
1927     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1928     // no registers are destroyed by this call
1929     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1930 
1931     assert_clean_int(count, rax); // Make sure 'count' is clean int.
1932     // 'from', 'to' and 'count' are now valid
1933     __ movptr(dword_count, count);
1934     __ shrptr(count, 1); // count => qword_count
1935 
1936     // Copy from high to low addresses.  Use 'to' as scratch.
1937 
1938     // Check for and copy trailing dword
1939     __ testl(dword_count, 1);
1940     __ jcc(Assembler::zero, L_copy_bytes);
1941     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
1942     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
1943     __ jmp(L_copy_bytes);
1944 
1945     // Copy trailing qwords
1946   __ BIND(L_copy_8_bytes);
1947     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1948     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1949     __ decrement(qword_count);
1950     __ jcc(Assembler::notZero, L_copy_8_bytes);
1951 
1952     if (is_oop) {
1953       __ jmp(L_exit);
1954     }
1955     restore_arg_regs();
1956     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1957     __ xorptr(rax, rax); // return 0
1958     __ vzeroupper();
1959     __ leave(); // required for proper stackwalking of RuntimeStub frame
1960     __ ret(0);
1961 
1962     // Copy in multi-byte chunks
1963     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1964 
1965   __ BIND(L_exit);
1966     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1967     restore_arg_regs();
1968     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1969     __ xorptr(rax, rax); // return 0
1970     __ vzeroupper();
1971     __ leave(); // required for proper stackwalking of RuntimeStub frame
1972     __ ret(0);
1973 
1974     return start;
1975   }
1976 
1977   // Arguments:
1978   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1979   //             ignored
1980   //   is_oop  - true => oop array, so generate store check code
1981   //   name    - stub name string
1982   //
1983   // Inputs:
1984   //   c_rarg0   - source array address
1985   //   c_rarg1   - destination array address
1986   //   c_rarg2   - element count, treated as ssize_t, can be zero
1987   //
1988   // Side Effects:
1989   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1990   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1991   //
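       // Rough shape of the stub, as a comment-only sketch; every element is a
       // full qword, so there is no sub-qword tail to handle:
       //
       //   GC barrier prologue;
       //   copy 'count' qwords forward (bulk path + L_copy_8_bytes tail);
       //   GC barrier epilogue; return 0;
       //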
1992   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
1993                                           const char *name, bool dest_uninitialized = false) {
1994     __ align(CodeEntryAlignment);
1995     StubCodeMark mark(this, "StubRoutines", name);
1996     address start = __ pc();
1997 
1998     Label L_copy_bytes, L_copy_8_bytes, L_exit;
1999     const Register from        = rdi;  // source array address
2000     const Register to          = rsi;  // destination array address
2001     const Register qword_count = rdx;  // elements count
2002     const Register end_from    = from; // source array end address
2003     const Register end_to      = rcx;  // destination array end address
2004     const Register saved_count = r11;
2005     // End pointers are inclusive, and if count is not zero they point
2006     // to the last unit copied:  end_to[0] := end_from[0]
2007 
2008     __ enter(); // required for proper stackwalking of RuntimeStub frame
2009     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2010     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2011 
2012     if (entry != NULL) {
2013       *entry = __ pc();
2014       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2015       BLOCK_COMMENT("Entry:");
2016     }
2017 
2018     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2019                       // r9 and r10 may be used to save non-volatile registers
2020     // 'from', 'to' and 'qword_count' are now valid
2021 
2022     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
2023     if (dest_uninitialized) {
2024       decorators |= AS_DEST_NOT_INITIALIZED;
2025     }
2026     if (aligned) {
2027       decorators |= ARRAYCOPY_ALIGNED;
2028     }
2029 
2030     BasicType type = is_oop ? T_OBJECT : T_LONG;
2031     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2032     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2033 
2034     // Copy from low to high addresses.  Use 'to' as scratch.
2035     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2036     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2037     __ negptr(qword_count);
2038     __ jmp(L_copy_bytes);
2039 
2040     // Copy trailing qwords
2041   __ BIND(L_copy_8_bytes);
2042     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2043     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2044     __ increment(qword_count);
2045     __ jcc(Assembler::notZero, L_copy_8_bytes);
2046 
2047     if (is_oop) {
2048       __ jmp(L_exit);
2049     } else {
2050       restore_arg_regs();
2051       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2052       __ xorptr(rax, rax); // return 0
2053       __ vzeroupper();
2054       __ leave(); // required for proper stackwalking of RuntimeStub frame
2055       __ ret(0);
2056     }
2057 
2058     // Copy in multi-byte chunks
2059     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2060 
2061     __ BIND(L_exit);
2062     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2063     restore_arg_regs();
2064     if (is_oop) {
2065       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2066     } else {
2067       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2068     }
2069     __ vzeroupper();
2070     __ xorptr(rax, rax); // return 0
2071     __ leave(); // required for proper stackwalking of RuntimeStub frame
2072     __ ret(0);
2073 
2074     return start;
2075   }
2076 
2077   // Arguments:
2078   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2079   //             ignored
2080   //   is_oop  - true => oop array, so generate store check code
2081   //   name    - stub name string
2082   //
2083   // Inputs:
2084   //   c_rarg0   - source array address
2085   //   c_rarg1   - destination array address
2086   //   c_rarg2   - element count, treated as ssize_t, can be zero
2087   //
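       // Rough shape of the stub, as a comment-only sketch:
       //
       //   GC barrier prologue;
       //   copy 'count' qwords from high to low addresses (overlap-safe direction);
       //   GC barrier epilogue; return 0;
       //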
2088   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2089                                           address nooverlap_target, address *entry,
2090                                           const char *name, bool dest_uninitialized = false) {
2091     __ align(CodeEntryAlignment);
2092     StubCodeMark mark(this, "StubRoutines", name);
2093     address start = __ pc();
2094 
2095     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2096     const Register from        = rdi;  // source array address
2097     const Register to          = rsi;  // destination array address
2098     const Register qword_count = rdx;  // elements count
2099     const Register saved_count = rcx;
2100 
2101     __ enter(); // required for proper stackwalking of RuntimeStub frame
2102     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2103 
2104     if (entry != NULL) {
2105       *entry = __ pc();
2106       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2107       BLOCK_COMMENT("Entry:");
2108     }
2109 
2110     array_overlap_test(nooverlap_target, Address::times_8);
2111     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2112                       // r9 and r10 may be used to save non-volatile registers
2113     // 'from', 'to' and 'qword_count' are now valid
2114 
2115     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
2116     if (dest_uninitialized) {
2117       decorators |= AS_DEST_NOT_INITIALIZED;
2118     }
2119     if (aligned) {
2120       decorators |= ARRAYCOPY_ALIGNED;
2121     }
2122 
2123     BasicType type = is_oop ? T_OBJECT : T_LONG;
2124     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2125     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2126 
2127     __ jmp(L_copy_bytes);
2128 
2129     // Copy trailing qwords
2130   __ BIND(L_copy_8_bytes);
2131     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2132     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2133     __ decrement(qword_count);
2134     __ jcc(Assembler::notZero, L_copy_8_bytes);
2135 
2136     if (is_oop) {
2137       __ jmp(L_exit);
2138     } else {
2139       restore_arg_regs();
2140       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2141       __ xorptr(rax, rax); // return 0
2142       __ vzeroupper();
2143       __ leave(); // required for proper stackwalking of RuntimeStub frame
2144       __ ret(0);
2145     }
2146 
2147     // Copy in multi-byte chunks
2148     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2149 
2150     __ BIND(L_exit);
2151     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2152     restore_arg_regs();
2153     if (is_oop) {
2154       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2155     } else {
2156       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2157     }
2158     __ vzeroupper();
2159     __ xorptr(rax, rax); // return 0
2160     __ leave(); // required for proper stackwalking of RuntimeStub frame
2161     __ ret(0);
2162 
2163     return start;
2164   }
2165 
2166 
2167   // Helper for generating a dynamic type check.
2168   // Smashes no registers.
2169   void generate_type_check(Register sub_klass,
2170                            Register super_check_offset,
2171                            Register super_klass,
2172                            Label& L_success) {
2173     assert_different_registers(sub_klass, super_check_offset, super_klass);
2174 
2175     BLOCK_COMMENT("type_check:");
2176 
2177     Label L_miss;
2178 
2179     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2180                                      super_check_offset);
2181     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2182 
2183     // Fall through on failure!
2184     __ BIND(L_miss);
2185   }
2186 
2187   //
2188   //  Generate checkcasting array copy stub
2189   //
2190   //  Input:
2191   //    c_rarg0   - source array address
2192   //    c_rarg1   - destination array address
2193   //    c_rarg2   - element count, treated as ssize_t, can be zero
2194   //    c_rarg3   - size_t ckoff (super_check_offset)
2195   // not Win64
2196   //    c_rarg4   - oop ckval (super_klass)
2197   // Win64
2198   //    rsp+40    - oop ckval (super_klass)
2199   //
2200   //  Output:
2201   //    rax ==  0  -  success
2202   //    rax == -1^K - failure, where K is partial transfer count
2203   //
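       //  The element-wise loop below is roughly equivalent to the following
       //  sketch (comment only; 'is_subtype_of' stands for the fast/slow-path
       //  subtype check emitted by generate_type_check, with ckval as the
       //  destination element klass):
       //
       //    for (i = 0; i < length; i++) {
       //      oop o = from[i];
       //      if (o != NULL && !o->klass()->is_subtype_of(ckval)) {
       //        return ~i;        // -1^K, where K elements were already copied
       //      }
       //      to[i] = o;          // GC barriers are applied in the epilogue
       //    }
       //    return 0;
       //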
2204   address generate_checkcast_copy(const char *name, address *entry,
2205                                   bool dest_uninitialized = false) {
2206 
2207     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2208 
2209     // Input registers (after setup_arg_regs)
2210     const Register from        = rdi;   // source array address
2211     const Register to          = rsi;   // destination array address
2212     const Register length      = rdx;   // elements count
2213     const Register ckoff       = rcx;   // super_check_offset
2214     const Register ckval       = r8;    // super_klass
2215 
2216     // Registers used as temps (r13, r14 are save-on-entry)
2217     const Register end_from    = from;  // source array end address
2218     const Register end_to      = r13;   // destination array end address
2219     const Register count       = rdx;   // -(count_remaining)
2220     const Register r14_length  = r14;   // saved copy of length
2221     // End pointers are inclusive, and if length is not zero they point
2222     // to the last unit copied:  end_to[0] := end_from[0]
2223 
2224     const Register rax_oop    = rax;    // actual oop copied
2225     const Register r11_klass  = r11;    // oop._klass
2226 
2227     //---------------------------------------------------------------
2228     // Assembler stub will be used for this call to arraycopy
2229     // if the two arrays are subtypes of Object[] but the
2230     // destination array type is not equal to or a supertype
2231     // of the source type.  Each element must be separately
2232     // checked.
2233 
2234     __ align(CodeEntryAlignment);
2235     StubCodeMark mark(this, "StubRoutines", name);
2236     address start = __ pc();
2237 
2238     __ enter(); // required for proper stackwalking of RuntimeStub frame
2239 
2240 #ifdef ASSERT
2241     // caller guarantees that the arrays really are different
2242     // otherwise, we would have to make conjoint checks
2243     { Label L;
2244       array_overlap_test(L, TIMES_OOP);
2245       __ stop("checkcast_copy within a single array");
2246       __ bind(L);
2247     }
2248 #endif //ASSERT
2249 
2250     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2251                        // ckoff => rcx, ckval => r8
2252                        // r9 and r10 may be used to save non-volatile registers
2253 #ifdef _WIN64
2254     // last argument (#4) is on stack on Win64
2255     __ movptr(ckval, Address(rsp, 6 * wordSize));
2256 #endif
2257 
2258     // Caller of this entry point must set up the argument registers.
2259     if (entry != NULL) {
2260       *entry = __ pc();
2261       BLOCK_COMMENT("Entry:");
2262     }
2263 
2264     // allocate spill slots for r13, r14
2265     enum {
2266       saved_r13_offset,
2267       saved_r14_offset,
2268       saved_rbp_offset
2269     };
2270     __ subptr(rsp, saved_rbp_offset * wordSize);
2271     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2272     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2273 
2274     // check that int operands are properly extended to size_t
2275     assert_clean_int(length, rax);
2276     assert_clean_int(ckoff, rax);
2277 
2278 #ifdef ASSERT
2279     BLOCK_COMMENT("assert consistent ckoff/ckval");
2280     // The ckoff and ckval must be mutually consistent,
2281     // even though caller generates both.
2282     { Label L;
2283       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2284       __ cmpl(ckoff, Address(ckval, sco_offset));
2285       __ jcc(Assembler::equal, L);
2286       __ stop("super_check_offset inconsistent");
2287       __ bind(L);
2288     }
2289 #endif //ASSERT
2290 
2291     // Loop-invariant addresses.  They are exclusive end pointers.
2292     Address end_from_addr(from, length, TIMES_OOP, 0);
2293     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2294     // Loop-variant addresses.  They assume post-incremented count < 0.
2295     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2296     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2297 
2298     DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST;
2299     if (dest_uninitialized) {
2300       decorators |= AS_DEST_NOT_INITIALIZED;
2301     }
2302 
2303     BasicType type = T_OBJECT;
2304     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2305     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2306 
2307     // Copy from low to high addresses, indexed from the end of each array.
2308     __ lea(end_from, end_from_addr);
2309     __ lea(end_to,   end_to_addr);
2310     __ movptr(r14_length, length);        // save a copy of the length
2311     assert(length == count, "");          // else fix next line:
2312     __ negptr(count);                     // negate and test the length
2313     __ jcc(Assembler::notZero, L_load_element);
2314 
2315     // Empty array:  Nothing to do.
2316     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2317     __ jmp(L_done);
2318 
2319     // ======== begin loop ========
2320     // (Loop is rotated; its entry is L_load_element.)
2321     // Loop control:
2322     //   for (count = -count; count != 0; count++)
2323     // Base pointers src, dst are biased by 8*(count-1), to last element.
2324     __ align(OptoLoopAlignment);
2325 
2326     __ BIND(L_store_element);
2327     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2328     __ increment(count);               // increment the count toward zero
2329     __ jcc(Assembler::zero, L_do_card_marks);
2330 
2331     // ======== loop entry is here ========
2332     __ BIND(L_load_element);
2333     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2334     __ testptr(rax_oop, rax_oop);
2335     __ jcc(Assembler::zero, L_store_element);
2336 
2337     __ load_klass(r11_klass, rax_oop);// query the object klass
2338     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2339     // ======== end loop ========
2340 
2341     // It was a real error; we must depend on the caller to finish the job.
2342     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2343     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2344     // and report their number to the caller.
2345     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2346     Label L_post_barrier;
2347     __ addptr(r14_length, count);     // K = (original - remaining) oops
2348     __ movptr(rax, r14_length);       // save the value
2349     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2350     __ jccb(Assembler::notZero, L_post_barrier);
2351     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2352 
2353     // Come here on success only.
2354     __ BIND(L_do_card_marks);
2355     __ xorptr(rax, rax);              // return 0 on success
2356 
2357     __ BIND(L_post_barrier);
2358     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2359 
2360     // Common exit point (success or failure).
2361     __ BIND(L_done);
2362     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2363     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2364     restore_arg_regs();
2365     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2366     __ leave(); // required for proper stackwalking of RuntimeStub frame
2367     __ ret(0);
2368 
2369     return start;
2370   }
2371 
2372   //
2373   //  Generate 'unsafe' array copy stub
2374   //  Though just as safe as the other stubs, it takes an unscaled
2375   //  size_t argument instead of an element count.
2376   //
2377   //  Input:
2378   //    c_rarg0   - source array address
2379   //    c_rarg1   - destination array address
2380   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2381   //
2382   // Examines the alignment of the operands and dispatches
2383   // to a long, int, short, or byte copy loop.
2384   //
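       //  Dispatch sketch (comment only):
       //
       //    bits = from | to | size;
       //    if      ((bits & 7) == 0)  tail-call the long  copy, count = size >> 3;
       //    else if ((bits & 3) == 0)  tail-call the int   copy, count = size >> 2;
       //    else if ((bits & 1) == 0)  tail-call the short copy, count = size >> 1;
       //    else                       tail-call the byte  copy, count = size;
       //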
2385   address generate_unsafe_copy(const char *name,
2386                                address byte_copy_entry, address short_copy_entry,
2387                                address int_copy_entry, address long_copy_entry) {
2388 
2389     Label L_long_aligned, L_int_aligned, L_short_aligned;
2390 
2391     // Input registers (before setup_arg_regs)
2392     const Register from        = c_rarg0;  // source array address
2393     const Register to          = c_rarg1;  // destination array address
2394     const Register size        = c_rarg2;  // byte count (size_t)
2395 
2396     // Register used as a temp
2397     const Register bits        = rax;      // test copy of low bits
2398 
2399     __ align(CodeEntryAlignment);
2400     StubCodeMark mark(this, "StubRoutines", name);
2401     address start = __ pc();
2402 
2403     __ enter(); // required for proper stackwalking of RuntimeStub frame
2404 
2405     // bump this on entry, not on exit:
2406     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2407 
2408     __ mov(bits, from);
2409     __ orptr(bits, to);
2410     __ orptr(bits, size);
2411 
2412     __ testb(bits, BytesPerLong-1);
2413     __ jccb(Assembler::zero, L_long_aligned);
2414 
2415     __ testb(bits, BytesPerInt-1);
2416     __ jccb(Assembler::zero, L_int_aligned);
2417 
2418     __ testb(bits, BytesPerShort-1);
2419     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2420 
2421     __ BIND(L_short_aligned);
2422     __ shrptr(size, LogBytesPerShort); // size => short_count
2423     __ jump(RuntimeAddress(short_copy_entry));
2424 
2425     __ BIND(L_int_aligned);
2426     __ shrptr(size, LogBytesPerInt); // size => int_count
2427     __ jump(RuntimeAddress(int_copy_entry));
2428 
2429     __ BIND(L_long_aligned);
2430     __ shrptr(size, LogBytesPerLong); // size => qword_count
2431     __ jump(RuntimeAddress(long_copy_entry));
2432 
2433     return start;
2434   }
2435 
2436   // Perform range checks on the proposed arraycopy.
2437   // Kills temp, but nothing else.
2438   // Also, clean the sign bits of src_pos and dst_pos.
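       // Comment-only sketch of the checks (the 32-bit compares below are unsigned,
       // so an overflowed, negative sum fails as well):
       //
       //   if ((uint)(src_pos + length) > (uint)arrayOop(src)->length())  FAIL;
       //   if ((uint)(dst_pos + length) > (uint)arrayOop(dst)->length())  FAIL;
       //   sign-extend src_pos and dst_pos to 64 bits (both are known non-negative);
       //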
2439   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2440                               Register src_pos, // source position (c_rarg1)
2441                               Register dst,     // destination array oop (c_rarg2)
2442                               Register dst_pos, // destination position (c_rarg3)
2443                               Register length,
2444                               Register temp,
2445                               Label& L_failed) {
2446     BLOCK_COMMENT("arraycopy_range_checks:");
2447 
2448     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2449     __ movl(temp, length);
2450     __ addl(temp, src_pos);             // src_pos + length
2451     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2452     __ jcc(Assembler::above, L_failed);
2453 
2454     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2455     __ movl(temp, length);
2456     __ addl(temp, dst_pos);             // dst_pos + length
2457     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2458     __ jcc(Assembler::above, L_failed);
2459 
2460     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2461     // Move with sign extension can be used since they are positive.
2462     __ movslq(src_pos, src_pos);
2463     __ movslq(dst_pos, dst_pos);
2464 
2465     BLOCK_COMMENT("arraycopy_range_checks done");
2466   }
2467 
2468   //
2469   //  Generate generic array copy stubs
2470   //
2471   //  Input:
2472   //    c_rarg0    -  src oop
2473   //    c_rarg1    -  src_pos (32-bits)
2474   //    c_rarg2    -  dst oop
2475   //    c_rarg3    -  dst_pos (32-bits)
2476   // not Win64
2477   //    c_rarg4    -  element count (32-bits)
2478   // Win64
2479   //    rsp+40     -  element count (32-bits)
2480   //
2481   //  Output:
2482   //    rax ==  0  -  success
2483   //    rax == -1^K - failure, where K is partial transfer count
2484   //
2485   address generate_generic_copy(const char *name,
2486                                 address byte_copy_entry, address short_copy_entry,
2487                                 address int_copy_entry, address oop_copy_entry,
2488                                 address long_copy_entry, address checkcast_copy_entry) {
2489 
2490     Label L_failed, L_failed_0, L_objArray;
2491     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2492 
2493     // Input registers
2494     const Register src        = c_rarg0;  // source array oop
2495     const Register src_pos    = c_rarg1;  // source position
2496     const Register dst        = c_rarg2;  // destination array oop
2497     const Register dst_pos    = c_rarg3;  // destination position
2498 #ifndef _WIN64
2499     const Register length     = c_rarg4;
2500 #else
2501     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2502 #endif
2503 
2504     { int modulus = CodeEntryAlignment;
2505       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2506       int advance = target - (__ offset() % modulus);
2507       if (advance < 0)  advance += modulus;
2508       if (advance > 0)  __ nop(advance);
2509     }
2510     StubCodeMark mark(this, "StubRoutines", name);
2511 
2512     // Short-hop target to L_failed.  Makes for denser prologue code.
2513     __ BIND(L_failed_0);
2514     __ jmp(L_failed);
2515     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2516 
2517     __ align(CodeEntryAlignment);
2518     address start = __ pc();
2519 
2520     __ enter(); // required for proper stackwalking of RuntimeStub frame
2521 
2522     // bump this on entry, not on exit:
2523     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2524 
2525     //-----------------------------------------------------------------------
2526     // Assembler stub will be used for this call to arraycopy
2527     // if the following conditions are met:
2528     //
2529     // (1) src and dst must not be null.
2530     // (2) src_pos must not be negative.
2531     // (3) dst_pos must not be negative.
2532     // (4) length  must not be negative.
2533     // (5) src klass and dst klass should be the same and not NULL.
2534     // (6) src and dst should be arrays.
2535     // (7) src_pos + length must not exceed length of src.
2536     // (8) dst_pos + length must not exceed length of dst.
2537     //
2538 
2539     //  if (src == NULL) return -1;
2540     __ testptr(src, src);         // src oop
2541     size_t j1off = __ offset();
2542     __ jccb(Assembler::zero, L_failed_0);
2543 
2544     //  if (src_pos < 0) return -1;
2545     __ testl(src_pos, src_pos); // src_pos (32-bits)
2546     __ jccb(Assembler::negative, L_failed_0);
2547 
2548     //  if (dst == NULL) return -1;
2549     __ testptr(dst, dst);         // dst oop
2550     __ jccb(Assembler::zero, L_failed_0);
2551 
2552     //  if (dst_pos < 0) return -1;
2553     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2554     size_t j4off = __ offset();
2555     __ jccb(Assembler::negative, L_failed_0);
2556 
2557     // The first four tests are very dense code,
2558     // but not quite dense enough to put four
2559     // jumps in a 16-byte instruction fetch buffer.
2560     // That's good, because some branch predictors
2561     // do not like jumps so close together.
2562     // Make sure of this.
2563     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2564 
2565     // registers used as temp
2566     const Register r11_length    = r11; // elements count to copy
2567     const Register r10_src_klass = r10; // array klass
2568 
2569     //  if (length < 0) return -1;
2570     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2571     __ testl(r11_length, r11_length);
2572     __ jccb(Assembler::negative, L_failed_0);
2573 
2574     __ load_klass(r10_src_klass, src);
2575 #ifdef ASSERT
2576     //  assert(src->klass() != NULL);
2577     {
2578       BLOCK_COMMENT("assert klasses not null {");
2579       Label L1, L2;
2580       __ testptr(r10_src_klass, r10_src_klass);
2581       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2582       __ bind(L1);
2583       __ stop("broken null klass");
2584       __ bind(L2);
2585       __ load_klass(rax, dst);
2586       __ cmpq(rax, 0);
2587       __ jcc(Assembler::equal, L1);     // this would be broken also
2588       BLOCK_COMMENT("} assert klasses not null done");
2589     }
2590 #endif
2591 
2592     // Load layout helper (32-bits)
2593     //
2594     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2595     // 32        30    24            16              8     2                 0
2596     //
2597     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2598     //
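         // For a typeArray the code below extracts the header size and
         // log2(element size) from the layout helper and then, roughly
         // (comment-only sketch):
         //
         //   from  = src + header_size + (src_pos << log2_element_size);
         //   to    = dst + header_size + (dst_pos << log2_element_size);
         //   count = length;
         //   tail-call the byte/short/int/long copy stub for that element size;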
2599 
2600     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2601 
2602     // Handle objArrays completely differently...
2603     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2604     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2605     __ jcc(Assembler::equal, L_objArray);
2606 
2607     //  if (src->klass() != dst->klass()) return -1;
2608     __ load_klass(rax, dst);
2609     __ cmpq(r10_src_klass, rax);
2610     __ jcc(Assembler::notEqual, L_failed);
2611 
2612     const Register rax_lh = rax;  // layout helper
2613     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2614 
2615     //  if (!src->is_Array()) return -1;
2616     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2617     __ jcc(Assembler::greaterEqual, L_failed);
2618 
2619     // At this point, it is known to be a typeArray (array_tag 0x3).
2620 #ifdef ASSERT
2621     {
2622       BLOCK_COMMENT("assert primitive array {");
2623       Label L;
2624       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2625       __ jcc(Assembler::greaterEqual, L);
2626       __ stop("must be a primitive array");
2627       __ bind(L);
2628       BLOCK_COMMENT("} assert primitive array done");
2629     }
2630 #endif
2631 
2632     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2633                            r10, L_failed);
2634 
2635     // TypeArrayKlass
2636     //
2637     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2638     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2639     //
2640 
2641     const Register r10_offset = r10;    // array offset
2642     const Register rax_elsize = rax_lh; // element size
2643 
2644     __ movl(r10_offset, rax_lh);
2645     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2646     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2647     __ addptr(src, r10_offset);           // src array offset
2648     __ addptr(dst, r10_offset);           // dst array offset
2649     BLOCK_COMMENT("choose copy loop based on element size");
2650     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2651 
2652     // The following registers must be set before the jump to the corresponding stub
2653     const Register from     = c_rarg0;  // source array address
2654     const Register to       = c_rarg1;  // destination array address
2655     const Register count    = c_rarg2;  // elements count
2656 
2657     // 'from', 'to' and 'count' must be set in this order, since they alias
2658     // 'src', 'src_pos' and 'dst' and would otherwise clobber inputs still in use.
2659 
2660   __ BIND(L_copy_bytes);
2661     __ cmpl(rax_elsize, 0);
2662     __ jccb(Assembler::notEqual, L_copy_shorts);
2663     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2664     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2665     __ movl2ptr(count, r11_length); // length
2666     __ jump(RuntimeAddress(byte_copy_entry));
2667 
2668   __ BIND(L_copy_shorts);
2669     __ cmpl(rax_elsize, LogBytesPerShort);
2670     __ jccb(Assembler::notEqual, L_copy_ints);
2671     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2672     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2673     __ movl2ptr(count, r11_length); // length
2674     __ jump(RuntimeAddress(short_copy_entry));
2675 
2676   __ BIND(L_copy_ints);
2677     __ cmpl(rax_elsize, LogBytesPerInt);
2678     __ jccb(Assembler::notEqual, L_copy_longs);
2679     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2680     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2681     __ movl2ptr(count, r11_length); // length
2682     __ jump(RuntimeAddress(int_copy_entry));
2683 
2684   __ BIND(L_copy_longs);
2685 #ifdef ASSERT
2686     {
2687       BLOCK_COMMENT("assert long copy {");
2688       Label L;
2689       __ cmpl(rax_elsize, LogBytesPerLong);
2690       __ jcc(Assembler::equal, L);
2691       __ stop("must be long copy, but elsize is wrong");
2692       __ bind(L);
2693       BLOCK_COMMENT("} assert long copy done");
2694     }
2695 #endif
2696     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2697     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2698     __ movl2ptr(count, r11_length); // length
2699     __ jump(RuntimeAddress(long_copy_entry));
2700 
2701     // ObjArrayKlass
2702   __ BIND(L_objArray);
2703     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2704 
2705     Label L_plain_copy, L_checkcast_copy;
2706     //  test array classes for subtyping
2707     __ load_klass(rax, dst);
2708     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2709     __ jcc(Assembler::notEqual, L_checkcast_copy);
2710 
2711     // Identically typed arrays can be copied without element-wise checks.
2712     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2713                            r10, L_failed);
2714 
2715     __ lea(from, Address(src, src_pos, TIMES_OOP,
2716                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2717     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2718                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2719     __ movl2ptr(count, r11_length); // length
2720   __ BIND(L_plain_copy);
2721     __ jump(RuntimeAddress(oop_copy_entry));
2722 
2723   __ BIND(L_checkcast_copy);
2724     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2725     {
2726       // Before looking at dst.length, make sure dst is also an objArray.
2727       __ cmpl(Address(rax, lh_offset), objArray_lh);
2728       __ jcc(Assembler::notEqual, L_failed);
2729 
2730       // It is safe to examine both src.length and dst.length.
2731       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2732                              rax, L_failed);
2733 
2734       const Register r11_dst_klass = r11;
2735       __ load_klass(r11_dst_klass, dst); // reload
2736 
2737       // Marshal the base address arguments now, freeing registers.
2738       __ lea(from, Address(src, src_pos, TIMES_OOP,
2739                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2740       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2741                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2742       __ movl(count, length);           // length (reloaded)
2743       Register sco_temp = c_rarg3;      // this register is free now
2744       assert_different_registers(from, to, count, sco_temp,
2745                                  r11_dst_klass, r10_src_klass);
2746       assert_clean_int(count, sco_temp);
2747 
2748       // Generate the type check.
2749       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2750       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2751       assert_clean_int(sco_temp, rax);
2752       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2753 
2754       // Fetch destination element klass from the ObjArrayKlass header.
2755       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2756       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2757       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2758       assert_clean_int(sco_temp, rax);
2759 
2760       // the checkcast_copy loop needs two extra arguments:
2761       assert(c_rarg3 == sco_temp, "#3 already in place");
2762       // Set up arguments for checkcast_copy_entry.
2763       setup_arg_regs(4);
2764       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2765       __ jump(RuntimeAddress(checkcast_copy_entry));
2766     }
2767 
2768   __ BIND(L_failed);
2769     __ xorptr(rax, rax);
2770     __ notptr(rax); // return -1
2771     __ leave();   // required for proper stackwalking of RuntimeStub frame
2772     __ ret(0);
2773 
2774     return start;
2775   }
2776 
2777   void generate_arraycopy_stubs() {
2778     address entry;
2779     address entry_jbyte_arraycopy;
2780     address entry_jshort_arraycopy;
2781     address entry_jint_arraycopy;
2782     address entry_oop_arraycopy;
2783     address entry_jlong_arraycopy;
2784     address entry_checkcast_arraycopy;
2785 
2786     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
2787                                                                            "jbyte_disjoint_arraycopy");
2788     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2789                                                                            "jbyte_arraycopy");
2790 
2791     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2792                                                                             "jshort_disjoint_arraycopy");
2793     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2794                                                                             "jshort_arraycopy");
2795 
2796     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
2797                                                                               "jint_disjoint_arraycopy");
2798     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
2799                                                                               &entry_jint_arraycopy, "jint_arraycopy");
2800 
2801     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
2802                                                                                "jlong_disjoint_arraycopy");
2803     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
2804                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
2805 
2806 
2807     if (UseCompressedOops) {
2808       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
2809                                                                               "oop_disjoint_arraycopy");
2810       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
2811                                                                               &entry_oop_arraycopy, "oop_arraycopy");
2812       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
2813                                                                                      "oop_disjoint_arraycopy_uninit",
2814                                                                                      /*dest_uninitialized*/true);
2815       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
2816                                                                                      NULL, "oop_arraycopy_uninit",
2817                                                                                      /*dest_uninitialized*/true);
2818     } else {
2819       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
2820                                                                                "oop_disjoint_arraycopy");
2821       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
2822                                                                                &entry_oop_arraycopy, "oop_arraycopy");
2823       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
2824                                                                                       "oop_disjoint_arraycopy_uninit",
2825                                                                                       /*dest_uninitialized*/true);
2826       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
2827                                                                                       NULL, "oop_arraycopy_uninit",
2828                                                                                       /*dest_uninitialized*/true);
2829     }
2830 
2831     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2832     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2833                                                                         /*dest_uninitialized*/true);
2834 
2835     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2836                                                               entry_jbyte_arraycopy,
2837                                                               entry_jshort_arraycopy,
2838                                                               entry_jint_arraycopy,
2839                                                               entry_jlong_arraycopy);
2840     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2841                                                                entry_jbyte_arraycopy,
2842                                                                entry_jshort_arraycopy,
2843                                                                entry_jint_arraycopy,
2844                                                                entry_oop_arraycopy,
2845                                                                entry_jlong_arraycopy,
2846                                                                entry_checkcast_arraycopy);
2847 
2848     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2849     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2850     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2851     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2852     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2853     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2854 
2855     // We don't generate specialized code for HeapWord-aligned source
2856     // arrays, so just use the code we've already generated
2857     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
2858     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
2859 
2860     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2861     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
2862 
2863     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
2864     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
2865 
2866     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
2867     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
2868 
2869     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
2870     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
2871 
2872     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
2873     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
2874   }
2875 
2876   // AES intrinsic stubs
2877   enum {AESBlockSize = 16};
2878 
2879   address generate_key_shuffle_mask() {
2880     __ align(16);
2881     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2882     address start = __ pc();
2883     __ emit_data64( 0x0405060700010203, relocInfo::none );
2884     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2885     return start;
2886   }
2887 
2888   address generate_counter_shuffle_mask() {
2889     __ align(16);
2890     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2891     address start = __ pc();
2892     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
2893     __ emit_data64(0x0001020304050607, relocInfo::none);
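    // Note: dst byte i <- src byte (15 - i), i.e. a full 16-byte reverse.
    // Roughly, in C (illustrative sketch only):
    //   for (int i = 0; i < 16; i++) dst[i] = src[15 - i];
    // This lets the big-endian CTR counter be incremented with ordinary
    // little-endian qword adds (see inc_counter below).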
2894     return start;
2895   }
2896 
  // Utility routine for loading a 128-bit key word in little endian format.
  // The shuffle mask may optionally already be loaded into an XMM register.
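  // Rough sketch of the shuffle's effect (illustrative only; byteswap32 is a
  // hypothetical helper): every 32-bit word of the key is byte-reversed,
  //   for (int w = 0; w < 4; w++) dst32[w] = byteswap32(src32[w]);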
2899   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2900     __ movdqu(xmmdst, Address(key, offset));
2901     if (xmm_shuf_mask != NULL) {
2902       __ pshufb(xmmdst, xmm_shuf_mask);
2903     } else {
2904       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2905     }
2906   }
2907 
  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
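  // Rough C sketch of the add-with-carry performed below (illustrative only):
  //   uint64_t lo = ctr_lo + inc_delta;
  //   if (lo < ctr_lo) ctr_hi += 1;   // carry out of the low qword
  //   ctr_lo = lo;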
2909   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2910     __ pextrq(reg, xmmdst, 0x0);
2911     __ addq(reg, inc_delta);
2912     __ pinsrq(xmmdst, reg, 0x0);
2913     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2914     __ pextrq(reg, xmmdst, 0x01); // Carry
2915     __ addq(reg, 0x01);
2916     __ pinsrq(xmmdst, reg, 0x01); //Carry end
2917     __ BIND(next_block);          // next instruction
2918   }
2919 
2920   // Arguments:
2921   //
2922   // Inputs:
2923   //   c_rarg0   - source byte array address
2924   //   c_rarg1   - destination byte array address
2925   //   c_rarg2   - K (key) in little endian int array
2926   //
2927   address generate_aescrypt_encryptBlock() {
2928     assert(UseAES, "need AES instructions and misaligned SSE support");
2929     __ align(CodeEntryAlignment);
2930     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2931     Label L_doLast;
2932     address start = __ pc();
2933 
2934     const Register from        = c_rarg0;  // source array address
2935     const Register to          = c_rarg1;  // destination array address
2936     const Register key         = c_rarg2;  // key array address
2937     const Register keylen      = rax;
2938 
2939     const XMMRegister xmm_result = xmm0;
2940     const XMMRegister xmm_key_shuf_mask = xmm1;
2941     // On win64 xmm6-xmm15 must be preserved so don't use them.
2942     const XMMRegister xmm_temp1  = xmm2;
2943     const XMMRegister xmm_temp2  = xmm3;
2944     const XMMRegister xmm_temp3  = xmm4;
2945     const XMMRegister xmm_temp4  = xmm5;
2946 
2947     __ enter(); // required for proper stackwalking of RuntimeStub frame
2948 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
2952     if (VM_Version::supports_avx512vlbw()) {
2953       __ movl(rax, 0xffff);
2954       __ kmovql(k1, rax);
2955     }
2956 
2957     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
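    // (AES-128/192/256 use 10/12/14 rounds, i.e. 11/13/15 round keys of
    // 4 ints each, hence the 44/52/60 values checked below.)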
2958     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2959 
2960     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2961     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
2962 
    // For encryption, the Java expanded key ordering is just what we need.
    // We don't know if the key is aligned, hence we do not use the load-execute form.
2965 
2966     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2967     __ pxor(xmm_result, xmm_temp1);
2968 
2969     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2970     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2971     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2972     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2973 
2974     __ aesenc(xmm_result, xmm_temp1);
2975     __ aesenc(xmm_result, xmm_temp2);
2976     __ aesenc(xmm_result, xmm_temp3);
2977     __ aesenc(xmm_result, xmm_temp4);
2978 
2979     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2980     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2981     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2982     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2983 
2984     __ aesenc(xmm_result, xmm_temp1);
2985     __ aesenc(xmm_result, xmm_temp2);
2986     __ aesenc(xmm_result, xmm_temp3);
2987     __ aesenc(xmm_result, xmm_temp4);
2988 
2989     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2990     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2991 
2992     __ cmpl(keylen, 44);
2993     __ jccb(Assembler::equal, L_doLast);
2994 
2995     __ aesenc(xmm_result, xmm_temp1);
2996     __ aesenc(xmm_result, xmm_temp2);
2997 
2998     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2999     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3000 
3001     __ cmpl(keylen, 52);
3002     __ jccb(Assembler::equal, L_doLast);
3003 
3004     __ aesenc(xmm_result, xmm_temp1);
3005     __ aesenc(xmm_result, xmm_temp2);
3006 
3007     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3008     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3009 
3010     __ BIND(L_doLast);
3011     __ aesenc(xmm_result, xmm_temp1);
3012     __ aesenclast(xmm_result, xmm_temp2);
3013     __ movdqu(Address(to, 0), xmm_result);        // store the result
3014     __ xorptr(rax, rax); // return 0
3015     __ leave(); // required for proper stackwalking of RuntimeStub frame
3016     __ ret(0);
3017 
3018     return start;
3019   }
3020 
3021 
3022   // Arguments:
3023   //
3024   // Inputs:
3025   //   c_rarg0   - source byte array address
3026   //   c_rarg1   - destination byte array address
3027   //   c_rarg2   - K (key) in little endian int array
3028   //
3029   address generate_aescrypt_decryptBlock() {
3030     assert(UseAES, "need AES instructions and misaligned SSE support");
3031     __ align(CodeEntryAlignment);
3032     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3033     Label L_doLast;
3034     address start = __ pc();
3035 
3036     const Register from        = c_rarg0;  // source array address
3037     const Register to          = c_rarg1;  // destination array address
3038     const Register key         = c_rarg2;  // key array address
3039     const Register keylen      = rax;
3040 
3041     const XMMRegister xmm_result = xmm0;
3042     const XMMRegister xmm_key_shuf_mask = xmm1;
3043     // On win64 xmm6-xmm15 must be preserved so don't use them.
3044     const XMMRegister xmm_temp1  = xmm2;
3045     const XMMRegister xmm_temp2  = xmm3;
3046     const XMMRegister xmm_temp3  = xmm4;
3047     const XMMRegister xmm_temp4  = xmm5;
3048 
3049     __ enter(); // required for proper stackwalking of RuntimeStub frame
3050 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
3054     if (VM_Version::supports_avx512vlbw()) {
3055       __ movl(rax, 0xffff);
3056       __ kmovql(k1, rax);
3057     }
3058 
3059     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3060     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3061 
3062     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3063     __ movdqu(xmm_result, Address(from, 0));
3064 
    // For decryption, the Java expanded key ordering is rotated one position from
    // what we want, so we start from 0x10 here and hit 0x00 last.
    // We don't know if the key is aligned, hence we do not use the load-execute form.
3068     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3069     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3070     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3071     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3072 
3073     __ pxor  (xmm_result, xmm_temp1);
3074     __ aesdec(xmm_result, xmm_temp2);
3075     __ aesdec(xmm_result, xmm_temp3);
3076     __ aesdec(xmm_result, xmm_temp4);
3077 
3078     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3079     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3080     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3081     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3082 
3083     __ aesdec(xmm_result, xmm_temp1);
3084     __ aesdec(xmm_result, xmm_temp2);
3085     __ aesdec(xmm_result, xmm_temp3);
3086     __ aesdec(xmm_result, xmm_temp4);
3087 
3088     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3089     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3090     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3091 
3092     __ cmpl(keylen, 44);
3093     __ jccb(Assembler::equal, L_doLast);
3094 
3095     __ aesdec(xmm_result, xmm_temp1);
3096     __ aesdec(xmm_result, xmm_temp2);
3097 
3098     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3099     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3100 
3101     __ cmpl(keylen, 52);
3102     __ jccb(Assembler::equal, L_doLast);
3103 
3104     __ aesdec(xmm_result, xmm_temp1);
3105     __ aesdec(xmm_result, xmm_temp2);
3106 
3107     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3108     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3109 
3110     __ BIND(L_doLast);
3111     __ aesdec(xmm_result, xmm_temp1);
3112     __ aesdec(xmm_result, xmm_temp2);
3113 
    // For decryption, the aesdeclast operation always uses the key at offset 0x00
3115     __ aesdeclast(xmm_result, xmm_temp3);
3116     __ movdqu(Address(to, 0), xmm_result);  // store the result
3117     __ xorptr(rax, rax); // return 0
3118     __ leave(); // required for proper stackwalking of RuntimeStub frame
3119     __ ret(0);
3120 
3121     return start;
3122   }
3123 
3124 
3125   // Arguments:
3126   //
3127   // Inputs:
3128   //   c_rarg0   - source byte array address
3129   //   c_rarg1   - destination byte array address
3130   //   c_rarg2   - K (key) in little endian int array
3131   //   c_rarg3   - r vector byte array address
3132   //   c_rarg4   - input length
3133   //
3134   // Output:
3135   //   rax       - input length
3136   //
3137   address generate_cipherBlockChaining_encryptAESCrypt() {
3138     assert(UseAES, "need AES instructions and misaligned SSE support");
3139     __ align(CodeEntryAlignment);
3140     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3141     address start = __ pc();
3142 
3143     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3144     const Register from        = c_rarg0;  // source array address
3145     const Register to          = c_rarg1;  // destination array address
3146     const Register key         = c_rarg2;  // key array address
3147     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3148                                            // and left with the results of the last encryption block
3149 #ifndef _WIN64
3150     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3151 #else
3152     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r11;      // pick the volatile Windows register
3154 #endif
3155     const Register pos         = rax;
3156 
3157     // xmm register assignments for the loops below
3158     const XMMRegister xmm_result = xmm0;
3159     const XMMRegister xmm_temp   = xmm1;
3160     // keys 0-10 preloaded into xmm2-xmm12
3161     const int XMM_REG_NUM_KEY_FIRST = 2;
3162     const int XMM_REG_NUM_KEY_LAST  = 15;
3163     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3164     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3165     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3166     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3167     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3168 
3169     __ enter(); // required for proper stackwalking of RuntimeStub frame
3170 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
3174     if (VM_Version::supports_avx512vlbw()) {
3175       __ movl(rax, 0xffff);
3176       __ kmovql(k1, rax);
3177     }
3178 
3179 #ifdef _WIN64
3180     // on win64, fill len_reg from stack position
3181     __ movl(len_reg, len_mem);
3182 #else
3183     __ push(len_reg); // Save
3184 #endif
3185 
3186     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3187     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3188     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3189     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3190       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3191       offset += 0x10;
3192     }
3193     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3194 
    // now split into different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
3196     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3197     __ cmpl(rax, 44);
3198     __ jcc(Assembler::notEqual, L_key_192_256);
3199 
    // 128-bit code follows here
3201     __ movptr(pos, 0);
3202     __ align(OptoLoopAlignment);
3203 
3204     __ BIND(L_loopTop_128);
3205     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3206     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3207     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3208     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3209       __ aesenc(xmm_result, as_XMMRegister(rnum));
3210     }
3211     __ aesenclast(xmm_result, xmm_key10);
3212     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3213     // no need to store r to memory until we exit
3214     __ addptr(pos, AESBlockSize);
3215     __ subptr(len_reg, AESBlockSize);
3216     __ jcc(Assembler::notEqual, L_loopTop_128);
3217 
3218     __ BIND(L_exit);
3219     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3220 
3221 #ifdef _WIN64
3222     __ movl(rax, len_mem);
3223 #else
3224     __ pop(rax); // return length
3225 #endif
3226     __ leave(); // required for proper stackwalking of RuntimeStub frame
3227     __ ret(0);
3228 
3229     __ BIND(L_key_192_256);
3230     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3231     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3232     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3233     __ cmpl(rax, 52);
3234     __ jcc(Assembler::notEqual, L_key_256);
3235 
3236     // 192-bit code follows here (could be changed to use more xmm registers)
3237     __ movptr(pos, 0);
3238     __ align(OptoLoopAlignment);
3239 
3240     __ BIND(L_loopTop_192);
3241     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3242     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3243     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3244     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3245       __ aesenc(xmm_result, as_XMMRegister(rnum));
3246     }
3247     __ aesenclast(xmm_result, xmm_key12);
3248     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3249     // no need to store r to memory until we exit
3250     __ addptr(pos, AESBlockSize);
3251     __ subptr(len_reg, AESBlockSize);
3252     __ jcc(Assembler::notEqual, L_loopTop_192);
3253     __ jmp(L_exit);
3254 
3255     __ BIND(L_key_256);
3256     // 256-bit code follows here (could be changed to use more xmm registers)
3257     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3258     __ movptr(pos, 0);
3259     __ align(OptoLoopAlignment);
3260 
3261     __ BIND(L_loopTop_256);
3262     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3263     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3264     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3265     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3266       __ aesenc(xmm_result, as_XMMRegister(rnum));
3267     }
3268     load_key(xmm_temp, key, 0xe0);
3269     __ aesenclast(xmm_result, xmm_temp);
3270     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3271     // no need to store r to memory until we exit
3272     __ addptr(pos, AESBlockSize);
3273     __ subptr(len_reg, AESBlockSize);
3274     __ jcc(Assembler::notEqual, L_loopTop_256);
3275     __ jmp(L_exit);
3276 
3277     return start;
3278   }
3279 
3280   // Safefetch stubs.
3281   void generate_safefetch(const char* name, int size, address* entry,
3282                           address* fault_pc, address* continuation_pc) {
3283     // safefetch signatures:
3284     //   int      SafeFetch32(int*      adr, int      errValue);
3285     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3286     //
3287     // arguments:
3288     //   c_rarg0 = adr
3289     //   c_rarg1 = errValue
3290     //
3291     // result:
    //   rax = *adr or errValue
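    //
    // Rough sketch of the intended use (an assumption, not verified against the
    // callers or the signal handler): if the load at *fault_pc faults, execution
    // is resumed at *continuation_pc, so errValue (still in c_rarg1) is returned
    // instead of the loaded value.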
3293 
3294     StubCodeMark mark(this, "StubRoutines", name);
3295 
3296     // Entry point, pc or function descriptor.
3297     *entry = __ pc();
3298 
3299     // Load *adr into c_rarg1, may fault.
3300     *fault_pc = __ pc();
3301     switch (size) {
3302       case 4:
3303         // int32_t
3304         __ movl(c_rarg1, Address(c_rarg0, 0));
3305         break;
3306       case 8:
3307         // int64_t
3308         __ movq(c_rarg1, Address(c_rarg0, 0));
3309         break;
3310       default:
3311         ShouldNotReachHere();
3312     }
3313 
3314     // return errValue or *adr
3315     *continuation_pc = __ pc();
3316     __ movq(rax, c_rarg1);
3317     __ ret(0);
3318   }
3319 
3320   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3321   // to hide instruction latency
3322   //
3323   // Arguments:
3324   //
3325   // Inputs:
3326   //   c_rarg0   - source byte array address
3327   //   c_rarg1   - destination byte array address
3328   //   c_rarg2   - K (key) in little endian int array
3329   //   c_rarg3   - r vector byte array address
3330   //   c_rarg4   - input length
3331   //
3332   // Output:
3333   //   rax       - input length
3334   //
3335   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3336     assert(UseAES, "need AES instructions and misaligned SSE support");
3337     __ align(CodeEntryAlignment);
3338     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3339     address start = __ pc();
3340 
3341     const Register from        = c_rarg0;  // source array address
3342     const Register to          = c_rarg1;  // destination array address
3343     const Register key         = c_rarg2;  // key array address
3344     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3345                                            // and left with the results of the last encryption block
3346 #ifndef _WIN64
3347     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3348 #else
3349     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r11;      // pick the volatile Windows register
3351 #endif
3352     const Register pos         = rax;
3353 
3354     const int PARALLEL_FACTOR = 4;
3355     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3356 
3357     Label L_exit;
3358     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3359     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3360     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3361     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3362     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3363 
3364     // keys 0-10 preloaded into xmm5-xmm15
3365     const int XMM_REG_NUM_KEY_FIRST = 5;
3366     const int XMM_REG_NUM_KEY_LAST  = 15;
3367     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3368     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3369 
3370     __ enter(); // required for proper stackwalking of RuntimeStub frame
3371 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
3375     if (VM_Version::supports_avx512vlbw()) {
3376       __ movl(rax, 0xffff);
3377       __ kmovql(k1, rax);
3378     }
3379 
3380 #ifdef _WIN64
3381     // on win64, fill len_reg from stack position
3382     __ movl(len_reg, len_mem);
3383 #else
3384     __ push(len_reg); // Save
3385 #endif
3386     __ push(rbx);
    // The Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last.
3389     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3390     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3391     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3392     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3393       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3394       offset += 0x10;
3395     }
3396     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3397 
3398     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3399 
3400     // registers holding the four results in the parallelized loop
3401     const XMMRegister xmm_result0 = xmm0;
3402     const XMMRegister xmm_result1 = xmm2;
3403     const XMMRegister xmm_result2 = xmm3;
3404     const XMMRegister xmm_result3 = xmm4;
3405 
3406     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3407 
3408     __ xorptr(pos, pos);
3409 
    // now split into different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, or 60=256))
3411     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3412     __ cmpl(rbx, 52);
3413     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3414     __ cmpl(rbx, 60);
3415     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3416 
3417 #define DoFour(opc, src_reg)           \
3418   __ opc(xmm_result0, src_reg);         \
3419   __ opc(xmm_result1, src_reg);         \
3420   __ opc(xmm_result2, src_reg);         \
3421   __ opc(xmm_result3, src_reg);         \
3422 
3423     for (int k = 0; k < 3; ++k) {
3424       __ BIND(L_multiBlock_loopTopHead[k]);
3425       if (k != 0) {
3426         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3427         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3428       }
3429       if (k == 1) {
3430         __ subptr(rsp, 6 * wordSize);
3431         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3432         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3433         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3434         load_key(xmm1, key, 0xc0);  // 0xc0;
3435         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3436       } else if (k == 2) {
3437         __ subptr(rsp, 10 * wordSize);
3438         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3440         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3441         load_key(xmm1, key, 0xe0);  // 0xe0;
3442         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3443         load_key(xmm15, key, 0xb0); // 0xb0;
3444         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3445         load_key(xmm1, key, 0xc0);  // 0xc0;
3446         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3447       }
3448       __ align(OptoLoopAlignment);
3449       __ BIND(L_multiBlock_loopTop[k]);
3450       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3451       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3452 
3453       if  (k != 0) {
3454         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3455         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3456       }
3457 
3458       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3459       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3460       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3461       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3462 
3463       DoFour(pxor, xmm_key_first);
3464       if (k == 0) {
3465         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3466           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3467         }
3468         DoFour(aesdeclast, xmm_key_last);
3469       } else if (k == 1) {
3470         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3471           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3472         }
3473         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3474         DoFour(aesdec, xmm1);  // key : 0xc0
3475         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3476         DoFour(aesdeclast, xmm_key_last);
3477       } else if (k == 2) {
3478         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3479           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3480         }
3481         DoFour(aesdec, xmm1);  // key : 0xc0
3482         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3483         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3484         DoFour(aesdec, xmm15);  // key : 0xd0
3485         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3486         DoFour(aesdec, xmm1);  // key : 0xe0
3487         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3488         DoFour(aesdeclast, xmm_key_last);
3489       }
3490 
3491       // for each result, xor with the r vector of previous cipher block
3492       __ pxor(xmm_result0, xmm_prev_block_cipher);
3493       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3494       __ pxor(xmm_result1, xmm_prev_block_cipher);
3495       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3496       __ pxor(xmm_result2, xmm_prev_block_cipher);
3497       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3498       __ pxor(xmm_result3, xmm_prev_block_cipher);
3499       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3500       if (k != 0) {
3501         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3502       }
3503 
3504       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3505       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3506       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3507       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3508 
3509       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3510       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3511       __ jmp(L_multiBlock_loopTop[k]);
3512 
3513       // registers used in the non-parallelized loops
3514       // xmm register assignments for the loops below
3515       const XMMRegister xmm_result = xmm0;
3516       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3517       const XMMRegister xmm_key11 = xmm3;
3518       const XMMRegister xmm_key12 = xmm4;
3519       const XMMRegister key_tmp = xmm4;
3520 
3521       __ BIND(L_singleBlock_loopTopHead[k]);
3522       if (k == 1) {
3523         __ addptr(rsp, 6 * wordSize);
3524       } else if (k == 2) {
3525         __ addptr(rsp, 10 * wordSize);
3526       }
3527       __ cmpptr(len_reg, 0); // any blocks left??
3528       __ jcc(Assembler::equal, L_exit);
3529       __ BIND(L_singleBlock_loopTopHead2[k]);
3530       if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3533       }
3534       if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3536       }
3537       __ align(OptoLoopAlignment);
3538       __ BIND(L_singleBlock_loopTop[k]);
3539       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3540       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3541       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3542       for (int rnum = 1; rnum <= 9 ; rnum++) {
3543           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3544       }
3545       if (k == 1) {
3546         __ aesdec(xmm_result, xmm_key11);
3547         __ aesdec(xmm_result, xmm_key12);
3548       }
3549       if (k == 2) {
3550         __ aesdec(xmm_result, xmm_key11);
3551         load_key(key_tmp, key, 0xc0);
3552         __ aesdec(xmm_result, key_tmp);
3553         load_key(key_tmp, key, 0xd0);
3554         __ aesdec(xmm_result, key_tmp);
3555         load_key(key_tmp, key, 0xe0);
3556         __ aesdec(xmm_result, key_tmp);
3557       }
3558 
3559       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3560       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3561       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3562       // no need to store r to memory until we exit
3563       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3564       __ addptr(pos, AESBlockSize);
3565       __ subptr(len_reg, AESBlockSize);
3566       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3567       if (k != 2) {
3568         __ jmp(L_exit);
3569       }
3570     } //for 128/192/256
3571 
3572     __ BIND(L_exit);
3573     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3574     __ pop(rbx);
3575 #ifdef _WIN64
3576     __ movl(rax, len_mem);
3577 #else
3578     __ pop(rax); // return length
3579 #endif
3580     __ leave(); // required for proper stackwalking of RuntimeStub frame
3581     __ ret(0);
3582     return start;
3583 }
3584 
3585   address generate_upper_word_mask() {
3586     __ align(64);
3587     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3588     address start = __ pc();
3589     __ emit_data64(0x0000000000000000, relocInfo::none);
3590     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3591     return start;
3592   }
3593 
3594   address generate_shuffle_byte_flip_mask() {
3595     __ align(64);
3596     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3597     address start = __ pc();
3598     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3599     __ emit_data64(0x0001020304050607, relocInfo::none);
3600     return start;
3601   }
3602 
  // ofs and limit are used for multi-block byte arrays.
3604   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3605   address generate_sha1_implCompress(bool multi_block, const char *name) {
3606     __ align(CodeEntryAlignment);
3607     StubCodeMark mark(this, "StubRoutines", name);
3608     address start = __ pc();
3609 
3610     Register buf = c_rarg0;
3611     Register state = c_rarg1;
3612     Register ofs = c_rarg2;
3613     Register limit = c_rarg3;
3614 
3615     const XMMRegister abcd = xmm0;
3616     const XMMRegister e0 = xmm1;
3617     const XMMRegister e1 = xmm2;
3618     const XMMRegister msg0 = xmm3;
3619 
3620     const XMMRegister msg1 = xmm4;
3621     const XMMRegister msg2 = xmm5;
3622     const XMMRegister msg3 = xmm6;
3623     const XMMRegister shuf_mask = xmm7;
3624 
3625     __ enter();
3626 
3627     __ subptr(rsp, 4 * wordSize);
3628 
3629     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3630       buf, state, ofs, limit, rsp, multi_block);
3631 
3632     __ addptr(rsp, 4 * wordSize);
3633 
3634     __ leave();
3635     __ ret(0);
3636     return start;
3637   }
3638 
3639   address generate_pshuffle_byte_flip_mask() {
3640     __ align(64);
3641     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3642     address start = __ pc();
3643     __ emit_data64(0x0405060700010203, relocInfo::none);
3644     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3645 
3646     if (VM_Version::supports_avx2()) {
3647       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3648       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3649       // _SHUF_00BA
3650       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3651       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3652       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3653       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3654       // _SHUF_DC00
3655       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3656       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3657       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3658       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3659     }
3660 
3661     return start;
3662   }
3663 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3665   address generate_pshuffle_byte_flip_mask_sha512() {
3666     __ align(32);
3667     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3668     address start = __ pc();
3669     if (VM_Version::supports_avx2()) {
3670       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3671       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3672       __ emit_data64(0x1011121314151617, relocInfo::none);
3673       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3674       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3675       __ emit_data64(0x0000000000000000, relocInfo::none);
3676       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3677       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3678     }
3679 
3680     return start;
3681   }
3682 
  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3685   address generate_sha256_implCompress(bool multi_block, const char *name) {
3686     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3687     __ align(CodeEntryAlignment);
3688     StubCodeMark mark(this, "StubRoutines", name);
3689     address start = __ pc();
3690 
3691     Register buf = c_rarg0;
3692     Register state = c_rarg1;
3693     Register ofs = c_rarg2;
3694     Register limit = c_rarg3;
3695 
3696     const XMMRegister msg = xmm0;
3697     const XMMRegister state0 = xmm1;
3698     const XMMRegister state1 = xmm2;
3699     const XMMRegister msgtmp0 = xmm3;
3700 
3701     const XMMRegister msgtmp1 = xmm4;
3702     const XMMRegister msgtmp2 = xmm5;
3703     const XMMRegister msgtmp3 = xmm6;
3704     const XMMRegister msgtmp4 = xmm7;
3705 
3706     const XMMRegister shuf_mask = xmm8;
3707 
3708     __ enter();
3709 
3710     __ subptr(rsp, 4 * wordSize);
3711 
3712     if (VM_Version::supports_sha()) {
3713       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3714         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3715     } else if (VM_Version::supports_avx2()) {
3716       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3717         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3718     }
3719     __ addptr(rsp, 4 * wordSize);
3720     __ vzeroupper();
3721     __ leave();
3722     __ ret(0);
3723     return start;
3724   }
3725 
3726   address generate_sha512_implCompress(bool multi_block, const char *name) {
3727     assert(VM_Version::supports_avx2(), "");
3728     assert(VM_Version::supports_bmi2(), "");
3729     __ align(CodeEntryAlignment);
3730     StubCodeMark mark(this, "StubRoutines", name);
3731     address start = __ pc();
3732 
3733     Register buf = c_rarg0;
3734     Register state = c_rarg1;
3735     Register ofs = c_rarg2;
3736     Register limit = c_rarg3;
3737 
3738     const XMMRegister msg = xmm0;
3739     const XMMRegister state0 = xmm1;
3740     const XMMRegister state1 = xmm2;
3741     const XMMRegister msgtmp0 = xmm3;
3742     const XMMRegister msgtmp1 = xmm4;
3743     const XMMRegister msgtmp2 = xmm5;
3744     const XMMRegister msgtmp3 = xmm6;
3745     const XMMRegister msgtmp4 = xmm7;
3746 
3747     const XMMRegister shuf_mask = xmm8;
3748 
3749     __ enter();
3750 
3751     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3752     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3753 
3754     __ vzeroupper();
3755     __ leave();
3756     __ ret(0);
3757     return start;
3758   }
3759 
3760   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3761   // to hide instruction latency
3762   //
3763   // Arguments:
3764   //
3765   // Inputs:
3766   //   c_rarg0   - source byte array address
3767   //   c_rarg1   - destination byte array address
3768   //   c_rarg2   - K (key) in little endian int array
3769   //   c_rarg3   - counter vector byte array address
3770   //   Linux
3771   //     c_rarg4   -          input length
3772   //     c_rarg5   -          saved encryptedCounter start
3773   //     rbp + 6 * wordSize - saved used length
3774   //   Windows
3775   //     rbp + 6 * wordSize - input length
3776   //     rbp + 7 * wordSize - saved encryptedCounter start
3777   //     rbp + 8 * wordSize - saved used length
3778   //
3779   // Output:
3780   //   rax       - input length
3781   //
3782   address generate_counterMode_AESCrypt_Parallel() {
3783     assert(UseAES, "need AES instructions and misaligned SSE support");
3784     __ align(CodeEntryAlignment);
3785     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3786     address start = __ pc();
3787     const Register from = c_rarg0; // source array address
3788     const Register to = c_rarg1; // destination array address
3789     const Register key = c_rarg2; // key array address
3790     const Register counter = c_rarg3; // counter byte array initialized from counter array address
3791                                       // and updated with the incremented counter in the end
3792 #ifndef _WIN64
3793     const Register len_reg = c_rarg4;
3794     const Register saved_encCounter_start = c_rarg5;
3795     const Register used_addr = r10;
3796     const Address  used_mem(rbp, 2 * wordSize);
3797     const Register used = r11;
3798 #else
3799     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
    const Register len_reg = r10; // pick the first volatile Windows register
3803     const Register saved_encCounter_start = r11;
3804     const Register used_addr = r13;
3805     const Register used = r14;
3806 #endif
3807     const Register pos = rax;
3808 
3809     const int PARALLEL_FACTOR = 6;
3810     const XMMRegister xmm_counter_shuf_mask = xmm0;
3811     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3812     const XMMRegister xmm_curr_counter = xmm2;
3813 
3814     const XMMRegister xmm_key_tmp0 = xmm3;
3815     const XMMRegister xmm_key_tmp1 = xmm4;
3816 
3817     // registers holding the four results in the parallelized loop
3818     const XMMRegister xmm_result0 = xmm5;
3819     const XMMRegister xmm_result1 = xmm6;
3820     const XMMRegister xmm_result2 = xmm7;
3821     const XMMRegister xmm_result3 = xmm8;
3822     const XMMRegister xmm_result4 = xmm9;
3823     const XMMRegister xmm_result5 = xmm10;
3824 
3825     const XMMRegister xmm_from0 = xmm11;
3826     const XMMRegister xmm_from1 = xmm12;
3827     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on Win64
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4: xmm_key_tmp0/1 are no longer needed once the input text is loaded
3830     const XMMRegister xmm_from5 = xmm4;
3831 
3832     //for key_128, key_192, key_256
3833     const int rounds[3] = {10, 12, 14};
3834     Label L_exit_preLoop, L_preLoop_start;
3835     Label L_multiBlock_loopTop[3];
3836     Label L_singleBlockLoopTop[3];
3837     Label L__incCounter[3][6]; //for 6 blocks
3838     Label L__incCounter_single[3]; //for single block, key128, key192, key256
3839     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3840     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3841 
3842     Label L_exit;
3843 
3844     __ enter(); // required for proper stackwalking of RuntimeStub frame
3845 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
3849     if (VM_Version::supports_avx512vlbw()) {
3850         __ movl(rax, 0xffff);
3851         __ kmovql(k1, rax);
3852     }
3853 
3854 #ifdef _WIN64
3855     // allocate spill slots for r13, r14
3856     enum {
3857         saved_r13_offset,
3858         saved_r14_offset
3859     };
3860     __ subptr(rsp, 2 * wordSize);
3861     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3862     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
3863 
3864     // on win64, fill len_reg from stack position
3865     __ movl(len_reg, len_mem);
3866     __ movptr(saved_encCounter_start, saved_encCounter_mem);
3867     __ movptr(used_addr, used_mem);
3868     __ movl(used, Address(used_addr, 0));
3869 #else
3870     __ push(len_reg); // Save
3871     __ movptr(used_addr, used_mem);
3872     __ movl(used, Address(used_addr, 0));
3873 #endif
3874 
3875     __ push(rbx); // Save RBX
3876     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3877     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
3878     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
3879     __ movptr(pos, 0);
3880 
    // Use up the partially used encrypted counter (keystream) left over from the last invocation
3882     __ BIND(L_preLoop_start);
3883     __ cmpptr(used, 16);
3884     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
3885       __ cmpptr(len_reg, 0);
3886       __ jcc(Assembler::lessEqual, L_exit_preLoop);
3887       __ movb(rbx, Address(saved_encCounter_start, used));
3888       __ xorb(rbx, Address(from, pos));
3889       __ movb(Address(to, pos), rbx);
3890       __ addptr(pos, 1);
3891       __ addptr(used, 1);
3892       __ subptr(len_reg, 1);
3893 
3894     __ jmp(L_preLoop_start);
3895 
3896     __ BIND(L_exit_preLoop);
3897     __ movl(Address(used_addr, 0), used);
3898 
3899     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
3900     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
3901     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3902     __ cmpl(rbx, 52);
3903     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
3904     __ cmpl(rbx, 60);
3905     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
3906 
3907 #define CTR_DoSix(opc, src_reg)                \
3908     __ opc(xmm_result0, src_reg);              \
3909     __ opc(xmm_result1, src_reg);              \
3910     __ opc(xmm_result2, src_reg);              \
3911     __ opc(xmm_result3, src_reg);              \
3912     __ opc(xmm_result4, src_reg);              \
3913     __ opc(xmm_result5, src_reg);
3914 
3915     // k == 0 :  generate code for key_128
3916     // k == 1 :  generate code for key_192
3917     // k == 2 :  generate code for key_256
3918     for (int k = 0; k < 3; ++k) {
      // multi-block loop starts here
3920       __ align(OptoLoopAlignment);
3921       __ BIND(L_multiBlock_loopTop[k]);
3922       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
3923       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
3924       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
3925 
3926       //load, then increase counters
3927       CTR_DoSix(movdqa, xmm_curr_counter);
3928       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
3929       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
3930       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
3931       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
3932       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
3933       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
3935       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
3936 
3937       //load two ROUND_KEYs at a time
3938       for (int i = 1; i < rounds[k]; ) {
3939         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
3940         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
3941         CTR_DoSix(aesenc, xmm_key_tmp1);
3942         i++;
3943         if (i != rounds[k]) {
3944           CTR_DoSix(aesenc, xmm_key_tmp0);
3945         } else {
3946           CTR_DoSix(aesenclast, xmm_key_tmp0);
3947         }
3948         i++;
3949       }
3950 
3951       // get next PARALLEL_FACTOR blocks into xmm_result registers
3952       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3953       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3954       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3955       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3956       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
3957       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
3958 
3959       __ pxor(xmm_result0, xmm_from0);
3960       __ pxor(xmm_result1, xmm_from1);
3961       __ pxor(xmm_result2, xmm_from2);
3962       __ pxor(xmm_result3, xmm_from3);
3963       __ pxor(xmm_result4, xmm_from4);
3964       __ pxor(xmm_result5, xmm_from5);
3965 
      // store 6 results into the next 96 bytes of output
3967       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
3968       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3969       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3970       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3971       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
3972       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
3973 
      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position by the 6 blocks just processed
3975       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
3976       __ jmp(L_multiBlock_loopTop[k]);
3977 
3978       // singleBlock starts here
3979       __ align(OptoLoopAlignment);
3980       __ BIND(L_singleBlockLoopTop[k]);
3981       __ cmpptr(len_reg, 0);
3982       __ jcc(Assembler::lessEqual, L_exit);
3983       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
3984       __ movdqa(xmm_result0, xmm_curr_counter);
3985       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
3986       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
3987       __ pxor(xmm_result0, xmm_key_tmp0);
3988       for (int i = 1; i < rounds[k]; i++) {
3989         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
3990         __ aesenc(xmm_result0, xmm_key_tmp0);
3991       }
3992       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
3993       __ aesenclast(xmm_result0, xmm_key_tmp0);
3994       __ cmpptr(len_reg, AESBlockSize);
3995       __ jcc(Assembler::less, L_processTail_insr[k]);
3996         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3997         __ pxor(xmm_result0, xmm_from0);
3998         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
3999         __ addptr(pos, AESBlockSize);
4000         __ subptr(len_reg, AESBlockSize);
4001         __ jmp(L_singleBlockLoopTop[k]);
4002       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4003         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4004         __ testptr(len_reg, 8);
4005         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4006           __ subptr(pos,8);
4007           __ pinsrq(xmm_from0, Address(from, pos), 0);
4008         __ BIND(L_processTail_4_insr[k]);
4009         __ testptr(len_reg, 4);
4010         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4011           __ subptr(pos,4);
4012           __ pslldq(xmm_from0, 4);
4013           __ pinsrd(xmm_from0, Address(from, pos), 0);
4014         __ BIND(L_processTail_2_insr[k]);
4015         __ testptr(len_reg, 2);
4016         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4017           __ subptr(pos, 2);
4018           __ pslldq(xmm_from0, 2);
4019           __ pinsrw(xmm_from0, Address(from, pos), 0);
4020         __ BIND(L_processTail_1_insr[k]);
4021         __ testptr(len_reg, 1);
4022         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4023           __ subptr(pos, 1);
4024           __ pslldq(xmm_from0, 1);
4025           __ pinsrb(xmm_from0, Address(from, pos), 0);
4026         __ BIND(L_processTail_exit_insr[k]);
4027 
        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Save the encrypted counter for the next invocation,
        __ pxor(xmm_result0, xmm_from0);                             //    then PXOR it with the plaintext bytes.
4030 
4031         __ testptr(len_reg, 8);
4032         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4033           __ pextrq(Address(to, pos), xmm_result0, 0);
4034           __ psrldq(xmm_result0, 8);
4035           __ addptr(pos, 8);
4036         __ BIND(L_processTail_4_extr[k]);
4037         __ testptr(len_reg, 4);
4038         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4039           __ pextrd(Address(to, pos), xmm_result0, 0);
4040           __ psrldq(xmm_result0, 4);
4041           __ addptr(pos, 4);
4042         __ BIND(L_processTail_2_extr[k]);
4043         __ testptr(len_reg, 2);
4044         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4045           __ pextrw(Address(to, pos), xmm_result0, 0);
4046           __ psrldq(xmm_result0, 2);
4047           __ addptr(pos, 2);
4048         __ BIND(L_processTail_1_extr[k]);
4049         __ testptr(len_reg, 1);
4050         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4051           __ pextrb(Address(to, pos), xmm_result0, 0);
4052 
4053         __ BIND(L_processTail_exit_extr[k]);
4054         __ movl(Address(used_addr, 0), len_reg);
4055         __ jmp(L_exit);
4056 
4057     }
4058 
4059     __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // shuffle the counter back to its original byte order
    __ movdqu(Address(counter, 0), xmm_curr_counter);   // save the counter back to memory
4062     __ pop(rbx); // pop the saved RBX.
4063 #ifdef _WIN64
4064     __ movl(rax, len_mem);
4065     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4066     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4067     __ addptr(rsp, 2 * wordSize);
4068 #else
4069     __ pop(rax); // return 'len'
4070 #endif
4071     __ leave(); // required for proper stackwalking of RuntimeStub frame
4072     __ ret(0);
4073     return start;
4074   }
4075 
4076   // byte swap x86 long
4077   address generate_ghash_long_swap_mask() {
4078     __ align(CodeEntryAlignment);
4079     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4080     address start = __ pc();
4081     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4082     __ emit_data64(0x0706050403020100, relocInfo::none );
    return start;
4084   }
4085 
4086   // byte swap x86 byte array
4087   address generate_ghash_byte_swap_mask() {
4088     __ align(CodeEntryAlignment);
4089     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4090     address start = __ pc();
4091     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4092     __ emit_data64(0x0001020304050607, relocInfo::none );
    return start;
4094   }
4095 
4096   /* Single and multi-block ghash operations */
4097   address generate_ghash_processBlocks() {
4098     __ align(CodeEntryAlignment);
4099     Label L_ghash_loop, L_exit;
4100     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4101     address start = __ pc();
4102 
4103     const Register state        = c_rarg0;
4104     const Register subkeyH      = c_rarg1;
4105     const Register data         = c_rarg2;
4106     const Register blocks       = c_rarg3;
4107 
4108     const XMMRegister xmm_temp0 = xmm0;
4109     const XMMRegister xmm_temp1 = xmm1;
4110     const XMMRegister xmm_temp2 = xmm2;
4111     const XMMRegister xmm_temp3 = xmm3;
4112     const XMMRegister xmm_temp4 = xmm4;
4113     const XMMRegister xmm_temp5 = xmm5;
4114     const XMMRegister xmm_temp6 = xmm6;
4115     const XMMRegister xmm_temp7 = xmm7;
4116     const XMMRegister xmm_temp8 = xmm8;
4117     const XMMRegister xmm_temp9 = xmm9;
4118     const XMMRegister xmm_temp10 = xmm10;
4119 
4120     __ enter();
4121 
    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
    // context for the registers used, where all instructions below use 128-bit mode.
    // On EVEX without VL and BW, these instructions will all be AVX.
4125     if (VM_Version::supports_avx512vlbw()) {
4126       __ movl(rax, 0xffff);
4127       __ kmovql(k1, rax);
4128     }
4129 
4130     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4131 
4132     __ movdqu(xmm_temp0, Address(state, 0));
4133     __ pshufb(xmm_temp0, xmm_temp10);
4134 
4135 
4136     __ BIND(L_ghash_loop);
4137     __ movdqu(xmm_temp2, Address(data, 0));
4138     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4139 
4140     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4141     __ pshufb(xmm_temp1, xmm_temp10);
4142 
4143     __ pxor(xmm_temp0, xmm_temp2);
4144 
4145     //
4146     // Multiply with the hash key
4147     //
4148     __ movdqu(xmm_temp3, xmm_temp0);
4149     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4150     __ movdqu(xmm_temp4, xmm_temp0);
4151     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4152 
4153     __ movdqu(xmm_temp5, xmm_temp0);
4154     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4155     __ movdqu(xmm_temp6, xmm_temp0);
4156     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4157 
4158     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4159 
4160     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4163     __ pxor(xmm_temp3, xmm_temp5);
4164     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4165                                         // of the carry-less multiplication of
4166                                         // xmm0 by xmm1.
4167 
    // We shift the result of the multiplication by one bit position
    // to the left to compensate for the fact that the bits are reversed.
4170     __ movdqu(xmm_temp7, xmm_temp3);
4171     __ movdqu(xmm_temp8, xmm_temp6);
4172     __ pslld(xmm_temp3, 1);
4173     __ pslld(xmm_temp6, 1);
4174     __ psrld(xmm_temp7, 31);
4175     __ psrld(xmm_temp8, 31);
4176     __ movdqu(xmm_temp9, xmm_temp7);
4177     __ pslldq(xmm_temp8, 4);
4178     __ pslldq(xmm_temp7, 4);
4179     __ psrldq(xmm_temp9, 12);
4180     __ por(xmm_temp3, xmm_temp7);
4181     __ por(xmm_temp6, xmm_temp8);
4182     __ por(xmm_temp6, xmm_temp9);
4183 
4184     //
4185     // First phase of the reduction
4186     //
4187     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4188     // independently.
4189     __ movdqu(xmm_temp7, xmm_temp3);
4190     __ movdqu(xmm_temp8, xmm_temp3);
4191     __ movdqu(xmm_temp9, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed left shift, << 31
    __ pslld(xmm_temp8, 30);    // packed left shift, << 30
    __ pslld(xmm_temp9, 25);    // packed left shift, << 25
4195     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4196     __ pxor(xmm_temp7, xmm_temp9);
4197     __ movdqu(xmm_temp8, xmm_temp7);
4198     __ pslldq(xmm_temp7, 12);
4199     __ psrldq(xmm_temp8, 4);
4200     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4201 
4202     //
4203     // Second phase of the reduction
4204     //
4205     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4206     // shift operations.
4207     __ movdqu(xmm_temp2, xmm_temp3);
4208     __ movdqu(xmm_temp4, xmm_temp3);
4209     __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
    __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
    __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
4213     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4214     __ pxor(xmm_temp2, xmm_temp5);
4215     __ pxor(xmm_temp2, xmm_temp8);
4216     __ pxor(xmm_temp3, xmm_temp2);
4217     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4218 
4219     __ decrement(blocks);
4220     __ jcc(Assembler::zero, L_exit);
4221     __ movdqu(xmm_temp0, xmm_temp6);
4222     __ addptr(data, 16);
4223     __ jmp(L_ghash_loop);
4224 
4225     __ BIND(L_exit);
4226     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4227     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4228     __ leave();
4229     __ ret(0);
4230     return start;
4231   }
4232 
4233   /**
4234    *  Arguments:
4235    *
4236    * Inputs:
4237    *   c_rarg0   - int crc
4238    *   c_rarg1   - byte* buf
4239    *   c_rarg2   - int length
4240    *
   * Output:
4242    *       rax   - int crc result
4243    */
4244   address generate_updateBytesCRC32() {
4245     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4246 
4247     __ align(CodeEntryAlignment);
4248     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4249 
4250     address start = __ pc();
4251     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4252     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4253     // rscratch1: r10
4254     const Register crc   = c_rarg0;  // crc
4255     const Register buf   = c_rarg1;  // source java byte array address
4256     const Register len   = c_rarg2;  // length
4257     const Register table = c_rarg3;  // crc_table address (reuse register)
4258     const Register tmp   = r11;
4259     assert_different_registers(crc, buf, len, table, tmp, rax);
4260 
4261     BLOCK_COMMENT("Entry:");
4262     __ enter(); // required for proper stackwalking of RuntimeStub frame
4263 
4264     __ kernel_crc32(crc, buf, len, table, tmp);
4265 
4266     __ movl(rax, crc);
4267     __ vzeroupper();
4268     __ leave(); // required for proper stackwalking of RuntimeStub frame
4269     __ ret(0);
4270 
4271     return start;
4272   }
4273 
4274   /**
4275   *  Arguments:
4276   *
4277   * Inputs:
4278   *   c_rarg0   - int crc
4279   *   c_rarg1   - byte* buf
4280   *   c_rarg2   - long length
4281   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4282   *              not used by x86 algorithm)
4283   *
  * Output:
4285   *       rax   - int crc result
4286   */
4287   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4288       assert(UseCRC32CIntrinsics, "need SSE4_2");
4289       __ align(CodeEntryAlignment);
4290       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4291       address start = __ pc();
4292       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
4293       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
4294       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
4295       const Register crc = c_rarg0;  // crc
4296       const Register buf = c_rarg1;  // source java byte array address
4297       const Register len = c_rarg2;  // length
4298       const Register a = rax;
4299       const Register j = r9;
4300       const Register k = r10;
4301       const Register l = r11;
4302 #ifdef _WIN64
4303       const Register y = rdi;
4304       const Register z = rsi;
4305 #else
4306       const Register y = rcx;
4307       const Register z = r8;
4308 #endif
4309       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4310 
4311       BLOCK_COMMENT("Entry:");
4312       __ enter(); // required for proper stackwalking of RuntimeStub frame
4313 #ifdef _WIN64
4314       __ push(y);
4315       __ push(z);
4316 #endif
4317       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4318                               a, j, k,
4319                               l, y, z,
4320                               c_farg0, c_farg1, c_farg2,
4321                               is_pclmulqdq_supported);
4322       __ movl(rax, crc);
4323 #ifdef _WIN64
4324       __ pop(z);
4325       __ pop(y);
4326 #endif
4327       __ vzeroupper();
4328       __ leave(); // required for proper stackwalking of RuntimeStub frame
4329       __ ret(0);
4330 
4331       return start;
4332   }
4333 
4334   /**
4335    *  Arguments:
4336    *
4337    *  Input:
4338    *    c_rarg0   - x address
4339    *    c_rarg1   - x length
4340    *    c_rarg2   - y address
4341    *    c_rarg3   - y length
4342    * not Win64
4343    *    c_rarg4   - z address
4344    *    c_rarg5   - z length
4345    * Win64
4346    *    rsp+40    - z address
4347    *    rsp+48    - z length
4348    */
4349   address generate_multiplyToLen() {
4350     __ align(CodeEntryAlignment);
4351     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4352 
4353     address start = __ pc();
4354     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4355     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4356     const Register x     = rdi;
4357     const Register xlen  = rax;
4358     const Register y     = rsi;
4359     const Register ylen  = rcx;
4360     const Register z     = r8;
4361     const Register zlen  = r11;
4362 
4363     // Next registers will be saved on stack in multiply_to_len().
4364     const Register tmp1  = r12;
4365     const Register tmp2  = r13;
4366     const Register tmp3  = r14;
4367     const Register tmp4  = r15;
4368     const Register tmp5  = rbx;
4369 
4370     BLOCK_COMMENT("Entry:");
4371     __ enter(); // required for proper stackwalking of RuntimeStub frame
4372 
4373 #ifndef _WIN64
4374     __ movptr(zlen, r9); // Save r9 in r11 - zlen
4375 #endif
4376     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
4377                        // ylen => rcx, z => r8, zlen => r11
4378                        // r9 and r10 may be used to save non-volatile registers
4379 #ifdef _WIN64
4380     // last 2 arguments (#4, #5) are on stack on Win64
4381     __ movptr(z, Address(rsp, 6 * wordSize));
4382     __ movptr(zlen, Address(rsp, 7 * wordSize));
4383 #endif
4384 
4385     __ movptr(xlen, rsi);
4386     __ movptr(y,    rdx);
4387     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
4388 
4389     restore_arg_regs();
4390 
4391     __ leave(); // required for proper stackwalking of RuntimeStub frame
4392     __ ret(0);
4393 
4394     return start;
4395   }
4396 
4397   /**
4398   *  Arguments:
4399   *
4400   *  Input:
4401   *    c_rarg0   - obja     address
4402   *    c_rarg1   - objb     address
  *    c_rarg2   - length   number of elements to compare
  *    c_rarg3   - scale    log2 of the element size (log2_array_indxscale)
  *
  *  Output:
  *        rax   - index of the first mismatch if >= 0; otherwise the bitwise
  *                complement of the number of tail elements still to be checked
4408   */
4409   address generate_vectorizedMismatch() {
4410     __ align(CodeEntryAlignment);
4411     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
4412     address start = __ pc();
4413 
4414     BLOCK_COMMENT("Entry:");
4415     __ enter();
4416 
4417 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4418     const Register scale = c_rarg0;  //rcx, will exchange with r9
4419     const Register objb = c_rarg1;   //rdx
4420     const Register length = c_rarg2; //r8
4421     const Register obja = c_rarg3;   //r9
4422     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4423 
4424     const Register tmp1 = r10;
4425     const Register tmp2 = r11;
4426 #endif
4427 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4428     const Register obja = c_rarg0;   //U:rdi
4429     const Register objb = c_rarg1;   //U:rsi
4430     const Register length = c_rarg2; //U:rdx
4431     const Register scale = c_rarg3;  //U:rcx
4432     const Register tmp1 = r8;
4433     const Register tmp2 = r9;
4434 #endif
4435     const Register result = rax; //return value
4436     const XMMRegister vec0 = xmm0;
4437     const XMMRegister vec1 = xmm1;
4438     const XMMRegister vec2 = xmm2;
4439 
4440     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4441 
4442     __ vzeroupper();
4443     __ leave();
4444     __ ret(0);
4445 
4446     return start;
4447   }
4448 
4449 /**
4450    *  Arguments:
4451    *
4452   //  Input:
4453   //    c_rarg0   - x address
4454   //    c_rarg1   - x length
4455   //    c_rarg2   - z address
4456   //    c_rarg3   - z lenth
4457    *
4458    */
4459   address generate_squareToLen() {
4460 
4461     __ align(CodeEntryAlignment);
4462     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4463 
4464     address start = __ pc();
4465     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4466     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
4467     const Register x      = rdi;
4468     const Register len    = rsi;
4469     const Register z      = r8;
4470     const Register zlen   = rcx;
4471 
    const Register tmp1      = r12;
    const Register tmp2      = r13;
    const Register tmp3      = r14;
    const Register tmp4      = r15;
    const Register tmp5      = rbx;
4477 
4478     BLOCK_COMMENT("Entry:");
4479     __ enter(); // required for proper stackwalking of RuntimeStub frame
4480 
    setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
                       // zlen => rcx
                       // r9 and r10 may be used to save non-volatile registers
4484     __ movptr(r8, rdx);
4485     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4486 
4487     restore_arg_regs();
4488 
4489     __ leave(); // required for proper stackwalking of RuntimeStub frame
4490     __ ret(0);
4491 
4492     return start;
4493   }
4494 
4495    /**
4496    *  Arguments:
4497    *
4498    *  Input:
4499    *    c_rarg0   - out address
4500    *    c_rarg1   - in address
4501    *    c_rarg2   - offset
4502    *    c_rarg3   - len
4503    * not Win64
4504    *    c_rarg4   - k
4505    * Win64
4506    *    rsp+40    - k
4507    */
4508   address generate_mulAdd() {
4509     __ align(CodeEntryAlignment);
4510     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4511 
4512     address start = __ pc();
4513     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4514     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4515     const Register out     = rdi;
4516     const Register in      = rsi;
4517     const Register offset  = r11;
4518     const Register len     = rcx;
4519     const Register k       = r8;
4520 
4521     // Next registers will be saved on stack in mul_add().
4522     const Register tmp1  = r12;
4523     const Register tmp2  = r13;
4524     const Register tmp3  = r14;
4525     const Register tmp4  = r15;
4526     const Register tmp5  = rbx;
4527 
4528     BLOCK_COMMENT("Entry:");
4529     __ enter(); // required for proper stackwalking of RuntimeStub frame
4530 
4531     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
4532                        // len => rcx, k => r8
4533                        // r9 and r10 may be used to save non-volatile registers
4534 #ifdef _WIN64
4535     // last argument is on stack on Win64
4536     __ movl(k, Address(rsp, 6 * wordSize));
4537 #endif
4538     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
4539     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4540 
4541     restore_arg_regs();
4542 
4543     __ leave(); // required for proper stackwalking of RuntimeStub frame
4544     __ ret(0);
4545 
4546     return start;
4547   }
4548 
4549   address generate_libmExp() {
4550     StubCodeMark mark(this, "StubRoutines", "libmExp");
4551 
4552     address start = __ pc();
4553 
4554     const XMMRegister x0  = xmm0;
4555     const XMMRegister x1  = xmm1;
4556     const XMMRegister x2  = xmm2;
4557     const XMMRegister x3  = xmm3;
4558 
4559     const XMMRegister x4  = xmm4;
4560     const XMMRegister x5  = xmm5;
4561     const XMMRegister x6  = xmm6;
4562     const XMMRegister x7  = xmm7;
4563 
4564     const Register tmp   = r11;
4565 
4566     BLOCK_COMMENT("Entry:");
4567     __ enter(); // required for proper stackwalking of RuntimeStub frame
4568 
4569     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4570 
4571     __ leave(); // required for proper stackwalking of RuntimeStub frame
4572     __ ret(0);
4573 
4574     return start;
4575 
4576   }
4577 
4578   address generate_libmLog() {
4579     StubCodeMark mark(this, "StubRoutines", "libmLog");
4580 
4581     address start = __ pc();
4582 
4583     const XMMRegister x0 = xmm0;
4584     const XMMRegister x1 = xmm1;
4585     const XMMRegister x2 = xmm2;
4586     const XMMRegister x3 = xmm3;
4587 
4588     const XMMRegister x4 = xmm4;
4589     const XMMRegister x5 = xmm5;
4590     const XMMRegister x6 = xmm6;
4591     const XMMRegister x7 = xmm7;
4592 
4593     const Register tmp1 = r11;
4594     const Register tmp2 = r8;
4595 
4596     BLOCK_COMMENT("Entry:");
4597     __ enter(); // required for proper stackwalking of RuntimeStub frame
4598 
4599     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4600 
4601     __ leave(); // required for proper stackwalking of RuntimeStub frame
4602     __ ret(0);
4603 
4604     return start;
4605 
4606   }
4607 
4608   address generate_libmLog10() {
4609     StubCodeMark mark(this, "StubRoutines", "libmLog10");
4610 
4611     address start = __ pc();
4612 
4613     const XMMRegister x0 = xmm0;
4614     const XMMRegister x1 = xmm1;
4615     const XMMRegister x2 = xmm2;
4616     const XMMRegister x3 = xmm3;
4617 
4618     const XMMRegister x4 = xmm4;
4619     const XMMRegister x5 = xmm5;
4620     const XMMRegister x6 = xmm6;
4621     const XMMRegister x7 = xmm7;
4622 
4623     const Register tmp = r11;
4624 
4625     BLOCK_COMMENT("Entry:");
4626     __ enter(); // required for proper stackwalking of RuntimeStub frame
4627 
4628     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4629 
4630     __ leave(); // required for proper stackwalking of RuntimeStub frame
4631     __ ret(0);
4632 
4633     return start;
4634 
4635   }
4636 
4637   address generate_libmPow() {
4638     StubCodeMark mark(this, "StubRoutines", "libmPow");
4639 
4640     address start = __ pc();
4641 
4642     const XMMRegister x0 = xmm0;
4643     const XMMRegister x1 = xmm1;
4644     const XMMRegister x2 = xmm2;
4645     const XMMRegister x3 = xmm3;
4646 
4647     const XMMRegister x4 = xmm4;
4648     const XMMRegister x5 = xmm5;
4649     const XMMRegister x6 = xmm6;
4650     const XMMRegister x7 = xmm7;
4651 
4652     const Register tmp1 = r8;
4653     const Register tmp2 = r9;
4654     const Register tmp3 = r10;
4655     const Register tmp4 = r11;
4656 
4657     BLOCK_COMMENT("Entry:");
4658     __ enter(); // required for proper stackwalking of RuntimeStub frame
4659 
4660     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4661 
4662     __ leave(); // required for proper stackwalking of RuntimeStub frame
4663     __ ret(0);
4664 
4665     return start;
4666 
4667   }
4668 
4669   address generate_libmSin() {
4670     StubCodeMark mark(this, "StubRoutines", "libmSin");
4671 
4672     address start = __ pc();
4673 
4674     const XMMRegister x0 = xmm0;
4675     const XMMRegister x1 = xmm1;
4676     const XMMRegister x2 = xmm2;
4677     const XMMRegister x3 = xmm3;
4678 
4679     const XMMRegister x4 = xmm4;
4680     const XMMRegister x5 = xmm5;
4681     const XMMRegister x6 = xmm6;
4682     const XMMRegister x7 = xmm7;
4683 
4684     const Register tmp1 = r8;
4685     const Register tmp2 = r9;
4686     const Register tmp3 = r10;
4687     const Register tmp4 = r11;
4688 
4689     BLOCK_COMMENT("Entry:");
4690     __ enter(); // required for proper stackwalking of RuntimeStub frame
4691 
4692 #ifdef _WIN64
4693     __ push(rsi);
4694     __ push(rdi);
4695 #endif
4696     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4697 
4698 #ifdef _WIN64
4699     __ pop(rdi);
4700     __ pop(rsi);
4701 #endif
4702 
4703     __ leave(); // required for proper stackwalking of RuntimeStub frame
4704     __ ret(0);
4705 
4706     return start;
4707 
4708   }
4709 
4710   address generate_libmCos() {
4711     StubCodeMark mark(this, "StubRoutines", "libmCos");
4712 
4713     address start = __ pc();
4714 
4715     const XMMRegister x0 = xmm0;
4716     const XMMRegister x1 = xmm1;
4717     const XMMRegister x2 = xmm2;
4718     const XMMRegister x3 = xmm3;
4719 
4720     const XMMRegister x4 = xmm4;
4721     const XMMRegister x5 = xmm5;
4722     const XMMRegister x6 = xmm6;
4723     const XMMRegister x7 = xmm7;
4724 
4725     const Register tmp1 = r8;
4726     const Register tmp2 = r9;
4727     const Register tmp3 = r10;
4728     const Register tmp4 = r11;
4729 
4730     BLOCK_COMMENT("Entry:");
4731     __ enter(); // required for proper stackwalking of RuntimeStub frame
4732 
4733 #ifdef _WIN64
4734     __ push(rsi);
4735     __ push(rdi);
4736 #endif
4737     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4738 
4739 #ifdef _WIN64
4740     __ pop(rdi);
4741     __ pop(rsi);
4742 #endif
4743 
4744     __ leave(); // required for proper stackwalking of RuntimeStub frame
4745     __ ret(0);
4746 
4747     return start;
4748 
4749   }
4750 
4751   address generate_libmTan() {
4752     StubCodeMark mark(this, "StubRoutines", "libmTan");
4753 
4754     address start = __ pc();
4755 
4756     const XMMRegister x0 = xmm0;
4757     const XMMRegister x1 = xmm1;
4758     const XMMRegister x2 = xmm2;
4759     const XMMRegister x3 = xmm3;
4760 
4761     const XMMRegister x4 = xmm4;
4762     const XMMRegister x5 = xmm5;
4763     const XMMRegister x6 = xmm6;
4764     const XMMRegister x7 = xmm7;
4765 
4766     const Register tmp1 = r8;
4767     const Register tmp2 = r9;
4768     const Register tmp3 = r10;
4769     const Register tmp4 = r11;
4770 
4771     BLOCK_COMMENT("Entry:");
4772     __ enter(); // required for proper stackwalking of RuntimeStub frame
4773 
4774 #ifdef _WIN64
4775     __ push(rsi);
4776     __ push(rdi);
4777 #endif
4778     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4779 
4780 #ifdef _WIN64
4781     __ pop(rdi);
4782     __ pop(rsi);
4783 #endif
4784 
4785     __ leave(); // required for proper stackwalking of RuntimeStub frame
4786     __ ret(0);
4787 
4788     return start;
4789 
4790   }
4791 
4792 void push_FrameInfo(MacroAssembler* _masm, Register fi, Register sp, Register fp, address pc) {
  if (!sp->is_valid()) __ push(0); else {
    if (sp == rsp) {
      __ movptr(fi, rsp);
      __ push(fi);
    } else {
      __ push(sp);
    }
  }

  if (!fp->is_valid()) __ push(0); else __ push(fp);

  __ lea(fi, ExternalAddress(pc));
  __ push(fi);

  __ movptr(fi, rsp); // make fi point to the beginning of the FrameInfo
4808 }
4809 
4810 void push_FrameInfo(MacroAssembler* _masm, Register fi, Register sp, Register fp, Register pc) {
  if (!sp->is_valid()) __ push(0); else {
    if (sp == rsp) {
      __ movptr(fi, rsp);
      __ push(fi);
    } else {
      __ push(sp);
    }
  }

  if (!fp->is_valid()) __ push(0); else __ push(fp);

  if (!pc->is_valid()) __ push(0); else __ push(pc);

  __ movptr(fi, rsp); // make fi point to the beginning of the FrameInfo
4825 }
4826 
4827 void pop_FrameInfo(MacroAssembler* _masm, Register sp, Register fp, Register pc) {
  if (!pc->is_valid()) __ lea(rsp, Address(rsp, wordSize)); else __ pop(pc);
  if (!fp->is_valid()) __ lea(rsp, Address(rsp, wordSize)); else __ pop(fp);
  if (!sp->is_valid()) __ lea(rsp, Address(rsp, wordSize)); else __ pop(sp);
4831 }
4832 
4833   // c_rarg1 ContinuationScope
4834 address generate_cont_doYield() {
4835     const char *name = "cont_doYield";
4836 
4837     enum layout {
4838       frameinfo_11  = frame::arg_reg_save_area_bytes/BytesPerInt,
4839       frameinfo_12,
4840       frameinfo_21,
4841       frameinfo_22,
4842       frameinfo_31,
4843       frameinfo_32,
4844       rbp_off,
4845       rbpH_off,
4846       return_off,
4847       return_off2,
4848       framesize // inclusive of return address
4849     };
4850     // assert(is_even(framesize/2), "sp not 16-byte aligned");
4851     int insts_size = 512;
4852     int locs_size  = 64;
4853     CodeBuffer code(name, insts_size, locs_size);
4854     OopMapSet* oop_maps  = new OopMapSet();
4855     MacroAssembler* masm = new MacroAssembler(&code);
4856     MacroAssembler* _masm = masm;
4857 
4858     // MacroAssembler* masm = _masm;
4859     // StubCodeMark mark(this, "StubRoutines", name);
4860 
4861     address start = __ pc();
4862 
4863     Register fi = c_rarg1;
4864 
4865     __ movq(c_rarg2, c_rarg0);          // scopes argument
4866     __ movptr(rax, Address(rsp, 0));    // use return address as the frame pc // __ lea(rax, InternalAddress(pcxxxx));
4867     __ lea(fi, Address(rsp, wordSize)); // skip return address
4868     __ movptr(c_rarg3, rbp);
4869 
4870     __ enter();
4871     
4872     // // return address and rbp are already in place
4873     // __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
4874 
4875     push_FrameInfo(masm, fi, fi, c_rarg3, rax);
4876 
4877     int frame_complete = __ pc() - start;
4878     address the_pc = __ pc();
4879 
4880     __ set_last_Java_frame(rsp, rbp, the_pc); // may be unnecessary. also, consider MacroAssembler::call_VM_leaf_base
4881 
4882     __ call_VM(noreg, CAST_FROM_FN_PTR(address, Continuation::freeze), fi, c_rarg2, false); // do NOT check exceptions; they'll get forwarded to the caller
4883 
4884     Label pinned;
4885     __ pop(rax); // read the pc from the FrameInfo
4886     __ testq(rax, rax);
4887     __ jcc(Assembler::zero, pinned);
4888     
4889     __ pop(rbp); // not pinned -- jump to Continuation.run (the entry frame)
4890     __ pop(fi);
4891     __ movptr(rsp, fi);
4892     __ jmp(rax);
4893     
4894     __ bind(pinned); // pinned -- return to caller
4895     __ lea(rsp, Address(rsp, wordSize*2)); // "pop" the rest of the FrameInfo struct
4896 
4897     __ leave();
4898     __ ret(0);
4899 
4900     // return start;
4901 
4902     OopMap* map = new OopMap(framesize, 1);
4903     // map->set_callee_saved(VMRegImpl::stack2reg(rbp_off), rbp->as_VMReg());
4904     oop_maps->add_gc_map(the_pc - start, map);
4905 
4906     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
4907       RuntimeStub::new_runtime_stub(name,
4908                                     &code,
4909                                     frame_complete,
4910                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4911                                     oop_maps, false);
4912     return stub->entry_point();
4913   }
4914 
4915   address generate_cont_thaw(bool return_barrier) {
4916     address start = __ pc();
4917 
4918     // TODO: Handle Valhalla return types. May require generating different return barriers.
4919 
4920     Register fi = r11;
4921 
4922     if (!return_barrier) {
4923       __ pop(c_rarg3); // pop return address. if we don't do this, we get a drift, where the bottom-most frozen frame continuously grows
4924       // __ lea(rsp, Address(rsp, wordSize)); // pop return address. if we don't do this, we get a drift, where the bottom-most frozen frame continuously grows
4925     }
4926 
4927     Label thaw_fail;
4928     __ movptr(fi, rsp);
4929     __ push(rax); __ push_d(xmm0); // preserve possible return value from a method returning to the return barrier
4930     __ movl(c_rarg2, return_barrier);
4931     push_FrameInfo(_masm, fi, fi, rbp, c_rarg3);
4932     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), fi, c_rarg2);
4933     __ testq(rax, rax);           // rax contains the size of the frames to thaw, 0 if overflow or no more frames
4934     __ jcc(Assembler::zero, thaw_fail);
4935 
4936     pop_FrameInfo(_masm, fi, rbp, c_rarg3); // c_rarg3 would still be our return address
4937     __ pop_d(xmm0); __ pop(rdx);   // TEMPORARILY restore return value (we're going to push it again, but rsp is about to move)
4938 
4939     __ subq(rsp, rax);             // make room for the thawed frames
4940     // __ movptr(fi, rsp);            // where we'll start copying frame (the lowest address)
4941     __ push(rdx); __ push_d(xmm0); // save original return value -- again
4942     __ movl(c_rarg2, return_barrier);
4943     push_FrameInfo(_masm, fi, fi, rbp, c_rarg3);
4944     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), fi, c_rarg2);
4945 
4946     __ bind(thaw_fail);
4947     pop_FrameInfo(_masm, fi, rbp, rdx);
4948     // __ movl(rbp, 0);
4949     __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4950     __ movptr(rsp, fi); // we're now on the yield frame (which is above us b/c rsp has been pushed down)
4951     __ jmp(rdx);
4952 
4953     return start;
4954   }
4955 
4956   address generate_cont_thaw() {
4957     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
4958     address start = __ pc();
4959     generate_cont_thaw(false);
4960     return start;
4961   }
4962 
4963   address generate_cont_returnBarrier() {
4964     // TODO: will probably need multiple return barriers depending on return type
4965     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
4966     address start = __ pc();
4967 
4968     if (CONT_FULL_STACK)
4969       __ stop("RETURN BARRIER -- UNREACHABLE 0");
4970 
4971     generate_cont_thaw(true);
4972 
4973     return start;
4974   }
4975 
4976   address generate_cont_getPC() {
4977     StubCodeMark mark(this, "StubRoutines", "GetPC");
4978     address start = __ pc();
4979 
4980     __ movptr(rax, Address(rsp, 0));
4981     __ ret(0);
4982 
4983     return start;
4984   }
4985 
4986   address generate_cont_getSP() {
4987     StubCodeMark mark(this, "StubRoutines", "getSP");
4988     address start = __ pc();
4989 
4990     __ lea(rax, Address(rsp, wordSize));
4991     __ ret(0);
4992 
4993     return start;
4994   }
4995 
4996   address generate_cont_getFP() {
4997     StubCodeMark mark(this, "StubRoutines", "GetFP");
4998     address start = __ pc();
4999 
5000     __ stop("WHAT?");
5001     __ lea(rax, Address(rsp, wordSize));
5002     __ ret(0);
5003 
5004     return start;
5005   }
5006 
5007 #undef __
5008 #define __ masm->
5009 
5010   // Continuation point for throwing of implicit exceptions that are
5011   // not handled in the current activation. Fabricates an exception
5012   // oop and initiates normal exception dispatching in this
5013   // frame. Since we need to preserve callee-saved values (currently
5014   // only for C2, but done for C1 as well) we need a callee-saved oop
5015   // map and therefore have to make these stubs into RuntimeStubs
5016   // rather than BufferBlobs.  If the compiler needs all registers to
5017   // be preserved between the fault point and the exception handler
5018   // then it must assume responsibility for that in
5019   // AbstractCompiler::continuation_for_implicit_null_exception or
5020   // continuation_for_implicit_division_by_zero_exception. All other
5021   // implicit exceptions (e.g., NullPointerException or
5022   // AbstractMethodError on entry) are either at call sites or
5023   // otherwise assume that stack unwinding will be initiated, so
5024   // caller saved registers were assumed volatile in the compiler.
5025   address generate_throw_exception(const char* name,
5026                                    address runtime_entry,
5027                                    Register arg1 = noreg,
5028                                    Register arg2 = noreg) {
5029     // Information about frame layout at time of blocking runtime call.
5030     // Note that we only have to preserve callee-saved registers since
5031     // the compilers are responsible for supplying a continuation point
5032     // if they expect all registers to be preserved.
5033     enum layout {
5034       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5035       rbp_off2,
5036       return_off,
5037       return_off2,
5038       framesize // inclusive of return address
5039     };
5040 
5041     int insts_size = 512;
5042     int locs_size  = 64;
5043 
5044     CodeBuffer code(name, insts_size, locs_size);
5045     OopMapSet* oop_maps  = new OopMapSet();
5046     MacroAssembler* masm = new MacroAssembler(&code);
5047 
5048     address start = __ pc();
5049 
5050     // This is an inlined and slightly modified version of call_VM
5051     // which has the ability to fetch the return PC out of
5052     // thread-local storage and also sets up last_Java_sp slightly
5053     // differently than the real call_VM
5054 
5055     __ enter(); // required for proper stackwalking of RuntimeStub frame
5056 
5057     assert(is_even(framesize/2), "sp not 16-byte aligned");
5058 
5059     // return address and rbp are already in place
5060     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5061 
5062     int frame_complete = __ pc() - start;
5063 
5064     // Set up last_Java_sp and last_Java_fp
5065     address the_pc = __ pc();
5066     __ set_last_Java_frame(rsp, rbp, the_pc);
5067     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5068 
5069     // Call runtime
5070     if (arg1 != noreg) {
5071       assert(arg2 != c_rarg1, "clobbered");
5072       __ movptr(c_rarg1, arg1);
5073     }
5074     if (arg2 != noreg) {
5075       __ movptr(c_rarg2, arg2);
5076     }
5077     __ movptr(c_rarg0, r15_thread);
5078     BLOCK_COMMENT("call runtime_entry");
5079     __ call(RuntimeAddress(runtime_entry));
5080 
5081     // Generate oop map
5082     OopMap* map = new OopMap(framesize, 0);
5083 
5084     oop_maps->add_gc_map(the_pc - start, map);
5085 
5086     __ reset_last_Java_frame(true);
5087 
5088     __ leave(); // required for proper stackwalking of RuntimeStub frame
5089 
5090     // check for pending exceptions
5091 #ifdef ASSERT
5092     Label L;
5093     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5094             (int32_t) NULL_WORD);
5095     __ jcc(Assembler::notEqual, L);
5096     __ should_not_reach_here();
5097     __ bind(L);
5098 #endif // ASSERT
5099     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5100 
5101 
5102     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5103     RuntimeStub* stub =
5104       RuntimeStub::new_runtime_stub(name,
5105                                     &code,
5106                                     frame_complete,
5107                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5108                                     oop_maps, false);
5109     return stub->entry_point();
5110   }
5111 
5112   void create_control_words() {
5113     // Round to nearest, 53-bit mode, exceptions masked
5114     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
5116     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5117     // Round to nearest, 24-bit mode, exceptions masked
5118     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
5119     // Round to nearest, 64-bit mode, exceptions masked
5120     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
    // MXCSR default: round to nearest, all exceptions masked
5122     StubRoutines::_mxcsr_std           = 0x1F80;
5123     // Note: the following two constants are 80-bit values
5124     //       layout is critical for correct loading by FPU.
5125     // Bias for strict fp multiply/divide
5126     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5127     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5128     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5129     // Un-Bias for strict fp multiply/divide
5130     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5131     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5132     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5133   }
5134 
5135   // Initialization
5136   void generate_initial() {
5137     // Generates all stubs and initializes the entry points
5138 
    // These platform-specific settings are needed by generate_call_stub()
5140     create_control_words();
5141 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
5147 
5148     StubRoutines::_forward_exception_entry = generate_forward_exception();
5149 
5150     StubRoutines::_call_stub_entry =
5151       generate_call_stub(StubRoutines::_call_stub_return_address);
5152 
5153     // is referenced by megamorphic call
5154     StubRoutines::_catch_exception_entry = generate_catch_exception();
5155 
5156     // atomic calls
5157     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5158     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5159     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5160     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5161     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5162     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5163     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5164     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5165 
5166     // platform dependent
5167     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5168     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5169 
5170     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5171 
5172     // Build this early so it's available for the interpreter.
5173     StubRoutines::_throw_StackOverflowError_entry =
5174       generate_throw_exception("StackOverflowError throw_exception",
5175                                CAST_FROM_FN_PTR(address,
5176                                                 SharedRuntime::
5177                                                 throw_StackOverflowError));
5178     StubRoutines::_throw_delayed_StackOverflowError_entry =
5179       generate_throw_exception("delayed StackOverflowError throw_exception",
5180                                CAST_FROM_FN_PTR(address,
5181                                                 SharedRuntime::
5182                                                 throw_delayed_StackOverflowError));
5183     if (UseCRC32Intrinsics) {
      // set table address before generating the stubs that use it
5185       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5186       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5187     }
5188 
5189     if (UseCRC32CIntrinsics) {
5190       bool supports_clmul = VM_Version::supports_clmul();
5191       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5192       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5193       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5194     }
5195     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5196       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5197           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5198           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5199         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5200         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5201         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5202         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5203         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5204         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5205         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5206         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5207         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5208         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5209         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5210         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5211         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5212         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5213       }
5214       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5215         StubRoutines::_dexp = generate_libmExp();
5216       }
5217       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5218         StubRoutines::_dlog = generate_libmLog();
5219       }
5220       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5221         StubRoutines::_dlog10 = generate_libmLog10();
5222       }
5223       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5224         StubRoutines::_dpow = generate_libmPow();
5225       }
5226       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5227         StubRoutines::_dsin = generate_libmSin();
5228       }
5229       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5230         StubRoutines::_dcos = generate_libmCos();
5231       }
5232       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5233         StubRoutines::_dtan = generate_libmTan();
5234       }
5235     }
5236   }
5237 
5238   void generate_phase1() {
5239     // Continuation stubs:
5240     StubRoutines::_cont_thaw          = generate_cont_thaw();
5241     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
5242     StubRoutines::_cont_doYield    = generate_cont_doYield();
5243     StubRoutines::_cont_getSP      = generate_cont_getSP();
5244     StubRoutines::_cont_getPC      = generate_cont_getPC();
5245   }
5246 
5247   void generate_all() {
5248     // Generates all stubs and initializes the entry points
5249 
5250     // These entry points require SharedInfo::stack0 to be set up in
5251     // non-core builds and need to be relocatable, so they each
5252     // fabricate a RuntimeStub internally.
5253     StubRoutines::_throw_AbstractMethodError_entry =
5254       generate_throw_exception("AbstractMethodError throw_exception",
5255                                CAST_FROM_FN_PTR(address,
5256                                                 SharedRuntime::
5257                                                 throw_AbstractMethodError));
5258 
5259     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5260       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5261                                CAST_FROM_FN_PTR(address,
5262                                                 SharedRuntime::
5263                                                 throw_IncompatibleClassChangeError));
5264 
5265     StubRoutines::_throw_NullPointerException_at_call_entry =
5266       generate_throw_exception("NullPointerException at call throw_exception",
5267                                CAST_FROM_FN_PTR(address,
5268                                                 SharedRuntime::
5269                                                 throw_NullPointerException_at_call));
5270 
5271     // entry points that are platform specific
5272     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5273     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5274     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5275     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5276 
5277     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5278     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5279     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5280     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5281 
5282     // support for verify_oop (must happen after universe_init)
5283     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5284 
5285     // arraycopy stubs used by compilers
5286     generate_arraycopy_stubs();
5287 
5288     // don't bother generating these AES intrinsic stubs unless global flag is set
5289     if (UseAESIntrinsics) {
5290       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5291       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5292       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5293       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5294       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5295     }
    if (UseAESCTRIntrinsics) {
5297       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5298       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5299     }
5300 
5301     if (UseSHA1Intrinsics) {
5302       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5303       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5304       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5305       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5306     }
5307     if (UseSHA256Intrinsics) {
5308       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
5309       char* dst = (char*)StubRoutines::x86::_k256_W;
5310       char* src = (char*)StubRoutines::x86::_k256;
5311       for (int ii = 0; ii < 16; ++ii) {
5312         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
5313         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5314       }
5315       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5316       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5317       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5318       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5319     }
5320     if (UseSHA512Intrinsics) {
5321       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5322       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5323       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5324       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5325     }
5326 
5327     // Generate GHASH intrinsics code
5328     if (UseGHASHIntrinsics) {
5329       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
5330       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5331       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5332     }
5333 
5334     // Safefetch stubs.
5335     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5336                                                        &StubRoutines::_safefetch32_fault_pc,
5337                                                        &StubRoutines::_safefetch32_continuation_pc);
5338     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5339                                                        &StubRoutines::_safefetchN_fault_pc,
5340                                                        &StubRoutines::_safefetchN_continuation_pc);
5341 #ifdef COMPILER2
5342     if (UseMultiplyToLenIntrinsic) {
5343       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5344     }
5345     if (UseSquareToLenIntrinsic) {
5346       StubRoutines::_squareToLen = generate_squareToLen();
5347     }
5348     if (UseMulAddIntrinsic) {
5349       StubRoutines::_mulAdd = generate_mulAdd();
5350     }
5351 #ifndef _WINDOWS
5352     if (UseMontgomeryMultiplyIntrinsic) {
5353       StubRoutines::_montgomeryMultiply
5354         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5355     }
5356     if (UseMontgomerySquareIntrinsic) {
5357       StubRoutines::_montgomerySquare
5358         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5359     }
#endif // !_WINDOWS
5361 #endif // COMPILER2
5362 
5363     if (UseVectorizedMismatchIntrinsic) {
5364       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
5365     }
5366   }
5367 
5368  public:
5369   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
5370     if (phase == 0) {
5371       generate_initial();
5372     } else if (phase == 1) {
5373       generate_phase1();
5374     } else {
5375       generate_all();
5376     }
5377   }
5378 }; // end class declaration
5379 
5380 void StubGenerator_generate(CodeBuffer* code, int phase) {
5381   StubGenerator g(code, phase);
5382 }