1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_x86.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/thread.inline.hpp"
  41 #ifdef COMPILER2
  42 #include "opto/runtime.hpp"
  43 #endif
  44 #if INCLUDE_ALL_GCS
  45 #include "gc/z/zBarrier.inline.hpp"
  46 #include "gc/z/zGlobals.hpp"
  47 #endif
  48 
  49 // Declaration and definition of StubGenerator (no .hpp file).
  50 // For a more detailed description of the stub routine structure
  51 // see the comment in stubRoutines.hpp
  52 
  53 #define __ _masm->
  54 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  55 #define a__ ((Assembler*)_masm)->
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) __ block_comment(str)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
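     // (MXCSR bits 0-5 are the sticky exception status flags; masking with 0xFFC0 keeps
     // only the control bits - exception masks, rounding control, FZ/DAZ - for comparison.)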
  65 
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(counter) ((void)0)
  73 #else
  74   void inc_counter_np_(int& counter) {
  75     // This can destroy rscratch1 if counter is far from the code cache
  76     __ incrementl(ExternalAddress((address)&counter));
  77   }
  78 #define inc_counter_np(counter) \
  79   BLOCK_COMMENT("inc_counter " #counter); \
  80   inc_counter_np_(counter);
  81 #endif
  82 
  83   // Call stubs are used to call Java from C
  84   //
  85   // Linux Arguments:
  86   //    c_rarg0:   call wrapper address                   address
  87   //    c_rarg1:   result                                 address
  88   //    c_rarg2:   result type                            BasicType
  89   //    c_rarg3:   method                                 Method*
  90   //    c_rarg4:   (interpreter) entry point              address
  91   //    c_rarg5:   parameters                             intptr_t*
  92   //    16(rbp): parameter size (in words)              int
  93   //    24(rbp): thread                                 Thread*
  94   //
  95   //     [ return_from_Java     ] <--- rsp
  96   //     [ argument word n      ]
  97   //      ...
  98   // -12 [ argument word 1      ]
  99   // -11 [ saved r15            ] <--- rsp_after_call
 100   // -10 [ saved r14            ]
 101   //  -9 [ saved r13            ]
 102   //  -8 [ saved r12            ]
 103   //  -7 [ saved rbx            ]
 104   //  -6 [ call wrapper         ]
 105   //  -5 [ result               ]
 106   //  -4 [ result type          ]
 107   //  -3 [ method               ]
 108   //  -2 [ entry point          ]
 109   //  -1 [ parameters           ]
 110   //   0 [ saved rbp            ] <--- rbp
 111   //   1 [ return address       ]
 112   //   2 [ parameter size       ]
 113   //   3 [ thread               ]
 114   //
 115   // Windows Arguments:
 116   //    c_rarg0:   call wrapper address                   address
 117   //    c_rarg1:   result                                 address
 118   //    c_rarg2:   result type                            BasicType
 119   //    c_rarg3:   method                                 Method*
 120   //    48(rbp): (interpreter) entry point              address
 121   //    56(rbp): parameters                             intptr_t*
 122   //    64(rbp): parameter size (in words)              int
 123   //    72(rbp): thread                                 Thread*
 124   //
 125   //     [ return_from_Java     ] <--- rsp
 126   //     [ argument word n      ]
 127   //      ...
 128   // -60 [ argument word 1      ]
 129   // -59 [ saved xmm31          ] <--- rsp_after_call
 130   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 131   // -27 [ saved xmm15          ]
 132   //     [ saved xmm7-xmm14     ]
 133   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 134   //  -7 [ saved r15            ]
 135   //  -6 [ saved r14            ]
 136   //  -5 [ saved r13            ]
 137   //  -4 [ saved r12            ]
 138   //  -3 [ saved rdi            ]
 139   //  -2 [ saved rsi            ]
 140   //  -1 [ saved rbx            ]
 141   //   0 [ saved rbp            ] <--- rbp
 142   //   1 [ return address       ]
 143   //   2 [ call wrapper         ]
 144   //   3 [ result               ]
 145   //   4 [ result type          ]
 146   //   5 [ method               ]
 147   //   6 [ entry point          ]
 148   //   7 [ parameters           ]
 149   //   8 [ parameter size       ]
 150   //   9 [ thread               ]
 151   //
 152   //    Windows reserves the caller's stack space for arguments 1-4.
 153   //    We spill c_rarg0-c_rarg3 to this space.
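       //
       //    For reference, C++ code reaches this stub through a function pointer of
       //    roughly the following shape (see the CallStub typedef in stubRoutines.hpp);
       //    the register/stack split documented above follows from the platform ABI:
       //
       //      void call_stub(address   call_wrapper_address,
       //                     intptr_t* result,
       //                     BasicType result_type,
       //                     Method*   method,
       //                     address   entry_point,
       //                     intptr_t* parameters,
       //                     int       size_of_parameters,
       //                     TRAPS);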
 154 
 155   // Call stub stack layout word offsets from rbp
 156   enum call_stub_layout {
 157 #ifdef _WIN64
 158     xmm_save_first     = 6,  // save from xmm6
 159     xmm_save_last      = 31, // to xmm31
 160     xmm_save_base      = -9,
 161     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 162     r15_off            = -7,
 163     r14_off            = -6,
 164     r13_off            = -5,
 165     r12_off            = -4,
 166     rdi_off            = -3,
 167     rsi_off            = -2,
 168     rbx_off            = -1,
 169     rbp_off            =  0,
 170     retaddr_off        =  1,
 171     call_wrapper_off   =  2,
 172     result_off         =  3,
 173     result_type_off    =  4,
 174     method_off         =  5,
 175     entry_point_off    =  6,
 176     parameters_off     =  7,
 177     parameter_size_off =  8,
 178     thread_off         =  9
 179 #else
 180     rsp_after_call_off = -12,
 181     mxcsr_off          = rsp_after_call_off,
 182     r15_off            = -11,
 183     r14_off            = -10,
 184     r13_off            = -9,
 185     r12_off            = -8,
 186     rbx_off            = -7,
 187     call_wrapper_off   = -6,
 188     result_off         = -5,
 189     result_type_off    = -4,
 190     method_off         = -3,
 191     entry_point_off    = -2,
 192     parameters_off     = -1,
 193     rbp_off            =  0,
 194     retaddr_off        =  1,
 195     parameter_size_off =  2,
 196     thread_off         =  3
 197 #endif
 198   };
 199 
 200 #ifdef _WIN64
 201   Address xmm_save(int reg) {
 202     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 203     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 204   }
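       // For example, with wordSize == 8: xmm_save(6) is Address(rbp, -9 * wordSize),
       // xmm_save(7) is Address(rbp, -11 * wordSize), ..., xmm_save(31) is
       // Address(rbp, -59 * wordSize) - each register occupies two word-sized slots.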
 205 #endif
 206 
 207   address generate_call_stub(address& return_address) {
 208     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 209            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 210            "adjust this code");
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     // same as in generate_catch_exception()!
 215     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 216 
 217     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 218     const Address result        (rbp, result_off         * wordSize);
 219     const Address result_type   (rbp, result_type_off    * wordSize);
 220     const Address method        (rbp, method_off         * wordSize);
 221     const Address entry_point   (rbp, entry_point_off    * wordSize);
 222     const Address parameters    (rbp, parameters_off     * wordSize);
 223     const Address parameter_size(rbp, parameter_size_off * wordSize);
 224 
 225     // same as in generate_catch_exception()!
 226     const Address thread        (rbp, thread_off         * wordSize);
 227 
 228     const Address r15_save(rbp, r15_off * wordSize);
 229     const Address r14_save(rbp, r14_off * wordSize);
 230     const Address r13_save(rbp, r13_off * wordSize);
 231     const Address r12_save(rbp, r12_off * wordSize);
 232     const Address rbx_save(rbp, rbx_off * wordSize);
 233 
 234     // stub code
 235     __ enter();
 236     __ subptr(rsp, -rsp_after_call_off * wordSize);
 237 
 238     // save register parameters
 239 #ifndef _WIN64
 240     __ movptr(parameters,   c_rarg5); // parameters
 241     __ movptr(entry_point,  c_rarg4); // entry_point
 242 #endif
 243 
 244     __ movptr(method,       c_rarg3); // method
 245     __ movl(result_type,  c_rarg2);   // result type
 246     __ movptr(result,       c_rarg1); // result
 247     __ movptr(call_wrapper, c_rarg0); // call wrapper
 248 
 249     // save regs belonging to calling function
 250     __ movptr(rbx_save, rbx);
 251     __ movptr(r12_save, r12);
 252     __ movptr(r13_save, r13);
 253     __ movptr(r14_save, r14);
 254     __ movptr(r15_save, r15);
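         // k1 is used as an all-ones opmask for AVX-512 code in this file (see the copy
         // loops below), so (re)initialize it before any EVEX-encoded instructions run.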
 255     if (UseAVX > 2) {
 256       __ movl(rbx, 0xffff);
 257       __ kmovwl(k1, rbx);
 258     }
 259 #ifdef _WIN64
 260     int last_reg = 15;
 261     if (UseAVX > 2) {
 262       last_reg = 31;
 263     }
 264     if (VM_Version::supports_evex()) {
 265       for (int i = xmm_save_first; i <= last_reg; i++) {
 266         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 267       }
 268     } else {
 269       for (int i = xmm_save_first; i <= last_reg; i++) {
 270         __ movdqu(xmm_save(i), as_XMMRegister(i));
 271       }
 272     }
 273 
 274     const Address rdi_save(rbp, rdi_off * wordSize);
 275     const Address rsi_save(rbp, rsi_off * wordSize);
 276 
 277     __ movptr(rsi_save, rsi);
 278     __ movptr(rdi_save, rdi);
 279 #else
 280     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 281     {
 282       Label skip_ldmx;
 283       __ stmxcsr(mxcsr_save);
 284       __ movl(rax, mxcsr_save);
 285       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 286       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 287       __ cmp32(rax, mxcsr_std);
 288       __ jcc(Assembler::equal, skip_ldmx);
 289       __ ldmxcsr(mxcsr_std);
 290       __ bind(skip_ldmx);
 291     }
 292 #endif
 293 
 294     // Load up thread register
 295     __ movptr(r15_thread, thread);
 296     __ reinit_heapbase();
 297 
 298 #ifdef ASSERT
 299     // make sure we have no pending exceptions
 300     {
 301       Label L;
 302       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 303       __ jcc(Assembler::equal, L);
 304       __ stop("StubRoutines::call_stub: entered with pending exception");
 305       __ bind(L);
 306     }
 307 #endif
 308 
 309     // pass parameters if any
 310     BLOCK_COMMENT("pass parameters if any");
 311     Label parameters_done;
 312     __ movl(c_rarg3, parameter_size);
 313     __ testl(c_rarg3, c_rarg3);
 314     __ jcc(Assembler::zero, parameters_done);
 315 
 316     Label loop;
 317     __ movptr(c_rarg2, parameters);       // parameter pointer
 318     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 319     __ BIND(loop);
 320     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 321     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 322     __ decrementl(c_rarg1);             // decrement counter
 323     __ push(rax);                       // pass parameter
 324     __ jcc(Assembler::notZero, loop);
 325 
 326     // call Java function
 327     __ BIND(parameters_done);
 328     __ movptr(rbx, method);             // get Method*
 329     __ movptr(c_rarg1, entry_point);    // get entry_point
 330     __ mov(r13, rsp);                   // set sender sp
 331     BLOCK_COMMENT("call Java function");
 332     __ call(c_rarg1);
 333 
 334     BLOCK_COMMENT("call_stub_return_address:");
 335     return_address = __ pc();
 336 
 337     // store result depending on type (everything that is not
 338     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 339     __ movptr(c_rarg0, result);
 340     Label is_long, is_float, is_double, exit;
 341     __ movl(c_rarg1, result_type);
 342     __ cmpl(c_rarg1, T_OBJECT);
 343     __ jcc(Assembler::equal, is_long);
 344     __ cmpl(c_rarg1, T_LONG);
 345     __ jcc(Assembler::equal, is_long);
 346     __ cmpl(c_rarg1, T_FLOAT);
 347     __ jcc(Assembler::equal, is_float);
 348     __ cmpl(c_rarg1, T_DOUBLE);
 349     __ jcc(Assembler::equal, is_double);
 350 
 351     // handle T_INT case
 352     __ movl(Address(c_rarg0, 0), rax);
 353 
 354     __ BIND(exit);
 355 
 356     // pop parameters
 357     __ lea(rsp, rsp_after_call);
 358 
 359 #ifdef ASSERT
 360     // verify that threads correspond
 361     {
 362       Label L1, L2, L3;
 363       __ cmpptr(r15_thread, thread);
 364       __ jcc(Assembler::equal, L1);
 365       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 366       __ bind(L1);
 367       __ get_thread(rbx);
 368       __ cmpptr(r15_thread, thread);
 369       __ jcc(Assembler::equal, L2);
 370       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 371       __ bind(L2);
 372       __ cmpptr(r15_thread, rbx);
 373       __ jcc(Assembler::equal, L3);
 374       __ stop("StubRoutines::call_stub: threads must correspond");
 375       __ bind(L3);
 376     }
 377 #endif
 378 
 379     // restore regs belonging to calling function
 380 #ifdef _WIN64
 381     // emit the restores for xmm regs
 382     if (VM_Version::supports_evex()) {
 383       for (int i = xmm_save_first; i <= last_reg; i++) {
 384         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 385       }
 386     } else {
 387       for (int i = xmm_save_first; i <= last_reg; i++) {
 388         __ movdqu(as_XMMRegister(i), xmm_save(i));
 389       }
 390     }
 391 #endif
 392     __ movptr(r15, r15_save);
 393     __ movptr(r14, r14_save);
 394     __ movptr(r13, r13_save);
 395     __ movptr(r12, r12_save);
 396     __ movptr(rbx, rbx_save);
 397 
 398 #ifdef _WIN64
 399     __ movptr(rdi, rdi_save);
 400     __ movptr(rsi, rsi_save);
 401 #else
 402     __ ldmxcsr(mxcsr_save);
 403 #endif
 404 
 405     // restore rsp
 406     __ addptr(rsp, -rsp_after_call_off * wordSize);
 407 
 408     // return
 409     __ vzeroupper();
 410     __ pop(rbp);
 411     __ ret(0);
 412 
 413     // handle return types different from T_INT
 414     __ BIND(is_long);
 415     __ movq(Address(c_rarg0, 0), rax);
 416     __ jmp(exit);
 417 
 418     __ BIND(is_float);
 419     __ movflt(Address(c_rarg0, 0), xmm0);
 420     __ jmp(exit);
 421 
 422     __ BIND(is_double);
 423     __ movdbl(Address(c_rarg0, 0), xmm0);
 424     __ jmp(exit);
 425 
 426     return start;
 427   }
 428 
 429   // Return point for a Java call if there's an exception thrown in
 430   // Java code.  The exception is caught and transformed into a
 431   // pending exception stored in JavaThread that can be tested from
 432   // within the VM.
 433   //
 434   // Note: Usually the parameters are removed by the callee. In case
 435   // of an exception crossing an activation frame boundary, that is
 436   // not the case if the callee is compiled code => need to setup the
 437   // rsp.
 438   //
 439   // rax: exception oop
 440 
 441   address generate_catch_exception() {
 442     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 443     address start = __ pc();
 444 
 445     // same as in generate_call_stub():
 446     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 447     const Address thread        (rbp, thread_off         * wordSize);
 448 
 449 #ifdef ASSERT
 450     // verify that threads correspond
 451     {
 452       Label L1, L2, L3;
 453       __ cmpptr(r15_thread, thread);
 454       __ jcc(Assembler::equal, L1);
 455       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 456       __ bind(L1);
 457       __ get_thread(rbx);
 458       __ cmpptr(r15_thread, thread);
 459       __ jcc(Assembler::equal, L2);
 460       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 461       __ bind(L2);
 462       __ cmpptr(r15_thread, rbx);
 463       __ jcc(Assembler::equal, L3);
 464       __ stop("StubRoutines::catch_exception: threads must correspond");
 465       __ bind(L3);
 466     }
 467 #endif
 468 
 469     // set pending exception
 470     __ verify_oop(rax);
 471 
 472     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 473     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 474     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 475     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 476 
 477     // complete return to VM
 478     assert(StubRoutines::_call_stub_return_address != NULL,
 479            "_call_stub_return_address must have been generated before");
 480     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 481 
 482     return start;
 483   }
 484 
 485   // Continuation point for runtime calls returning with a pending
 486   // exception.  The pending exception check happened in the runtime
 487   // or native call stub.  The pending exception in Thread is
 488   // converted into a Java-level exception.
 489   //
 490   // Contract with Java-level exception handlers:
 491   // rax: exception
 492   // rdx: throwing pc
 493   //
 494   // NOTE: At entry of this stub, exception-pc must be on stack !!
 495 
 496   address generate_forward_exception() {
 497     StubCodeMark mark(this, "StubRoutines", "forward exception");
 498     address start = __ pc();
 499 
 500     // Upon entry, the sp points to the return address returning into
 501     // Java (interpreted or compiled) code; i.e., the return address
 502     // becomes the throwing pc.
 503     //
 504     // Arguments pushed before the runtime call are still on the stack
 505     // but the exception handler will reset the stack pointer ->
 506     // ignore them.  A potential result in registers can be ignored as
 507     // well.
 508 
 509 #ifdef ASSERT
 510     // make sure this code is only executed if there is a pending exception
 511     {
 512       Label L;
 513       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 514       __ jcc(Assembler::notEqual, L);
 515       __ stop("StubRoutines::forward exception: no pending exception (1)");
 516       __ bind(L);
 517     }
 518 #endif
 519 
 520     // compute exception handler into rbx
 521     __ movptr(c_rarg0, Address(rsp, 0));
 522     BLOCK_COMMENT("call exception_handler_for_return_address");
 523     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 524                          SharedRuntime::exception_handler_for_return_address),
 525                     r15_thread, c_rarg0);
 526     __ mov(rbx, rax);
 527 
 528     // setup rax & rdx, remove return address & clear pending exception
 529     __ pop(rdx);
 530     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 531     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ testptr(rax, rax);
 538       __ jcc(Assembler::notEqual, L);
 539       __ stop("StubRoutines::forward exception: no pending exception (2)");
 540       __ bind(L);
 541     }
 542 #endif
 543 
 544     // continue at exception handler (return address removed)
 545     // rax: exception
 546     // rbx: exception handler
 547     // rdx: throwing pc
 548     __ verify_oop(rax);
 549     __ jmp(rbx);
 550 
 551     return start;
 552   }
 553 
 554   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 555   //
 556   // Arguments :
 557   //    c_rarg0: exchange_value
 558   //    c_rarg1: dest
 559   //
 560   // Result:
 561   //    *dest <- ex, return (orig *dest)
 562   address generate_atomic_xchg() {
 563     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 564     address start = __ pc();
 565 
 566     __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 567     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 568     __ ret(0);
 569 
 570     return start;
 571   }
 572 
 573   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 574   //
 575   // Arguments :
 576   //    c_rarg0: exchange_value
 577   //    c_rarg1: dest
 578   //
 579   // Result:
 580   //    *dest <- ex, return (orig *dest)
 581   address generate_atomic_xchg_long() {
 582     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 583     address start = __ pc();
 584 
 585     __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 586     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 587     __ ret(0);
 588 
 589     return start;
 590   }
 591 
 592   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 593   //                                         jint compare_value)
 594   //
 595   // Arguments :
 596   //    c_rarg0: exchange_value
 597   //    c_rarg1: dest
 598   //    c_rarg2: compare_value
 599   //
 600   // Result:
 601   //    if ( compare_value == *dest ) {
 602   //       *dest = exchange_value
 603   //       return compare_value;
 604   //    } else
 605   //       return *dest;
 606   address generate_atomic_cmpxchg() {
 607     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 608     address start = __ pc();
 609 
 610     __ movl(rax, c_rarg2);
 611     if ( os::is_MP() ) __ lock();
 612     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 613     __ ret(0);
 614 
 615     return start;
 616   }
 617 
 618   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 619   //                                           int8_t compare_value)
 620   //
 621   // Arguments :
 622   //    c_rarg0: exchange_value
 623   //    c_rarg1: dest
 624   //    c_rarg2: compare_value
 625   //
 626   // Result:
 627   //    if ( compare_value == *dest ) {
 628   //       *dest = exchange_value
 629   //       return compare_value;
 630   //    } else
 631   //       return *dest;
 632   address generate_atomic_cmpxchg_byte() {
 633     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 634     address start = __ pc();
 635 
 636     __ movsbq(rax, c_rarg2);
 637     if ( os::is_MP() ) __ lock();
 638     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 639     __ ret(0);
 640 
 641     return start;
 642   }
 643 
 644   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 645   //                                            volatile int64_t* dest,
 646   //                                            int64_t compare_value)
 647   // Arguments :
 648   //    c_rarg0: exchange_value
 649   //    c_rarg1: dest
 650   //    c_rarg2: compare_value
 651   //
 652   // Result:
 653   //    if ( compare_value == *dest ) {
 654   //       *dest = exchange_value
 655   //       return compare_value;
 656   //    } else
 657   //       return *dest;
 658   address generate_atomic_cmpxchg_long() {
 659     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 660     address start = __ pc();
 661 
 662     __ movq(rax, c_rarg2);
 663     if ( os::is_MP() ) __ lock();
 664     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 665     __ ret(0);
 666 
 667     return start;
 668   }
 669 
 670   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 671   //
 672   // Arguments :
 673   //    c_rarg0: add_value
 674   //    c_rarg1: dest
 675   //
 676   // Result:
 677   //    *dest += add_value
 678   //    return *dest;
 679   address generate_atomic_add() {
 680     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 681     address start = __ pc();
 682 
 683     __ movl(rax, c_rarg0);
 684     if ( os::is_MP() ) __ lock();
 685     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 686     __ addl(rax, c_rarg0);
 687     __ ret(0);
 688 
 689     return start;
 690   }
 691 
 692   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 693   //
 694   // Arguments :
 695   //    c_rarg0: add_value
 696   //    c_rarg1: dest
 697   //
 698   // Result:
 699   //    *dest += add_value
 700   //    return *dest;
 701   address generate_atomic_add_long() {
 702     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 703     address start = __ pc();
 704 
 705     __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 706     if ( os::is_MP() ) __ lock();
 707     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 708     __ addptr(rax, c_rarg0);
 709     __ ret(0);
 710 
 711     return start;
 712   }
 713 
 714   // Support for intptr_t OrderAccess::fence()
 715   //
 716   // Arguments :
 717   //
 718   // Result:
 719   address generate_orderaccess_fence() {
 720     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 721     address start = __ pc();
 722     __ membar(Assembler::StoreLoad);
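         // On x86-64 only StoreLoad needs a fencing instruction; MacroAssembler::membar
         // typically implements it with a locked read-modify-write of a stack slot (or
         // mfence), which orders earlier stores before later loads.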
 723     __ ret(0);
 724 
 725     return start;
 726   }
 727 
 728   // Support for intptr_t get_previous_fp()
 729   //
 730   // This routine is used to find the previous frame pointer for the
 731   // caller (current_frame_guess). This is used as part of debugging
 732   // when ps() is seemingly lost trying to find frames.
 733   // This code assumes that the caller (current_frame_guess) has a frame.
 734   address generate_get_previous_fp() {
 735     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 736     const Address old_fp(rbp, 0);
 737     const Address older_fp(rax, 0);
 738     address start = __ pc();
 739 
 740     __ enter();
 741     __ movptr(rax, old_fp); // caller's fp
 742     __ movptr(rax, older_fp); // the frame for ps()
 743     __ pop(rbp);
 744     __ ret(0);
 745 
 746     return start;
 747   }
 748 
 749   // Support for intptr_t get_previous_sp()
 750   //
 751   // This routine is used to find the previous stack pointer for the
 752   // caller.
 753   address generate_get_previous_sp() {
 754     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 755     address start = __ pc();
 756 
 757     __ movptr(rax, rsp);
 758     __ addptr(rax, 8); // return address is at the top of the stack.
 759     __ ret(0);
 760 
 761     return start;
 762   }
 763 
 764   //----------------------------------------------------------------------------------------------------
 765   // Support for void verify_mxcsr()
 766   //
 767   // This routine is used with -Xcheck:jni to verify that native
 768   // JNI code does not return to Java code without restoring the
 769   // MXCSR register to our expected state.
 770 
 771   address generate_verify_mxcsr() {
 772     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 773     address start = __ pc();
 774 
 775     const Address mxcsr_save(rsp, 0);
 776 
 777     if (CheckJNICalls) {
 778       Label ok_ret;
 779       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 780       __ push(rax);
 781       __ subptr(rsp, wordSize);      // allocate a temp location
 782       __ stmxcsr(mxcsr_save);
 783       __ movl(rax, mxcsr_save);
 784       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 785       __ cmp32(rax, mxcsr_std);
 786       __ jcc(Assembler::equal, ok_ret);
 787 
 788       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 789 
 790       __ ldmxcsr(mxcsr_std);
 791 
 792       __ bind(ok_ret);
 793       __ addptr(rsp, wordSize);
 794       __ pop(rax);
 795     }
 796 
 797     __ ret(0);
 798 
 799     return start;
 800   }
 801 
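       // Slow-path stub for the (weak) load barrier.
       //
       // On entry, raddr holds the address of the reference field whose value needs to be
       // healed; the remaining scratch registers (rax, rcx, rdx, rsi, rdi, r8-r11) are
       // saved and restored around the runtime call.  On exit, raddr holds the oop
       // returned by runtime_entry.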
 802   address generate_load_barrier_stub(Register raddr, address runtime_entry, bool is_weak) {
 803     char *name = (char *)NULL;
 804     {
 805       ResourceMark rm;
 806       stringStream ss;
 807       if (is_weak) {
 808         ss.print("load_barrier_weak_slow_stub_%s", raddr->name());
 809       } else {
 810         ss.print("load_barrier_slow_stub_%s", raddr->name());
 811       }
 812       name = os::strdup(ss.as_string(), mtCode);
 813     }
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", name);
 816     address start = __ pc();
 817 
 818     // save live registers
 819     if (raddr != rax) {
 820       __ push(rax);
 821     }
 822     if (raddr != rcx) {
 823       __ push(rcx);
 824     }
 825     if (raddr != rdx) {
 826       __ push(rdx);
 827     }
 828     if (raddr != rsi) {
 829       __ push(rsi);
 830     }
 831     if (raddr != rdi) {
 832       __ push(rdi);
 833     }
 834     if (raddr != r8) {
 835       __ push(r8);
 836     }
 837     if (raddr != r9) {
 838       __ push(r9);
 839     }
 840     if (raddr != r10) {
 841       __ push(r10);
 842     }
 843     if (raddr != r11) {
 844       __ push(r11);
 845     }
 846 
 847     __ movq(c_rarg1, raddr);
 848     __ movq(c_rarg0, Address(c_rarg1, 0));
 849     __ call_VM_leaf(runtime_entry, c_rarg0, c_rarg1);
 850 
 851     // restore saved registers
 852     if (raddr != r11) {
 853       __ pop(r11);
 854     }
 855     if (raddr != r10) {
 856       __ pop(r10);
 857     }
 858     if (raddr != r9) {
 859       __ pop(r9);
 860     }
 861     if (raddr != r8) {
 862       __ pop(r8);
 863     }
 864     if (raddr != rdi) {
 865       __ pop(rdi);
 866     }
 867     if (raddr != rsi) {
 868       __ pop(rsi);
 869     }
 870     if (raddr != rdx) {
 871       __ pop(rdx);
 872     }
 873     if (raddr != rcx) {
 874       __ pop(rcx);
 875     }
 876     if (raddr != rax) {
 877       __ movq(raddr, rax);
 878       __ pop(rax);
 879     }
 880 
 881     __ ret(0);
 882 
 883     return start;
 884   }
 885 
 886   address generate_f2i_fixup() {
 887     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 888     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 889 
 890     address start = __ pc();
 891 
 892     Label L;
 893 
 894     __ push(rax);
 895     __ push(c_rarg3);
 896     __ push(c_rarg2);
 897     __ push(c_rarg1);
 898 
 899     __ movl(rax, 0x7f800000);
 900     __ xorl(c_rarg3, c_rarg3);
 901     __ movl(c_rarg2, inout);
 902     __ movl(c_rarg1, c_rarg2);
 903     __ andl(c_rarg1, 0x7fffffff);
 904     __ cmpl(rax, c_rarg1); // NaN? -> 0
 905     __ jcc(Assembler::negative, L);
 906     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 907     __ movl(c_rarg3, 0x80000000);
 908     __ movl(rax, 0x7fffffff);
 909     __ cmovl(Assembler::positive, c_rarg3, rax);
 910 
 911     __ bind(L);
 912     __ movptr(inout, c_rarg3);
 913 
 914     __ pop(c_rarg1);
 915     __ pop(c_rarg2);
 916     __ pop(c_rarg3);
 917     __ pop(rax);
 918 
 919     __ ret(0);
 920 
 921     return start;
 922   }
 923 
 924   address generate_f2l_fixup() {
 925     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 926     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 927     address start = __ pc();
 928 
 929     Label L;
 930 
 931     __ push(rax);
 932     __ push(c_rarg3);
 933     __ push(c_rarg2);
 934     __ push(c_rarg1);
 935 
 936     __ movl(rax, 0x7f800000);
 937     __ xorl(c_rarg3, c_rarg3);
 938     __ movl(c_rarg2, inout);
 939     __ movl(c_rarg1, c_rarg2);
 940     __ andl(c_rarg1, 0x7fffffff);
 941     __ cmpl(rax, c_rarg1); // NaN? -> 0
 942     __ jcc(Assembler::negative, L);
 943     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 944     __ mov64(c_rarg3, 0x8000000000000000);
 945     __ mov64(rax, 0x7fffffffffffffff);
 946     __ cmov(Assembler::positive, c_rarg3, rax);
 947 
 948     __ bind(L);
 949     __ movptr(inout, c_rarg3);
 950 
 951     __ pop(c_rarg1);
 952     __ pop(c_rarg2);
 953     __ pop(c_rarg3);
 954     __ pop(rax);
 955 
 956     __ ret(0);
 957 
 958     return start;
 959   }
 960 
 961   address generate_d2i_fixup() {
 962     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 963     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 964 
 965     address start = __ pc();
 966 
 967     Label L;
 968 
 969     __ push(rax);
 970     __ push(c_rarg3);
 971     __ push(c_rarg2);
 972     __ push(c_rarg1);
 973     __ push(c_rarg0);
 974 
 975     __ movl(rax, 0x7ff00000);
 976     __ movq(c_rarg2, inout);
 977     __ movl(c_rarg3, c_rarg2);
 978     __ mov(c_rarg1, c_rarg2);
 979     __ mov(c_rarg0, c_rarg2);
 980     __ negl(c_rarg3);
 981     __ shrptr(c_rarg1, 0x20);
 982     __ orl(c_rarg3, c_rarg2);
 983     __ andl(c_rarg1, 0x7fffffff);
 984     __ xorl(c_rarg2, c_rarg2);
 985     __ shrl(c_rarg3, 0x1f);
 986     __ orl(c_rarg1, c_rarg3);
 987     __ cmpl(rax, c_rarg1);
 988     __ jcc(Assembler::negative, L); // NaN -> 0
 989     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 990     __ movl(c_rarg2, 0x80000000);
 991     __ movl(rax, 0x7fffffff);
 992     __ cmov(Assembler::positive, c_rarg2, rax);
 993 
 994     __ bind(L);
 995     __ movptr(inout, c_rarg2);
 996 
 997     __ pop(c_rarg0);
 998     __ pop(c_rarg1);
 999     __ pop(c_rarg2);
1000     __ pop(c_rarg3);
1001     __ pop(rax);
1002 
1003     __ ret(0);
1004 
1005     return start;
1006   }
1007 
1008   address generate_d2l_fixup() {
1009     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
1010     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1011 
1012     address start = __ pc();
1013 
1014     Label L;
1015 
1016     __ push(rax);
1017     __ push(c_rarg3);
1018     __ push(c_rarg2);
1019     __ push(c_rarg1);
1020     __ push(c_rarg0);
1021 
1022     __ movl(rax, 0x7ff00000);
1023     __ movq(c_rarg2, inout);
1024     __ movl(c_rarg3, c_rarg2);
1025     __ mov(c_rarg1, c_rarg2);
1026     __ mov(c_rarg0, c_rarg2);
1027     __ negl(c_rarg3);
1028     __ shrptr(c_rarg1, 0x20);
1029     __ orl(c_rarg3, c_rarg2);
1030     __ andl(c_rarg1, 0x7fffffff);
1031     __ xorl(c_rarg2, c_rarg2);
1032     __ shrl(c_rarg3, 0x1f);
1033     __ orl(c_rarg1, c_rarg3);
1034     __ cmpl(rax, c_rarg1);
1035     __ jcc(Assembler::negative, L); // NaN -> 0
1036     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
1037     __ mov64(c_rarg2, 0x8000000000000000);
1038     __ mov64(rax, 0x7fffffffffffffff);
1039     __ cmovq(Assembler::positive, c_rarg2, rax);
1040 
1041     __ bind(L);
1042     __ movq(inout, c_rarg2);
1043 
1044     __ pop(c_rarg0);
1045     __ pop(c_rarg1);
1046     __ pop(c_rarg2);
1047     __ pop(c_rarg3);
1048     __ pop(rax);
1049 
1050     __ ret(0);
1051 
1052     return start;
1053   }
1054 
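       // Emit a 16-byte constant: the given 64-bit mask replicated twice.  Such constants
       // are typically used as 128-bit memory operands of SSE and/xor instructions, e.g.
       // to clear or flip sign bits when computing float/double abs and negate.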
1055   address generate_fp_mask(const char *stub_name, int64_t mask) {
1056     __ align(CodeEntryAlignment);
1057     StubCodeMark mark(this, "StubRoutines", stub_name);
1058     address start = __ pc();
1059 
1060     __ emit_data64( mask, relocInfo::none );
1061     __ emit_data64( mask, relocInfo::none );
1062 
1063     return start;
1064   }
1065 
1066   // Non-destructive plausibility checks for oops
1067   //
1068   // Arguments:
1069   //    all args on stack!
1070   //
1071   // Stack after saving c_rarg3:
1072   //    [tos + 0]: saved c_rarg3
1073   //    [tos + 1]: saved c_rarg2
1074   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
1075   //    [tos + 3]: saved flags
1076   //    [tos + 4]: return address
1077   //  * [tos + 5]: error message (char*)
1078   //  * [tos + 6]: object to verify (oop)
1079   //  * [tos + 7]: saved rax - saved by caller and bashed
1080   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
1081   //  * = popped on exit
1082   address generate_verify_oop() {
1083     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1084     address start = __ pc();
1085 
1086     Label exit, error;
1087 
1088     __ pushf();
1089     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1090 
1091     __ push(r12);
1092 
1093     // save c_rarg2 and c_rarg3
1094     __ push(c_rarg2);
1095     __ push(c_rarg3);
1096 
1097     enum {
1098            // After previous pushes.
1099            oop_to_verify = 6 * wordSize,
1100            saved_rax     = 7 * wordSize,
1101            saved_r10     = 8 * wordSize,
1102 
1103            // Before the call to MacroAssembler::debug(), see below.
1104            return_addr   = 16 * wordSize,
1105            error_msg     = 17 * wordSize
1106     };
1107 
1108     // get object
1109     __ movptr(rax, Address(rsp, oop_to_verify));
1110 
1111     // make sure object is 'reasonable'
1112     __ testptr(rax, rax);
1113     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1114 
1115     if (UseLoadBarrier) {
1116       // Check if metadata bits indicate a bad oop
1117       __ testptr(rax, ExternalAddress((address)&ZAddressBadMask));
1118       __ jcc(Assembler::notZero, error);
1119     }
1120 
1121     // Check if the oop is in the right area of memory
1122     __ movptr(c_rarg2, rax);
1123     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1124     __ andptr(c_rarg2, c_rarg3);
1125     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1126     __ cmpptr(c_rarg2, c_rarg3);
1127     __ jcc(Assembler::notZero, error);
1128 
1129     // set r12 to heapbase for load_klass()
1130     __ reinit_heapbase();
1131 
1132     // make sure klass is 'reasonable', i.e. not NULL.
1133     __ load_klass(rax, rax);  // get klass
1134     __ testptr(rax, rax);
1135     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1136 
1137     // return if everything seems ok
1138     __ bind(exit);
1139     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1140     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1141     __ pop(c_rarg3);                             // restore c_rarg3
1142     __ pop(c_rarg2);                             // restore c_rarg2
1143     __ pop(r12);                                 // restore r12
1144     __ popf();                                   // restore flags
1145     __ ret(4 * wordSize);                        // pop caller saved stuff
1146 
1147     // handle errors
1148     __ bind(error);
1149     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1150     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1151     __ pop(c_rarg3);                             // get saved c_rarg3 back
1152     __ pop(c_rarg2);                             // get saved c_rarg2 back
1153     __ pop(r12);                                 // get saved r12 back
1154     __ popf();                                   // get saved flags off stack --
1155                                                  // will be ignored
1156 
1157     __ pusha();                                  // push registers
1158                                                  // (rip is already
1159                                                  // pushed)
1160     // debug(char* msg, int64_t pc, int64_t regs[])
1161     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1162     // pushed all the registers, so now the stack looks like:
1163     //     [tos +  0] 16 saved registers
1164     //     [tos + 16] return address
1165     //   * [tos + 17] error message (char*)
1166     //   * [tos + 18] object to verify (oop)
1167     //   * [tos + 19] saved rax - saved by caller and bashed
1168     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1169     //   * = popped on exit
1170 
1171     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1172     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1173     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1174     __ mov(r12, rsp);                               // remember rsp
1175     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1176     __ andptr(rsp, -16);                            // align stack as required by ABI
1177     BLOCK_COMMENT("call MacroAssembler::debug");
1178     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1179     __ mov(rsp, r12);                               // restore rsp
1180     __ popa();                                      // pop registers (includes r12)
1181     __ ret(4 * wordSize);                           // pop caller saved stuff
1182 
1183     return start;
1184   }
1185 
1186   //
1187   // Verify that a register contains a clean 32-bit positive value
1188   // (high 32 bits are 0) so it can be used in 64-bit shifts.
1189   //
1190   //  Input:
1191   //    Rint  -  32-bit value
1192   //    Rtmp  -  scratch
1193   //
1194   void assert_clean_int(Register Rint, Register Rtmp) {
1195 #ifdef ASSERT
1196     Label L;
1197     assert_different_registers(Rtmp, Rint);
1198     __ movslq(Rtmp, Rint);
1199     __ cmpq(Rtmp, Rint);
1200     __ jcc(Assembler::equal, L);
1201     __ stop("high 32-bits of int value are not 0");
1202     __ bind(L);
1203 #endif
1204   }
1205 
1206   //  Generate overlap test for array copy stubs
1207   //
1208   //  Input:
1209   //     c_rarg0 - from
1210   //     c_rarg1 - to
1211   //     c_rarg2 - element count
1212   //
1213   //  Output:
1214   //     rax   - &from[element count], i.e. the first address past the source data
1215   //
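       //  A forward (disjoint-style) copy is taken when
       //     to <= from                          (destination at or below the source), or
       //     to >= from + count * element_size   (destination entirely above the source);
       //  otherwise control falls through to the conjoint (backward) copy code.
       //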
1216   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1217     assert(no_overlap_target != NULL, "must be generated");
1218     array_overlap_test(no_overlap_target, NULL, sf);
1219   }
1220   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1221     array_overlap_test(NULL, &L_no_overlap, sf);
1222   }
1223   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1224     const Register from     = c_rarg0;
1225     const Register to       = c_rarg1;
1226     const Register count    = c_rarg2;
1227     const Register end_from = rax;
1228 
1229     __ cmpptr(to, from);
1230     __ lea(end_from, Address(from, count, sf, 0));
1231     if (NOLp == NULL) {
1232       ExternalAddress no_overlap(no_overlap_target);
1233       __ jump_cc(Assembler::belowEqual, no_overlap);
1234       __ cmpptr(to, end_from);
1235       __ jump_cc(Assembler::aboveEqual, no_overlap);
1236     } else {
1237       __ jcc(Assembler::belowEqual, (*NOLp));
1238       __ cmpptr(to, end_from);
1239       __ jcc(Assembler::aboveEqual, (*NOLp));
1240     }
1241   }
1242 
1243   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1244   //
1245   // Outputs:
1246   //    rdi - rcx
1247   //    rsi - rdx
1248   //    rdx - r8
1249   //    rcx - r9
1250   //
1251   // On Windows, rdi and rsi are non-volatile, so they are saved in r9 and r10
1252   // before being overwritten.  r9 and r10 should not be used by the caller.
1253   //
1254   void setup_arg_regs(int nargs = 3) {
1255     const Register saved_rdi = r9;
1256     const Register saved_rsi = r10;
1257     assert(nargs == 3 || nargs == 4, "else fix");
1258 #ifdef _WIN64
1259     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1260            "unexpected argument registers");
1261     if (nargs >= 4)
1262       __ mov(rax, r9);  // r9 is also saved_rdi
1263     __ movptr(saved_rdi, rdi);
1264     __ movptr(saved_rsi, rsi);
1265     __ mov(rdi, rcx); // c_rarg0
1266     __ mov(rsi, rdx); // c_rarg1
1267     __ mov(rdx, r8);  // c_rarg2
1268     if (nargs >= 4)
1269       __ mov(rcx, rax); // c_rarg3 (via rax)
1270 #else
1271     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1272            "unexpected argument registers");
1273 #endif
1274   }
1275 
1276   void restore_arg_regs() {
1277     const Register saved_rdi = r9;
1278     const Register saved_rsi = r10;
1279 #ifdef _WIN64
1280     __ movptr(rdi, saved_rdi);
1281     __ movptr(rsi, saved_rsi);
1282 #endif
1283   }
1284 
1285   // Generate code for a load barrier over a reference array
1286   //
1287   //     addr    -  starting address
1288   //     count   -  element count
1289   //
1290   //     Destroy no registers!
1291   //
1292   void gen_load_ref_array_barrier(Register addr, Register count) {
1293     BarrierSet* bs = Universe::heap()->barrier_set();
1294     switch (bs->kind()) {
1295       case BarrierSet::Z:
1296         __ pusha();                      // push registers
1297         if (count == c_rarg0) {
1298           if (addr == c_rarg1) {
1299             // exactly backwards!!
1300             __ xchgptr(c_rarg1, c_rarg0);
1301           } else {
1302             __ movptr(c_rarg1, count);
1303             __ movptr(c_rarg0, addr);
1304           }
1305         } else {
1306           __ movptr(c_rarg0, addr);
1307           __ movptr(c_rarg1, count);
1308         }
1309         __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<void (*)(volatile oop*, size_t)>(ZBarrier::load_barrier_on_oop_array)), 2);
1310         __ popa();
1311         break;
1312       case BarrierSet::G1SATBCTLogging:
1313       case BarrierSet::CardTableModRef:
1314       case BarrierSet::CardTableForRS:
1315       case BarrierSet::CardTableExtension:
1316       case BarrierSet::ModRef:
1317         // No barrier
1318         break;
1319       default:
1320         ShouldNotReachHere();
1321         break;
1322     }
1323   }
1324 
1325   // Generate code for an array write pre barrier
1326   //
1327   //     addr    -  starting address
1328   //     count   -  element count
1329   //     dest_uninitialized - true if the destination is known to be uninitialized
1330   //
1331   //     Destroy no registers!
1332   //
1333   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1334     BarrierSet* bs = Universe::heap()->barrier_set();
1335     switch (bs->kind()) {
1336       case BarrierSet::G1SATBCTLogging:
1337         // With G1, don't generate the call if we statically know that the target is uninitialized
1338         if (!dest_uninitialized) {
1339           Label filtered;
1340           Address in_progress(r15_thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1341                                                    SATBMarkQueue::byte_offset_of_active()));
1342           // Is marking active?
1343           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
1344             __ cmpl(in_progress, 0);
1345           } else {
1346             assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
1347             __ cmpb(in_progress, 0);
1348           }
1349           __ jcc(Assembler::equal, filtered);
1350 
1351            __ pusha();                      // push registers
1352            if (count == c_rarg0) {
1353              if (addr == c_rarg1) {
1354                // exactly backwards!!
1355                __ xchgptr(c_rarg1, c_rarg0);
1356              } else {
1357                __ movptr(c_rarg1, count);
1358                __ movptr(c_rarg0, addr);
1359              }
1360            } else {
1361              __ movptr(c_rarg0, addr);
1362              __ movptr(c_rarg1, count);
1363            }
1364            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
1365            __ popa();
1366 
1367            __ bind(filtered);
1368         }
1369          break;
1370       case BarrierSet::CardTableForRS:
1371       case BarrierSet::CardTableExtension:
1372       case BarrierSet::ModRef:
1373       case BarrierSet::Z:
1374         // No barrier
1375         break;
1376       default:
1377         ShouldNotReachHere();
1378 
1379     }
1380   }
1381 
1382   //
1383   // Generate code for an array write post barrier
1384   //
1385   //  Input:
1386   //     start    - register containing starting address of destination array
1387   //     count    - elements count
1388   //     scratch  - scratch register
1389   //
1390   //  The input registers are overwritten.
1391   //
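       //  For the card-table cases below, this dirties every card spanned by the
       //  destination range, conceptually:
       //
       //    for (c = (uintptr_t)start >> card_shift;
       //         c <= ((uintptr_t)start + count * heapOopSize - 1) >> card_shift; c++)
       //      byte_map_base[c] = 0;   // 0 == dirty
       //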
1392   void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
1393     assert_different_registers(start, count, scratch);
1394     BarrierSet* bs = Universe::heap()->barrier_set();
1395     switch (bs->kind()) {
1396       case BarrierSet::G1SATBCTLogging:
1397         {
1398           __ pusha();             // push registers (overkill)
1399           if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
1400             assert_different_registers(c_rarg1, start);
1401             __ mov(c_rarg1, count);
1402             __ mov(c_rarg0, start);
1403           } else {
1404             assert_different_registers(c_rarg0, count);
1405             __ mov(c_rarg0, start);
1406             __ mov(c_rarg1, count);
1407           }
1408           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
1409           __ popa();
1410         }
1411         break;
1412       case BarrierSet::CardTableForRS:
1413       case BarrierSet::CardTableExtension:
1414         {
1415           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
1416           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1417 
1418           Label L_loop, L_done;
1419           const Register end = count;
1420 
1421           __ testl(count, count);
1422           __ jcc(Assembler::zero, L_done); // zero count - nothing to do
1423 
1424           __ leaq(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
1425           __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
1426           __ shrptr(start, CardTableModRefBS::card_shift);
1427           __ shrptr(end,   CardTableModRefBS::card_shift);
1428           __ subptr(end, start); // end --> cards count
1429 
1430           int64_t disp = (int64_t) ct->byte_map_base;
1431           __ mov64(scratch, disp);
1432           __ addptr(start, scratch);
1433         __ BIND(L_loop);
1434           __ movb(Address(start, count, Address::times_1), 0);
1435           __ decrement(count);
1436           __ jcc(Assembler::greaterEqual, L_loop);
1437         __ BIND(L_done);
1438         }
1439         break;
1440       case BarrierSet::Z:
1441         // No barrier
1442         break;
1443       default:
1444         ShouldNotReachHere();
1445 
1446     }
1447   }
1448 
1449 
1450   // Copy big chunks forward
1451   //
1452   // Inputs:
1453   //   end_from     - source array's end address
1454   //   end_to       - destination array's end address
1455   //   qword_count  - 64-bit element count, negative
1456   //   to           - scratch
1457   //   L_copy_bytes - entry label
1458   //   L_copy_8_bytes  - exit  label
1459   //
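       //  qword_count is negative and is incremented towards zero, so addresses of the
       //  form Address(end_from, qword_count, Address::times_8, disp) walk forward
       //  through the data; whatever remains when the bulk loop exits is copied by the
       //  caller's L_copy_8_bytes loop.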
1460   void copy_bytes_forward(Register end_from, Register end_to,
1461                              Register qword_count, Register to,
1462                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1463     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1464     Label L_loop;
1465     __ align(OptoLoopAlignment);
1466     if (UseUnalignedLoadStores) {
1467       Label L_end;
1468       if (UseAVX > 2) {
1469         __ movl(to, 0xffff);
1470         __ kmovwl(k1, to);
1471       }
1472       // Copy 64-bytes per iteration
1473       __ BIND(L_loop);
1474       if (UseAVX > 2) {
1475         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1476         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1477       } else if (UseAVX == 2) {
1478         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1479         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1480         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1481         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1482       } else {
1483         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1484         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1485         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1486         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1487         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1488         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1489         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1490         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1491       }
1492       __ BIND(L_copy_bytes);
1493       __ addptr(qword_count, 8);
1494       __ jcc(Assembler::lessEqual, L_loop);
1495       __ subptr(qword_count, 4);  // sub(8) and add(4)
1496       __ jccb(Assembler::greater, L_end);
1497       // Copy trailing 32 bytes
1498       if (UseAVX >= 2) {
1499         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1500         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1501       } else {
1502         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1503         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1504         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1505         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1506       }
1507       __ addptr(qword_count, 4);
1508       __ BIND(L_end);
1509       if (UseAVX >= 2) {
1510         // clean upper bits of YMM registers
1511         __ vpxor(xmm0, xmm0);
1512         __ vpxor(xmm1, xmm1);
1513       }
1514     } else {
1515       // Copy 32-bytes per iteration
1516       __ BIND(L_loop);
1517       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1518       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1519       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1520       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1521       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1522       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1523       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1524       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1525 
1526       __ BIND(L_copy_bytes);
1527       __ addptr(qword_count, 4);
1528       __ jcc(Assembler::lessEqual, L_loop);
1529     }
1530     __ subptr(qword_count, 4);
1531     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1532   }
1533 
1534   // Copy big chunks backward
1535   //
1536   // Inputs:
1537   //   from         - source array's address
1538   //   dest         - destination array's address
1539   //   qword_count  - 64-bit element count
1540   //   to           - scratch
1541   //   L_copy_bytes - entry label
1542   //   L_copy_8_bytes  - exit  label
1543   //
1544   void copy_bytes_backward(Register from, Register dest,
1545                               Register qword_count, Register to,
1546                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1547     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1548     Label L_loop;
1549     __ align(OptoLoopAlignment);
1550     if (UseUnalignedLoadStores) {
1551       Label L_end;
1552       if (UseAVX > 2) {
1553         __ movl(to, 0xffff);
1554         __ kmovwl(k1, to);
1555       }
1556       // Copy 64-bytes per iteration
1557       __ BIND(L_loop);
1558       if (UseAVX > 2) {
1559         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1560         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1561       } else if (UseAVX == 2) {
1562         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1563         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1564         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1565         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1566       } else {
1567         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1568         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1569         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1570         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1571         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1572         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1573         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1574         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1575       }
1576       __ BIND(L_copy_bytes);
1577       __ subptr(qword_count, 8);
1578       __ jcc(Assembler::greaterEqual, L_loop);
1579 
1580       __ addptr(qword_count, 4);  // add(8) and sub(4)
1581       __ jccb(Assembler::less, L_end);
1582       // Copy trailing 32 bytes
1583       if (UseAVX >= 2) {
1584         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1585         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1586       } else {
1587         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1588         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1589         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1590         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1591       }
1592       __ subptr(qword_count, 4);
1593       __ BIND(L_end);
1594       if (UseAVX >= 2) {
1595         // clean upper bits of YMM registers
1596         __ vpxor(xmm0, xmm0);
1597         __ vpxor(xmm1, xmm1);
1598       }
1599     } else {
1600       // Copy 32-bytes per iteration
1601       __ BIND(L_loop);
1602       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1603       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1604       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1605       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1606       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1607       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1608       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1609       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1610 
1611       __ BIND(L_copy_bytes);
1612       __ subptr(qword_count, 4);
1613       __ jcc(Assembler::greaterEqual, L_loop);
1614     }
1615     __ addptr(qword_count, 4);
1616     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1617   }
1618 
1619 
1620   // Arguments:
1621   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1622   //             ignored
1623   //   name    - stub name string
1624   //
1625   // Inputs:
1626   //   c_rarg0   - source array address
1627   //   c_rarg1   - destination array address
1628   //   c_rarg2   - element count, treated as ssize_t, can be zero
1629   //
1630   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1631   // we let the hardware handle it.  The one to eight bytes within words,
1632   // dwords or qwords that span cache line boundaries will still be loaded
1633   // and stored atomically.
1634   //
1635   // Side Effects:
1636   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1637   //   used by generate_conjoint_byte_copy().
1638   //
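       // Roughly, the stub is equivalent to this C-level sketch (illustrative only,
       // not generated code; the helper name is made up for the sketch):
       //
       //   size_t qwords = count >> 3;
       //   copy_qwords_forward(from, to, qwords);       // bulk loop + trailing qwords
       //   from += qwords * 8;  to += qwords * 8;
       //   if (count & 4) { copy 4 bytes; from += 4; to += 4; }
       //   if (count & 2) { copy 2 bytes; from += 2; to += 2; }
       //   if (count & 1) { copy 1 byte;  }
       //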
1639   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1640     __ align(CodeEntryAlignment);
1641     StubCodeMark mark(this, "StubRoutines", name);
1642     address start = __ pc();
1643 
1644     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1645     Label L_copy_byte, L_exit;
1646     const Register from        = rdi;  // source array address
1647     const Register to          = rsi;  // destination array address
1648     const Register count       = rdx;  // elements count
1649     const Register byte_count  = rcx;
1650     const Register qword_count = count;
1651     const Register end_from    = from; // source array end address
1652     const Register end_to      = to;   // destination array end address
1653     // End pointers are inclusive, and if count is not zero they point
1654     // to the last unit copied:  end_to[0] := end_from[0]
1655 
1656     __ enter(); // required for proper stackwalking of RuntimeStub frame
1657     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1658 
1659     if (entry != NULL) {
1660       *entry = __ pc();
1661        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1662       BLOCK_COMMENT("Entry:");
1663     }
1664 
1665     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1666                       // r9 and r10 may be used to save non-volatile registers
1667 
1668     // 'from', 'to' and 'count' are now valid
1669     __ movptr(byte_count, count);
1670     __ shrptr(count, 3); // count => qword_count
1671 
1672     // Copy from low to high addresses.  Use 'to' as scratch.
1673     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1674     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1675     __ negptr(qword_count); // make the count negative
1676     __ jmp(L_copy_bytes);
1677 
1678     // Copy trailing qwords
1679   __ BIND(L_copy_8_bytes);
1680     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1681     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1682     __ increment(qword_count);
1683     __ jcc(Assembler::notZero, L_copy_8_bytes);
1684 
1685     // Check for and copy trailing dword
1686   __ BIND(L_copy_4_bytes);
1687     __ testl(byte_count, 4);
1688     __ jccb(Assembler::zero, L_copy_2_bytes);
1689     __ movl(rax, Address(end_from, 8));
1690     __ movl(Address(end_to, 8), rax);
1691 
1692     __ addptr(end_from, 4);
1693     __ addptr(end_to, 4);
1694 
1695     // Check for and copy trailing word
1696   __ BIND(L_copy_2_bytes);
1697     __ testl(byte_count, 2);
1698     __ jccb(Assembler::zero, L_copy_byte);
1699     __ movw(rax, Address(end_from, 8));
1700     __ movw(Address(end_to, 8), rax);
1701 
1702     __ addptr(end_from, 2);
1703     __ addptr(end_to, 2);
1704 
1705     // Check for and copy trailing byte
1706   __ BIND(L_copy_byte);
1707     __ testl(byte_count, 1);
1708     __ jccb(Assembler::zero, L_exit);
1709     __ movb(rax, Address(end_from, 8));
1710     __ movb(Address(end_to, 8), rax);
1711 
1712   __ BIND(L_exit);
1713     restore_arg_regs();
1714     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1715     __ xorptr(rax, rax); // return 0
1716     __ vzeroupper();
1717     __ leave(); // required for proper stackwalking of RuntimeStub frame
1718     __ ret(0);
1719 
1720     // Copy in multi-byte chunks
1721     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1722     __ jmp(L_copy_4_bytes);
1723 
1724     return start;
1725   }
1726 
1727   // Arguments:
1728   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1729   //             ignored
1730   //   name    - stub name string
1731   //
1732   // Inputs:
1733   //   c_rarg0   - source array address
1734   //   c_rarg1   - destination array address
1735   //   c_rarg2   - element count, treated as ssize_t, can be zero
1736   //
1737   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1738   // we let the hardware handle it.  The one to eight bytes within words,
1739   // dwords or qwords that span cache line boundaries will still be loaded
1740   // and stored atomically.
1741   //
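       // Roughly, the stub is equivalent to this C-level sketch (illustrative only,
       // not generated code).  Because the arrays may overlap, everything is copied
       // from high addresses toward low addresses:
       //
       //   if (count & 1) copy the last byte;
       //   if (count & 2) copy the last word;
       //   if (count & 4) copy the last dword;
       //   copy_qwords_backward(from, to, count >> 3);  // bulk loop + trailing qwords
       //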
1742   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1743                                       address* entry, const char *name) {
1744     __ align(CodeEntryAlignment);
1745     StubCodeMark mark(this, "StubRoutines", name);
1746     address start = __ pc();
1747 
1748     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1749     const Register from        = rdi;  // source array address
1750     const Register to          = rsi;  // destination array address
1751     const Register count       = rdx;  // elements count
1752     const Register byte_count  = rcx;
1753     const Register qword_count = count;
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1757 
1758     if (entry != NULL) {
1759       *entry = __ pc();
1760       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1761       BLOCK_COMMENT("Entry:");
1762     }
1763 
1764     array_overlap_test(nooverlap_target, Address::times_1);
1765     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1766                       // r9 and r10 may be used to save non-volatile registers
1767 
1768     // 'from', 'to' and 'count' are now valid
1769     __ movptr(byte_count, count);
1770     __ shrptr(count, 3);   // count => qword_count
1771 
1772     // Copy from high to low addresses.
1773 
1774     // Check for and copy trailing byte
1775     __ testl(byte_count, 1);
1776     __ jcc(Assembler::zero, L_copy_2_bytes);
1777     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1778     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1779     __ decrement(byte_count); // Adjust for possible trailing word
1780 
1781     // Check for and copy trailing word
1782   __ BIND(L_copy_2_bytes);
1783     __ testl(byte_count, 2);
1784     __ jcc(Assembler::zero, L_copy_4_bytes);
1785     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1786     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1787 
1788     // Check for and copy trailing dword
1789   __ BIND(L_copy_4_bytes);
1790     __ testl(byte_count, 4);
1791     __ jcc(Assembler::zero, L_copy_bytes);
1792     __ movl(rax, Address(from, qword_count, Address::times_8));
1793     __ movl(Address(to, qword_count, Address::times_8), rax);
1794     __ jmp(L_copy_bytes);
1795 
1796     // Copy trailing qwords
1797   __ BIND(L_copy_8_bytes);
1798     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1799     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1800     __ decrement(qword_count);
1801     __ jcc(Assembler::notZero, L_copy_8_bytes);
1802 
1803     restore_arg_regs();
1804     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1805     __ xorptr(rax, rax); // return 0
1806     __ vzeroupper();
1807     __ leave(); // required for proper stackwalking of RuntimeStub frame
1808     __ ret(0);
1809 
1810     // Copy in multi-byte chunks
1811     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1812 
1813     restore_arg_regs();
1814     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1815     __ xorptr(rax, rax); // return 0
1816     __ vzeroupper();
1817     __ leave(); // required for proper stackwalking of RuntimeStub frame
1818     __ ret(0);
1819 
1820     return start;
1821   }
1822 
1823   // Arguments:
1824   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1825   //             ignored
1826   //   name    - stub name string
1827   //
1828   // Inputs:
1829   //   c_rarg0   - source array address
1830   //   c_rarg1   - destination array address
1831   //   c_rarg2   - element count, treated as ssize_t, can be zero
1832   //
1833   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1834   // let the hardware handle it.  The two or four words within dwords
1835   // or qwords that span cache line boundaries will still be loaded
1836   // and stored atomically.
1837   //
1838   // Side Effects:
1839   //   disjoint_short_copy_entry is set to the no-overlap entry point
1840   //   used by generate_conjoint_short_copy().
1841   //
1842   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1843     __ align(CodeEntryAlignment);
1844     StubCodeMark mark(this, "StubRoutines", name);
1845     address start = __ pc();
1846 
1847     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1848     const Register from        = rdi;  // source array address
1849     const Register to          = rsi;  // destination array address
1850     const Register count       = rdx;  // elements count
1851     const Register word_count  = rcx;
1852     const Register qword_count = count;
1853     const Register end_from    = from; // source array end address
1854     const Register end_to      = to;   // destination array end address
1855     // End pointers are inclusive, and if count is not zero they point
1856     // to the last unit copied:  end_to[0] := end_from[0]
1857 
1858     __ enter(); // required for proper stackwalking of RuntimeStub frame
1859     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1860 
1861     if (entry != NULL) {
1862       *entry = __ pc();
1863       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1864       BLOCK_COMMENT("Entry:");
1865     }
1866 
1867     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1868                       // r9 and r10 may be used to save non-volatile registers
1869 
1870     // 'from', 'to' and 'count' are now valid
1871     __ movptr(word_count, count);
1872     __ shrptr(count, 2); // count => qword_count
1873 
1874     // Copy from low to high addresses.  Use 'to' as scratch.
1875     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1876     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1877     __ negptr(qword_count);
1878     __ jmp(L_copy_bytes);
1879 
1880     // Copy trailing qwords
1881   __ BIND(L_copy_8_bytes);
1882     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1883     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1884     __ increment(qword_count);
1885     __ jcc(Assembler::notZero, L_copy_8_bytes);
1886 
1887     // Original 'dest' is trashed, so we can't use it as a
1888     // base register for a possible trailing word copy
1889 
1890     // Check for and copy trailing dword
1891   __ BIND(L_copy_4_bytes);
1892     __ testl(word_count, 2);
1893     __ jccb(Assembler::zero, L_copy_2_bytes);
1894     __ movl(rax, Address(end_from, 8));
1895     __ movl(Address(end_to, 8), rax);
1896 
1897     __ addptr(end_from, 4);
1898     __ addptr(end_to, 4);
1899 
1900     // Check for and copy trailing word
1901   __ BIND(L_copy_2_bytes);
1902     __ testl(word_count, 1);
1903     __ jccb(Assembler::zero, L_exit);
1904     __ movw(rax, Address(end_from, 8));
1905     __ movw(Address(end_to, 8), rax);
1906 
1907   __ BIND(L_exit);
1908     restore_arg_regs();
1909     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1910     __ xorptr(rax, rax); // return 0
1911     __ vzeroupper();
1912     __ leave(); // required for proper stackwalking of RuntimeStub frame
1913     __ ret(0);
1914 
1915     // Copy in multi-byte chunks
1916     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1917     __ jmp(L_copy_4_bytes);
1918 
1919     return start;
1920   }
1921 
1922   address generate_fill(BasicType t, bool aligned, const char *name) {
1923     __ align(CodeEntryAlignment);
1924     StubCodeMark mark(this, "StubRoutines", name);
1925     address start = __ pc();
1926 
1927     BLOCK_COMMENT("Entry:");
1928 
1929     const Register to       = c_rarg0;  // destination array address
1930     const Register value    = c_rarg1;  // value
1931     const Register count    = c_rarg2;  // elements count
1932 
1933     __ enter(); // required for proper stackwalking of RuntimeStub frame
1934 
1935     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1936 
1937     __ vzeroupper();
1938     __ leave(); // required for proper stackwalking of RuntimeStub frame
1939     __ ret(0);
1940     return start;
1941   }
1942 
1943   // Arguments:
1944   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1945   //             ignored
1946   //   name    - stub name string
1947   //
1948   // Inputs:
1949   //   c_rarg0   - source array address
1950   //   c_rarg1   - destination array address
1951   //   c_rarg2   - element count, treated as ssize_t, can be zero
1952   //
1953   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1954   // let the hardware handle it.  The two or four words within dwords
1955   // or qwords that span cache line boundaries will still be loaded
1956   // and stored atomically.
1957   //
1958   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1959                                        address *entry, const char *name) {
1960     __ align(CodeEntryAlignment);
1961     StubCodeMark mark(this, "StubRoutines", name);
1962     address start = __ pc();
1963 
1964     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1965     const Register from        = rdi;  // source array address
1966     const Register to          = rsi;  // destination array address
1967     const Register count       = rdx;  // elements count
1968     const Register word_count  = rcx;
1969     const Register qword_count = count;
1970 
1971     __ enter(); // required for proper stackwalking of RuntimeStub frame
1972     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1973 
1974     if (entry != NULL) {
1975       *entry = __ pc();
1976       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1977       BLOCK_COMMENT("Entry:");
1978     }
1979 
1980     array_overlap_test(nooverlap_target, Address::times_2);
1981     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1982                       // r9 and r10 may be used to save non-volatile registers
1983 
1984     // 'from', 'to' and 'count' are now valid
1985     __ movptr(word_count, count);
1986     __ shrptr(count, 2); // count => qword_count
1987 
1988     // Copy from high to low addresses.  Use 'to' as scratch.
1989 
1990     // Check for and copy trailing word
1991     __ testl(word_count, 1);
1992     __ jccb(Assembler::zero, L_copy_4_bytes);
1993     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1994     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1995 
1996     // Check for and copy trailing dword
1997   __ BIND(L_copy_4_bytes);
1998     __ testl(word_count, 2);
1999     __ jcc(Assembler::zero, L_copy_bytes);
2000     __ movl(rax, Address(from, qword_count, Address::times_8));
2001     __ movl(Address(to, qword_count, Address::times_8), rax);
2002     __ jmp(L_copy_bytes);
2003 
2004     // Copy trailing qwords
2005   __ BIND(L_copy_8_bytes);
2006     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2007     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2008     __ decrement(qword_count);
2009     __ jcc(Assembler::notZero, L_copy_8_bytes);
2010 
2011     restore_arg_regs();
2012     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2013     __ xorptr(rax, rax); // return 0
2014     __ vzeroupper();
2015     __ leave(); // required for proper stackwalking of RuntimeStub frame
2016     __ ret(0);
2017 
2018     // Copy in multi-byte chunks
2019     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2020 
2021     restore_arg_regs();
2022     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2023     __ xorptr(rax, rax); // return 0
2024     __ vzeroupper();
2025     __ leave(); // required for proper stackwalking of RuntimeStub frame
2026     __ ret(0);
2027 
2028     return start;
2029   }
2030 
2031   // Arguments:
2032   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2033   //             ignored
2034   //   is_oop  - true => oop array, so generate store check code
2035   //   name    - stub name string
2036   //
2037   // Inputs:
2038   //   c_rarg0   - source array address
2039   //   c_rarg1   - destination array address
2040   //   c_rarg2   - element count, treated as ssize_t, can be zero
2041   //
2042   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2043   // the hardware handle it.  The two dwords within qwords that span
2044   // cache line boundaries will still be loaded and stored atomically.
2045   //
2046   // Side Effects:
2047   //   disjoint_int_copy_entry is set to the no-overlap entry point
2048   //   used by generate_conjoint_int_oop_copy().
2049   //
2050   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2051                                          const char *name, bool dest_uninitialized = false) {
2052     __ align(CodeEntryAlignment);
2053     StubCodeMark mark(this, "StubRoutines", name);
2054     address start = __ pc();
2055 
2056     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2057     const Register from        = rdi;  // source array address
2058     const Register to          = rsi;  // destination array address
2059     const Register count       = rdx;  // elements count
2060     const Register dword_count = rcx;
2061     const Register qword_count = count;
2062     const Register end_from    = from; // source array end address
2063     const Register end_to      = to;   // destination array end address
2064     const Register saved_to    = r11;  // saved destination array address
2065     // End pointers are inclusive, and if count is not zero they point
2066     // to the last unit copied:  end_to[0] := end_from[0]
2067 
2068     __ enter(); // required for proper stackwalking of RuntimeStub frame
2069     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2070 
2071     if (entry != NULL) {
2072       *entry = __ pc();
2073       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2074       BLOCK_COMMENT("Entry:");
2075     }
2076 
2077     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2078                       // r9 and r10 may be used to save non-volatile registers
2079     if (is_oop) {
2080       __ movq(saved_to, to);
2081       gen_load_ref_array_barrier(from, count);
2082       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2083     }
2084 
2085     // 'from', 'to' and 'count' are now valid
2086     __ movptr(dword_count, count);
2087     __ shrptr(count, 1); // count => qword_count
2088 
2089     // Copy from low to high addresses.  Use 'to' as scratch.
2090     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2091     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2092     __ negptr(qword_count);
2093     __ jmp(L_copy_bytes);
2094 
2095     // Copy trailing qwords
2096   __ BIND(L_copy_8_bytes);
2097     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2098     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2099     __ increment(qword_count);
2100     __ jcc(Assembler::notZero, L_copy_8_bytes);
2101 
2102     // Check for and copy trailing dword
2103   __ BIND(L_copy_4_bytes);
2104     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2105     __ jccb(Assembler::zero, L_exit);
2106     __ movl(rax, Address(end_from, 8));
2107     __ movl(Address(end_to, 8), rax);
2108 
2109   __ BIND(L_exit);
2110     if (is_oop) {
2111       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
2112     }
2113     restore_arg_regs();
2114     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2115     __ vzeroupper();
2116     __ xorptr(rax, rax); // return 0
2117     __ leave(); // required for proper stackwalking of RuntimeStub frame
2118     __ ret(0);
2119 
2120     // Copy in multi-byte chunks
2121     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2122     __ jmp(L_copy_4_bytes);
2123 
2124     return start;
2125   }
2126 
2127   // Arguments:
2128   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2129   //             ignored
2130   //   is_oop  - true => oop array, so generate store check code
2131   //   name    - stub name string
2132   //
2133   // Inputs:
2134   //   c_rarg0   - source array address
2135   //   c_rarg1   - destination array address
2136   //   c_rarg2   - element count, treated as ssize_t, can be zero
2137   //
2138   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2139   // the hardware handle it.  The two dwords within qwords that span
2140   // cache line boundaries will still be loaded and stored atomically.
2141   //
2142   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2143                                          address *entry, const char *name,
2144                                          bool dest_uninitialized = false) {
2145     __ align(CodeEntryAlignment);
2146     StubCodeMark mark(this, "StubRoutines", name);
2147     address start = __ pc();
2148 
2149     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
2150     const Register from        = rdi;  // source array address
2151     const Register to          = rsi;  // destination array address
2152     const Register count       = rdx;  // elements count
2153     const Register dword_count = rcx;
2154     const Register qword_count = count;
2155 
2156     __ enter(); // required for proper stackwalking of RuntimeStub frame
2157     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2158 
2159     if (entry != NULL) {
2160       *entry = __ pc();
2161        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2162       BLOCK_COMMENT("Entry:");
2163     }
2164 
2165     array_overlap_test(nooverlap_target, Address::times_4);
2166     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2167                       // r9 and r10 may be used to save non-volatile registers
2168 
2169     if (is_oop) {
2170       // no registers are destroyed by this call
2171       gen_load_ref_array_barrier(from, count);
2172       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2173     }
2174 
2175     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2176     // 'from', 'to' and 'count' are now valid
2177     __ movptr(dword_count, count);
2178     __ shrptr(count, 1); // count => qword_count
2179 
2180     // Copy from high to low addresses.  Use 'to' as scratch.
2181 
2182     // Check for and copy trailing dword
2183     __ testl(dword_count, 1);
2184     __ jcc(Assembler::zero, L_copy_bytes);
2185     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2186     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2187     __ jmp(L_copy_bytes);
2188 
2189     // Copy trailing qwords
2190   __ BIND(L_copy_8_bytes);
2191     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2192     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2193     __ decrement(qword_count);
2194     __ jcc(Assembler::notZero, L_copy_8_bytes);
2195 
2196     if (is_oop) {
2197       __ jmp(L_exit);
2198     }
2199     restore_arg_regs();
2200     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2201     __ xorptr(rax, rax); // return 0
2202     __ vzeroupper();
2203     __ leave(); // required for proper stackwalking of RuntimeStub frame
2204     __ ret(0);
2205 
2206     // Copy in multi-byte chunks
2207     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2208 
2209   __ BIND(L_exit);
2210     if (is_oop) {
2211       gen_write_ref_array_post_barrier(to, dword_count, rax);
2212     }
2213     restore_arg_regs();
2214     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2215     __ xorptr(rax, rax); // return 0
2216     __ vzeroupper();
2217     __ leave(); // required for proper stackwalking of RuntimeStub frame
2218     __ ret(0);
2219 
2220     return start;
2221   }
2222 
2223   // Arguments:
2224   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2225   //             ignored
2226   //   is_oop  - true => oop array, so generate store check code
2227   //   name    - stub name string
2228   //
2229   // Inputs:
2230   //   c_rarg0   - source array address
2231   //   c_rarg1   - destination array address
2232   //   c_rarg2   - element count, treated as ssize_t, can be zero
2233   //
2234   // Side Effects:
2235   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2236   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2237   //
2238   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2239                                           const char *name, bool dest_uninitialized = false) {
2240     __ align(CodeEntryAlignment);
2241     StubCodeMark mark(this, "StubRoutines", name);
2242     address start = __ pc();
2243 
2244     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2245     const Register from        = rdi;  // source array address
2246     const Register to          = rsi;  // destination array address
2247     const Register qword_count = rdx;  // elements count
2248     const Register end_from    = from; // source array end address
2249     const Register end_to      = rcx;  // destination array end address
2250     const Register saved_to    = to;
2251     const Register saved_count = r11;
2252     // End pointers are inclusive, and if count is not zero they point
2253     // to the last unit copied:  end_to[0] := end_from[0]
2254 
2255     __ enter(); // required for proper stackwalking of RuntimeStub frame
2256     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2257     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2258 
2259     if (entry != NULL) {
2260       *entry = __ pc();
2261       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2262       BLOCK_COMMENT("Entry:");
2263     }
2264 
2265     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2266                       // r9 and r10 may be used to save non-volatile registers
2267     // 'from', 'to' and 'qword_count' are now valid
2268     if (is_oop) {
2269       // Save to and count for store barrier
2270       __ movptr(saved_count, qword_count);
2271       // no registers are destroyed by this call
2272       gen_load_ref_array_barrier(from, qword_count);
2273       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2274     }
2275 
2276     // Copy from low to high addresses.  Use 'to' as scratch.
2277     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2278     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2279     __ negptr(qword_count);
2280     __ jmp(L_copy_bytes);
2281 
2282     // Copy trailing qwords
2283   __ BIND(L_copy_8_bytes);
2284     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2285     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2286     __ increment(qword_count);
2287     __ jcc(Assembler::notZero, L_copy_8_bytes);
2288 
2289     if (is_oop) {
2290       __ jmp(L_exit);
2291     } else {
2292       restore_arg_regs();
2293       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2294       __ xorptr(rax, rax); // return 0
2295       __ vzeroupper();
2296       __ leave(); // required for proper stackwalking of RuntimeStub frame
2297       __ ret(0);
2298     }
2299 
2300     // Copy in multi-byte chunks
2301     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2302 
2303     if (is_oop) {
2304     __ BIND(L_exit);
2305       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2306     }
2307     restore_arg_regs();
2308     if (is_oop) {
2309       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2310     } else {
2311       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2312     }
2313     __ vzeroupper();
2314     __ xorptr(rax, rax); // return 0
2315     __ leave(); // required for proper stackwalking of RuntimeStub frame
2316     __ ret(0);
2317 
2318     return start;
2319   }
2320 
2321   // Arguments:
2322   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2323   //             ignored
2324   //   is_oop  - true => oop array, so generate store check code
2325   //   name    - stub name string
2326   //
2327   // Inputs:
2328   //   c_rarg0   - source array address
2329   //   c_rarg1   - destination array address
2330   //   c_rarg2   - element count, treated as ssize_t, can be zero
2331   //
2332   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2333                                           address nooverlap_target, address *entry,
2334                                           const char *name, bool dest_uninitialized = false) {
2335     __ align(CodeEntryAlignment);
2336     StubCodeMark mark(this, "StubRoutines", name);
2337     address start = __ pc();
2338 
2339     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2340     const Register from        = rdi;  // source array address
2341     const Register to          = rsi;  // destination array address
2342     const Register qword_count = rdx;  // elements count
2343     const Register saved_count = rcx;
2344 
2345     __ enter(); // required for proper stackwalking of RuntimeStub frame
2346     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2347 
2348     if (entry != NULL) {
2349       *entry = __ pc();
2350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2351       BLOCK_COMMENT("Entry:");
2352     }
2353 
2354     array_overlap_test(nooverlap_target, Address::times_8);
2355     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2356                       // r9 and r10 may be used to save non-volatile registers
2357     // 'from', 'to' and 'qword_count' are now valid
2358     if (is_oop) {
2359       // Save to and count for store barrier
2360       __ movptr(saved_count, qword_count);
2361       // No registers are destroyed by this call
2362       gen_load_ref_array_barrier(from, saved_count);
2363       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2364     }
2365 
2366     __ jmp(L_copy_bytes);
2367 
2368     // Copy trailing qwords
2369   __ BIND(L_copy_8_bytes);
2370     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2371     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2372     __ decrement(qword_count);
2373     __ jcc(Assembler::notZero, L_copy_8_bytes);
2374 
2375     if (is_oop) {
2376       __ jmp(L_exit);
2377     } else {
2378       restore_arg_regs();
2379       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2380       __ xorptr(rax, rax); // return 0
2381       __ vzeroupper();
2382       __ leave(); // required for proper stackwalking of RuntimeStub frame
2383       __ ret(0);
2384     }
2385 
2386     // Copy in multi-byte chunks
2387     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2388 
2389     if (is_oop) {
2390     __ BIND(L_exit);
2391       gen_write_ref_array_post_barrier(to, saved_count, rax);
2392     }
2393     restore_arg_regs();
2394     if (is_oop) {
2395       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2396     } else {
2397       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2398     }
2399     __ vzeroupper();
2400     __ xorptr(rax, rax); // return 0
2401     __ leave(); // required for proper stackwalking of RuntimeStub frame
2402     __ ret(0);
2403 
2404     return start;
2405   }
2406 
2407 
2408   // Helper for generating a dynamic type check.
2409   // Smashes no registers.
2410   void generate_type_check(Register sub_klass,
2411                            Register super_check_offset,
2412                            Register super_klass,
2413                            Label& L_success) {
2414     assert_different_registers(sub_klass, super_check_offset, super_klass);
2415 
2416     BLOCK_COMMENT("type_check:");
2417 
2418     Label L_miss;
2419 
2420     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2421                                      super_check_offset);
2422     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2423 
2424     // Fall through on failure!
2425     __ BIND(L_miss);
2426   }
2427 
2428   //
2429   //  Generate checkcasting array copy stub
2430   //
2431   //  Input:
2432   //    c_rarg0   - source array address
2433   //    c_rarg1   - destination array address
2434   //    c_rarg2   - element count, treated as ssize_t, can be zero
2435   //    c_rarg3   - size_t ckoff (super_check_offset)
2436   // not Win64
2437   //    c_rarg4   - oop ckval (super_klass)
2438   // Win64
2439   //    rsp+40    - oop ckval (super_klass)
2440   //
2441   //  Output:
2442   //    rax ==  0  -  success
2443   //    rax == -1^K - failure, where K is partial transfer count
2444   //
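       //  A caller can decode the result as in this sketch (illustrative only;
       //  'checkcast_copy' stands for a call into this stub with the arguments
       //  listed above):
       //
       //    intptr_t rc = checkcast_copy(from, to, count, ckoff, ckval);
       //    if (rc == 0) {
       //      // all 'count' elements were copied
       //    } else {
       //      size_t copied = ~rc;   // == -1 - rc: elements stored before the failure
       //    }
       //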
2445   address generate_checkcast_copy(const char *name, address *entry,
2446                                   bool dest_uninitialized = false) {
2447 
2448     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2449 
2450     // Input registers (after setup_arg_regs)
2451     const Register from        = rdi;   // source array address
2452     const Register to          = rsi;   // destination array address
2453     const Register length      = rdx;   // elements count
2454     const Register ckoff       = rcx;   // super_check_offset
2455     const Register ckval       = r8;    // super_klass
2456 
2457     // Registers used as temps (r13, r14 are save-on-entry)
2458     const Register end_from    = from;  // source array end address
2459     const Register end_to      = r13;   // destination array end address
2460     const Register count       = rdx;   // -(count_remaining)
2461     const Register r14_length  = r14;   // saved copy of length
2462     // End pointers are inclusive, and if length is not zero they point
2463     // to the last unit copied:  end_to[0] := end_from[0]
2464 
2465     const Register rax_oop    = rax;    // actual oop copied
2466     const Register r11_klass  = r11;    // oop._klass
2467 
2468     //---------------------------------------------------------------
2469     // Assembler stub will be used for this call to arraycopy
2470     // if the two arrays are subtypes of Object[] but the
2471     // destination array type is not equal to or a supertype
2472     // of the source type.  Each element must be separately
2473     // checked.
2474 
2475     __ align(CodeEntryAlignment);
2476     StubCodeMark mark(this, "StubRoutines", name);
2477     address start = __ pc();
2478 
2479     __ enter(); // required for proper stackwalking of RuntimeStub frame
2480 
2481 #ifdef ASSERT
2482     // caller guarantees that the arrays really are different
2483     // otherwise, we would have to make conjoint checks
2484     { Label L;
2485       array_overlap_test(L, TIMES_OOP);
2486       __ stop("checkcast_copy within a single array");
2487       __ bind(L);
2488     }
2489 #endif //ASSERT
2490 
2491     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2492                        // ckoff => rcx, ckval => r8
2493                        // r9 and r10 may be used to save non-volatile registers
2494 #ifdef _WIN64
2495     // last argument (#4) is on stack on Win64
2496     __ movptr(ckval, Address(rsp, 6 * wordSize));
2497 #endif
2498 
2499     // Caller of this entry point must set up the argument registers.
2500     if (entry != NULL) {
2501       *entry = __ pc();
2502       BLOCK_COMMENT("Entry:");
2503     }
2504 
2505     // allocate spill slots for r13, r14
2506     enum {
2507       saved_r13_offset,
2508       saved_r14_offset,
2509       saved_rbp_offset
2510     };
2511     __ subptr(rsp, saved_rbp_offset * wordSize);
2512     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2513     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2514 
2515     // check that int operands are properly extended to size_t
2516     assert_clean_int(length, rax);
2517     assert_clean_int(ckoff, rax);
2518 
2519 #ifdef ASSERT
2520     BLOCK_COMMENT("assert consistent ckoff/ckval");
2521     // The ckoff and ckval must be mutually consistent,
2522     // even though caller generates both.
2523     { Label L;
2524       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2525       __ cmpl(ckoff, Address(ckval, sco_offset));
2526       __ jcc(Assembler::equal, L);
2527       __ stop("super_check_offset inconsistent");
2528       __ bind(L);
2529     }
2530 #endif //ASSERT
2531 
2532     // Loop-invariant addresses.  They are exclusive end pointers.
2533     Address end_from_addr(from, length, TIMES_OOP, 0);
2534     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2535     // Loop-variant addresses.  They assume post-incremented count < 0.
2536     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2537     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2538 
2539     gen_load_ref_array_barrier(from, count);
2540     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2541 
2542     // Copy from low to high addresses, indexed from the end of each array.
2543     __ lea(end_from, end_from_addr);
2544     __ lea(end_to,   end_to_addr);
2545     __ movptr(r14_length, length);        // save a copy of the length
2546     assert(length == count, "");          // else fix next line:
2547     __ negptr(count);                     // negate and test the length
2548     __ jcc(Assembler::notZero, L_load_element);
2549 
2550     // Empty array:  Nothing to do.
2551     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2552     __ jmp(L_done);
2553 
2554     // ======== begin loop ========
2555     // (Loop is rotated; its entry is L_load_element.)
2556     // Loop control:
2557     //   for (count = -count; count != 0; count++)
2558     // Base pointers src, dst are biased by 8*(count-1), to last element.
2559     __ align(OptoLoopAlignment);
2560 
2561     __ BIND(L_store_element);
2562     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2563     __ increment(count);               // increment the count toward zero
2564     __ jcc(Assembler::zero, L_do_card_marks);
2565 
2566     // ======== loop entry is here ========
2567     __ BIND(L_load_element);
2568     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2569     __ testptr(rax_oop, rax_oop);
2570     __ jcc(Assembler::zero, L_store_element);
2571 
2572     __ load_klass(r11_klass, rax_oop);// query the object klass
2573     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2574     // ======== end loop ========
2575 
2576     // It was a real error; we must depend on the caller to finish the job.
2577     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2578     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2579     // and report their number to the caller.
2580     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2581     Label L_post_barrier;
2582     __ addptr(r14_length, count);     // K = (original - remaining) oops
2583     __ movptr(rax, r14_length);       // save the value
2584     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2585     __ jccb(Assembler::notZero, L_post_barrier);
2586     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2587 
2588     // Come here on success only.
2589     __ BIND(L_do_card_marks);
2590     __ xorptr(rax, rax);              // return 0 on success
2591 
2592     __ BIND(L_post_barrier);
2593     gen_write_ref_array_post_barrier(to, r14_length, rscratch1);
2594 
2595     // Common exit point (success or failure).
2596     __ BIND(L_done);
2597     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2598     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2599     restore_arg_regs();
2600     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2601     __ leave(); // required for proper stackwalking of RuntimeStub frame
2602     __ ret(0);
2603 
2604     return start;
2605   }
2606 
2607   //
2608   //  Generate 'unsafe' array copy stub
2609   //  Though just as safe as the other stubs, it takes an unscaled
2610   //  size_t argument instead of an element count.
2611   //
2612   //  Input:
2613   //    c_rarg0   - source array address
2614   //    c_rarg1   - destination array address
2615   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2616   //
2617   // Examines the alignment of the operands and dispatches
2618   // to a long, int, short, or byte copy loop.
2619   //
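       //  Illustrative sketch of the dispatch (not generated code; the *_copy
       //  names stand for the entry points passed in below):
       //
       //    uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
       //    if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (from, to, size >> LogBytesPerLong);
       //    else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (from, to, size >> LogBytesPerInt);
       //    else if ((bits & (BytesPerShort - 1)) == 0) short_copy(from, to, size >> LogBytesPerShort);
       //    else                                        byte_copy (from, to, size);
       //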
2620   address generate_unsafe_copy(const char *name,
2621                                address byte_copy_entry, address short_copy_entry,
2622                                address int_copy_entry, address long_copy_entry) {
2623 
2624     Label L_long_aligned, L_int_aligned, L_short_aligned;
2625 
2626     // Input registers (before setup_arg_regs)
2627     const Register from        = c_rarg0;  // source array address
2628     const Register to          = c_rarg1;  // destination array address
2629     const Register size        = c_rarg2;  // byte count (size_t)
2630 
2631     // Register used as a temp
2632     const Register bits        = rax;      // test copy of low bits
2633 
2634     __ align(CodeEntryAlignment);
2635     StubCodeMark mark(this, "StubRoutines", name);
2636     address start = __ pc();
2637 
2638     __ enter(); // required for proper stackwalking of RuntimeStub frame
2639 
2640     // bump this on entry, not on exit:
2641     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2642 
2643     __ mov(bits, from);
2644     __ orptr(bits, to);
2645     __ orptr(bits, size);
2646 
2647     __ testb(bits, BytesPerLong-1);
2648     __ jccb(Assembler::zero, L_long_aligned);
2649 
2650     __ testb(bits, BytesPerInt-1);
2651     __ jccb(Assembler::zero, L_int_aligned);
2652 
2653     __ testb(bits, BytesPerShort-1);
2654     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2655 
2656     __ BIND(L_short_aligned);
2657     __ shrptr(size, LogBytesPerShort); // size => short_count
2658     __ jump(RuntimeAddress(short_copy_entry));
2659 
2660     __ BIND(L_int_aligned);
2661     __ shrptr(size, LogBytesPerInt); // size => int_count
2662     __ jump(RuntimeAddress(int_copy_entry));
2663 
2664     __ BIND(L_long_aligned);
2665     __ shrptr(size, LogBytesPerLong); // size => qword_count
2666     __ jump(RuntimeAddress(long_copy_entry));
2667 
2668     return start;
2669   }
2670 
2671   // Perform range checks on the proposed arraycopy.
2672   // Kills temp, but nothing else.
2673   // Also, clean the sign bits of src_pos and dst_pos.
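       // Illustrative C-level sketch of the checks (not generated code):
       //
       //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length())  goto L_failed;
       //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length())  goto L_failed;
       //   src_pos = (int32_t)src_pos;   // sign-extending moves clear the upper
       //   dst_pos = (int32_t)dst_pos;   // 32 bits, since both values are positive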
2674   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2675                               Register src_pos, // source position (c_rarg1)
2676                               Register dst,     // destination array oop (c_rarg2)
2677                               Register dst_pos, // destination position (c_rarg3)
2678                               Register length,
2679                               Register temp,
2680                               Label& L_failed) {
2681     BLOCK_COMMENT("arraycopy_range_checks:");
2682 
2683     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2684     __ movl(temp, length);
2685     __ addl(temp, src_pos);             // src_pos + length
2686     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2687     __ jcc(Assembler::above, L_failed);
2688 
2689     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2690     __ movl(temp, length);
2691     __ addl(temp, dst_pos);             // dst_pos + length
2692     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2693     __ jcc(Assembler::above, L_failed);
2694 
2695     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2696     // Move with sign extension can be used since they are positive.
2697     __ movslq(src_pos, src_pos);
2698     __ movslq(dst_pos, dst_pos);
2699 
2700     BLOCK_COMMENT("arraycopy_range_checks done");
2701   }
2702 
2703   //
2704   //  Generate generic array copy stubs
2705   //
2706   //  Input:
2707   //    c_rarg0    -  src oop
2708   //    c_rarg1    -  src_pos (32-bits)
2709   //    c_rarg2    -  dst oop
2710   //    c_rarg3    -  dst_pos (32-bits)
2711   // not Win64
2712   //    c_rarg4    -  element count (32-bits)
2713   // Win64
2714   //    rsp+40     -  element count (32-bits)
2715   //
2716   //  Output:
2717   //    rax ==  0  -  success
2718   //    rax == -1^K - failure, where K is partial transfer count
2719   //
2720   address generate_generic_copy(const char *name,
2721                                 address byte_copy_entry, address short_copy_entry,
2722                                 address int_copy_entry, address oop_copy_entry,
2723                                 address long_copy_entry, address checkcast_copy_entry) {
2724 
2725     Label L_failed, L_failed_0, L_objArray;
2726     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2727 
2728     // Input registers
2729     const Register src        = c_rarg0;  // source array oop
2730     const Register src_pos    = c_rarg1;  // source position
2731     const Register dst        = c_rarg2;  // destination array oop
2732     const Register dst_pos    = c_rarg3;  // destination position
2733 #ifndef _WIN64
2734     const Register length     = c_rarg4;
2735 #else
2736     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2737 #endif
2738 
2739     { int modulus = CodeEntryAlignment;
2740       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2741       int advance = target - (__ offset() % modulus);
2742       if (advance < 0)  advance += modulus;
2743       if (advance > 0)  __ nop(advance);
2744     }
2745     StubCodeMark mark(this, "StubRoutines", name);
2746 
2747     // Short-hop target to L_failed.  Makes for denser prologue code.
2748     __ BIND(L_failed_0);
2749     __ jmp(L_failed);
2750     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2751 
2752     __ align(CodeEntryAlignment);
2753     address start = __ pc();
2754 
2755     __ enter(); // required for proper stackwalking of RuntimeStub frame
2756 
2757     // bump this on entry, not on exit:
2758     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2759 
2760     //-----------------------------------------------------------------------
2761     // Assembler stub will be used for this call to arraycopy
2762     // if the following conditions are met:
2763     //
2764     // (1) src and dst must not be null.
2765     // (2) src_pos must not be negative.
2766     // (3) dst_pos must not be negative.
2767     // (4) length  must not be negative.
2768     // (5) src klass and dst klass should be the same and not NULL.
2769     // (6) src and dst should be arrays.
2770     // (7) src_pos + length must not exceed length of src.
2771     // (8) dst_pos + length must not exceed length of dst.
2772     //
2773 
2774     //  if (src == NULL) return -1;
2775     __ testptr(src, src);         // src oop
2776     size_t j1off = __ offset();
2777     __ jccb(Assembler::zero, L_failed_0);
2778 
2779     //  if (src_pos < 0) return -1;
2780     __ testl(src_pos, src_pos); // src_pos (32-bits)
2781     __ jccb(Assembler::negative, L_failed_0);
2782 
2783     //  if (dst == NULL) return -1;
2784     __ testptr(dst, dst);         // dst oop
2785     __ jccb(Assembler::zero, L_failed_0);
2786 
2787     //  if (dst_pos < 0) return -1;
2788     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2789     size_t j4off = __ offset();
2790     __ jccb(Assembler::negative, L_failed_0);
2791 
2792     // The first four tests are very dense code,
2793     // but not quite dense enough to put four
2794     // jumps in a 16-byte instruction fetch buffer.
2795     // That's good, because some branch predictors
2796     // do not like jumps so close together.
2797     // Make sure of this.
2798     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2799 
2800     // registers used as temp
2801     const Register r11_length    = r11; // elements count to copy
2802     const Register r10_src_klass = r10; // array klass
2803 
2804     //  if (length < 0) return -1;
2805     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2806     __ testl(r11_length, r11_length);
2807     __ jccb(Assembler::negative, L_failed_0);
2808 
2809     __ load_klass(r10_src_klass, src);
2810 #ifdef ASSERT
2811     //  assert(src->klass() != NULL);
2812     {
2813       BLOCK_COMMENT("assert klasses not null {");
2814       Label L1, L2;
2815       __ testptr(r10_src_klass, r10_src_klass);
2816       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2817       __ bind(L1);
2818       __ stop("broken null klass");
2819       __ bind(L2);
2820       __ load_klass(rax, dst);
2821       __ cmpq(rax, 0);
2822       __ jcc(Assembler::equal, L1);     // this would be broken also
2823       BLOCK_COMMENT("} assert klasses not null done");
2824     }
2825 #endif
2826 
2827     // Load layout helper (32-bits)
2828     //
2829     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2830     // 32        30    24            16              8     2                 0
2831     //
2832     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2833     //
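         //  Illustrative decode of the two fields used below (sketch only;
         //  the constants are the Klass::_lh_* values):
         //
         //    int header_size_in_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
         //    int log2_element_size    =  lh &  Klass::_lh_log2_element_size_mask;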
2834 
2835     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2836 
2837     // Handle objArrays completely differently...
2838     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2839     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2840     __ jcc(Assembler::equal, L_objArray);
2841 
2842     //  if (src->klass() != dst->klass()) return -1;
2843     __ load_klass(rax, dst);
2844     __ cmpq(r10_src_klass, rax);
2845     __ jcc(Assembler::notEqual, L_failed);
2846 
2847     const Register rax_lh = rax;  // layout helper
2848     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2849 
2850     //  if (!src->is_Array()) return -1;
2851     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2852     __ jcc(Assembler::greaterEqual, L_failed);
2853 
2854     // At this point, it is known to be a typeArray (array_tag 0x3).
2855 #ifdef ASSERT
2856     {
2857       BLOCK_COMMENT("assert primitive array {");
2858       Label L;
2859       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2860       __ jcc(Assembler::greaterEqual, L);
2861       __ stop("must be a primitive array");
2862       __ bind(L);
2863       BLOCK_COMMENT("} assert primitive array done");
2864     }
2865 #endif
2866 
2867     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2868                            r10, L_failed);
2869 
2870     // TypeArrayKlass
2871     //
2872     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2873     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2874     //
2875 
2876     const Register r10_offset = r10;    // array offset
2877     const Register rax_elsize = rax_lh; // element size
2878 
2879     __ movl(r10_offset, rax_lh);
2880     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2881     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2882     __ addptr(src, r10_offset);           // src array offset
2883     __ addptr(dst, r10_offset);           // dst array offset
2884     BLOCK_COMMENT("choose copy loop based on element size");
2885     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2886 
2887     // next registers should be set before the jump to corresponding stub
2888     const Register from     = c_rarg0;  // source array address
2889     const Register to       = c_rarg1;  // destination array address
2890     const Register count    = c_rarg2;  // elements count
2891 
2892     // The 'from', 'to' and 'count' registers must be set up in exactly this order,
2893     // since they occupy the same registers as 'src', 'src_pos' and 'dst'.
2894 
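         // In effect, the dispatch below is (illustrative sketch):
         //   switch (rax_elsize) {
         //     case 0:                goto byte_copy_entry;
         //     case LogBytesPerShort: goto short_copy_entry;
         //     case LogBytesPerInt:   goto int_copy_entry;
         //     default:               goto long_copy_entry;  // asserted to be LogBytesPerLong
         //   }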
2895   __ BIND(L_copy_bytes);
2896     __ cmpl(rax_elsize, 0);
2897     __ jccb(Assembler::notEqual, L_copy_shorts);
2898     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2899     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2900     __ movl2ptr(count, r11_length); // length
2901     __ jump(RuntimeAddress(byte_copy_entry));
2902 
2903   __ BIND(L_copy_shorts);
2904     __ cmpl(rax_elsize, LogBytesPerShort);
2905     __ jccb(Assembler::notEqual, L_copy_ints);
2906     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2907     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2908     __ movl2ptr(count, r11_length); // length
2909     __ jump(RuntimeAddress(short_copy_entry));
2910 
2911   __ BIND(L_copy_ints);
2912     __ cmpl(rax_elsize, LogBytesPerInt);
2913     __ jccb(Assembler::notEqual, L_copy_longs);
2914     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2915     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2916     __ movl2ptr(count, r11_length); // length
2917     __ jump(RuntimeAddress(int_copy_entry));
2918 
2919   __ BIND(L_copy_longs);
2920 #ifdef ASSERT
2921     {
2922       BLOCK_COMMENT("assert long copy {");
2923       Label L;
2924       __ cmpl(rax_elsize, LogBytesPerLong);
2925       __ jcc(Assembler::equal, L);
2926       __ stop("must be long copy, but elsize is wrong");
2927       __ bind(L);
2928       BLOCK_COMMENT("} assert long copy done");
2929     }
2930 #endif
2931     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2932     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2933     __ movl2ptr(count, r11_length); // length
2934     __ jump(RuntimeAddress(long_copy_entry));
2935 
2936     // ObjArrayKlass
2937   __ BIND(L_objArray);
2938     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2939 
2940     Label L_plain_copy, L_checkcast_copy;
2941     //  test array classes for subtyping
2942     __ load_klass(rax, dst);
2943     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2944     __ jcc(Assembler::notEqual, L_checkcast_copy);
2945 
2946     // Identically typed arrays can be copied without element-wise checks.
2947     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2948                            r10, L_failed);
2949 
2950     __ lea(from, Address(src, src_pos, TIMES_OOP,
2951                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2952     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2953                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2954     __ movl2ptr(count, r11_length); // length
2955   __ BIND(L_plain_copy);
2956     __ jump(RuntimeAddress(oop_copy_entry));
2957 
2958   __ BIND(L_checkcast_copy);
2959     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2960     {
2961       // Before looking at dst.length, make sure dst is also an objArray.
2962       __ cmpl(Address(rax, lh_offset), objArray_lh);
2963       __ jcc(Assembler::notEqual, L_failed);
2964 
2965       // It is safe to examine both src.length and dst.length.
2966       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2967                              rax, L_failed);
2968 
2969       const Register r11_dst_klass = r11;
2970       __ load_klass(r11_dst_klass, dst); // reload
2971 
2972       // Marshal the base address arguments now, freeing registers.
2973       __ lea(from, Address(src, src_pos, TIMES_OOP,
2974                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2975       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2976                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2977       __ movl(count, length);           // length (reloaded)
2978       Register sco_temp = c_rarg3;      // this register is free now
2979       assert_different_registers(from, to, count, sco_temp,
2980                                  r11_dst_klass, r10_src_klass);
2981       assert_clean_int(count, sco_temp);
2982 
2983       // Generate the type check.
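           // If the source array klass is a subtype of the destination array klass,
           // every element is necessarily assignable, so generate_type_check branches
           // to the unchecked copy at L_plain_copy; otherwise we fall through to the
           // element-wise checked copy set up below.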
2984       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2985       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2986       assert_clean_int(sco_temp, rax);
2987       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2988 
2989       // Fetch destination element klass from the ObjArrayKlass header.
2990       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2991       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2992       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2993       assert_clean_int(sco_temp, rax);
2994 
2995       // the checkcast_copy loop needs two extra arguments:
2996       assert(c_rarg3 == sco_temp, "#3 already in place");
2997       // Set up arguments for checkcast_copy_entry.
2998       setup_arg_regs(4);
2999       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3000       __ jump(RuntimeAddress(checkcast_copy_entry));
3001     }
3002 
3003   __ BIND(L_failed);
3004     __ xorptr(rax, rax);
3005     __ notptr(rax); // return -1
3006     __ leave();   // required for proper stackwalking of RuntimeStub frame
3007     __ ret(0);
3008 
3009     return start;
3010   }
3011 
3012   void generate_arraycopy_stubs() {
3013     address entry;
3014     address entry_jbyte_arraycopy;
3015     address entry_jshort_arraycopy;
3016     address entry_jint_arraycopy;
3017     address entry_oop_arraycopy;
3018     address entry_jlong_arraycopy;
3019     address entry_checkcast_arraycopy;
3020 
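         // Each '*entry' out-parameter records a secondary entry point into the
         // generated stub: a conjoint stub branches to the matching disjoint entry
         // when its ranges do not overlap, and the unsafe/generic copy dispatchers
         // below jump to the saved 'entry_*' points directly.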
3021     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3022                                                                            "jbyte_disjoint_arraycopy");
3023     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3024                                                                            "jbyte_arraycopy");
3025 
3026     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3027                                                                             "jshort_disjoint_arraycopy");
3028     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3029                                                                             "jshort_arraycopy");
3030 
3031     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3032                                                                               "jint_disjoint_arraycopy");
3033     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3034                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3035 
3036     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3037                                                                                "jlong_disjoint_arraycopy");
3038     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3039                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3040 
3041 
3042     if (UseCompressedOops) {
3043       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3044                                                                               "oop_disjoint_arraycopy");
3045       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3046                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3047       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3048                                                                                      "oop_disjoint_arraycopy_uninit",
3049                                                                                      /*dest_uninitialized*/true);
3050       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3051                                                                                      NULL, "oop_arraycopy_uninit",
3052                                                                                      /*dest_uninitialized*/true);
3053     } else {
3054       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3055                                                                                "oop_disjoint_arraycopy");
3056       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3057                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3058       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3059                                                                                       "oop_disjoint_arraycopy_uninit",
3060                                                                                       /*dest_uninitialized*/true);
3061       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3062                                                                                       NULL, "oop_arraycopy_uninit",
3063                                                                                       /*dest_uninitialized*/true);
3064     }
3065 
3066     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3067     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3068                                                                         /*dest_uninitialized*/true);
3069 
3070     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3071                                                               entry_jbyte_arraycopy,
3072                                                               entry_jshort_arraycopy,
3073                                                               entry_jint_arraycopy,
3074                                                               entry_jlong_arraycopy);
3075     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3076                                                                entry_jbyte_arraycopy,
3077                                                                entry_jshort_arraycopy,
3078                                                                entry_jint_arraycopy,
3079                                                                entry_oop_arraycopy,
3080                                                                entry_jlong_arraycopy,
3081                                                                entry_checkcast_arraycopy);
3082 
3083     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3084     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3085     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3086     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3087     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3088     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3089 
3090     // We don't generate specialized code for HeapWord-aligned source
3091     // arrays, so just use the code we've already generated
3092     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3093     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3094 
3095     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3096     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3097 
3098     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3099     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3100 
3101     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3102     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3103 
3104     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3105     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3106 
3107     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3108     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3109   }
3110 
3111   // AES intrinsic stubs
3112   enum {AESBlockSize = 16};
3113 
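       // The two stubs below emit pshufb control masks as data:
       //   key_shuffle_mask     reverses the bytes within each 32-bit word,
       //   counter_shuffle_mask reverses all 16 bytes of the block.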
3114   address generate_key_shuffle_mask() {
3115     __ align(16);
3116     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3117     address start = __ pc();
3118     __ emit_data64( 0x0405060700010203, relocInfo::none );
3119     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3120     return start;
3121   }
3122 
3123   address generate_counter_shuffle_mask() {
3124     __ align(16);
3125     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3126     address start = __ pc();
3127     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3128     __ emit_data64(0x0001020304050607, relocInfo::none);
3129     return start;
3130   }
3131 
3132   // Utility routine for loading a 128-bit key word in little-endian format.
3133   // Callers may optionally indicate that the shuffle mask is already in an XMM register.
3134   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3135     __ movdqu(xmmdst, Address(key, offset));
3136     if (xmm_shuf_mask != NULL) {
3137       __ pshufb(xmmdst, xmm_shuf_mask);
3138     } else {
3139       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3140     }
3141   }
3142 
3143   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
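       // (illustrative sketch: the counter lives in an XMM register as two 64-bit
       //  lanes lo/hi; we do  lo += inc_delta;  if (carry) hi += 1;)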
3144   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3145     __ pextrq(reg, xmmdst, 0x0);
3146     __ addq(reg, inc_delta);
3147     __ pinsrq(xmmdst, reg, 0x0);
3148     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3149     __ pextrq(reg, xmmdst, 0x01); // Carry
3150     __ addq(reg, 0x01);
3151     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3152     __ BIND(next_block);          // next instruction
3153   }
3154 
3155   // Arguments:
3156   //
3157   // Inputs:
3158   //   c_rarg0   - source byte array address
3159   //   c_rarg1   - destination byte array address
3160   //   c_rarg2   - K (key) in little endian int array
3161   //
3162   address generate_aescrypt_encryptBlock() {
3163     assert(UseAES, "need AES instructions and misaligned SSE support");
3164     __ align(CodeEntryAlignment);
3165     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3166     Label L_doLast;
3167     address start = __ pc();
3168 
3169     const Register from        = c_rarg0;  // source array address
3170     const Register to          = c_rarg1;  // destination array address
3171     const Register key         = c_rarg2;  // key array address
3172     const Register keylen      = rax;
3173 
3174     const XMMRegister xmm_result = xmm0;
3175     const XMMRegister xmm_key_shuf_mask = xmm1;
3176     // On win64 xmm6-xmm15 must be preserved so don't use them.
3177     const XMMRegister xmm_temp1  = xmm2;
3178     const XMMRegister xmm_temp2  = xmm3;
3179     const XMMRegister xmm_temp3  = xmm4;
3180     const XMMRegister xmm_temp4  = xmm5;
3181 
3182     __ enter(); // required for proper stackwalking of RuntimeStub frame
3183 
3184     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3185     // context for the registers used, where all instructions below are using 128-bit mode
3186     // On EVEX without VL and BW, these instructions will all be AVX.
3187     if (VM_Version::supports_avx512vlbw()) {
3188       __ movl(rax, 0xffff);
3189       __ kmovql(k1, rax);
3190     }
3191 
3192     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
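         // (AES-128/192/256 expands to 11/13/15 four-int round keys, i.e. 10/12/14
         //  rounds plus the initial whitening key, hence 44/52/60 ints.)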
3193     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3194 
3195     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3196     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3197 
3198     // For encryption, the Java-expanded key ordering is just what we need.
3199     // We don't know whether the key is aligned, hence we avoid the load-execute form.
3200 
3201     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3202     __ pxor(xmm_result, xmm_temp1);
3203 
3204     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3205     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3206     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3207     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3208 
3209     __ aesenc(xmm_result, xmm_temp1);
3210     __ aesenc(xmm_result, xmm_temp2);
3211     __ aesenc(xmm_result, xmm_temp3);
3212     __ aesenc(xmm_result, xmm_temp4);
3213 
3214     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3215     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3216     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3217     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3218 
3219     __ aesenc(xmm_result, xmm_temp1);
3220     __ aesenc(xmm_result, xmm_temp2);
3221     __ aesenc(xmm_result, xmm_temp3);
3222     __ aesenc(xmm_result, xmm_temp4);
3223 
3224     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3225     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3226 
3227     __ cmpl(keylen, 44);
3228     __ jccb(Assembler::equal, L_doLast);
3229 
3230     __ aesenc(xmm_result, xmm_temp1);
3231     __ aesenc(xmm_result, xmm_temp2);
3232 
3233     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3234     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3235 
3236     __ cmpl(keylen, 52);
3237     __ jccb(Assembler::equal, L_doLast);
3238 
3239     __ aesenc(xmm_result, xmm_temp1);
3240     __ aesenc(xmm_result, xmm_temp2);
3241 
3242     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3243     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3244 
3245     __ BIND(L_doLast);
3246     __ aesenc(xmm_result, xmm_temp1);
3247     __ aesenclast(xmm_result, xmm_temp2);
3248     __ movdqu(Address(to, 0), xmm_result);        // store the result
3249     __ xorptr(rax, rax); // return 0
3250     __ leave(); // required for proper stackwalking of RuntimeStub frame
3251     __ ret(0);
3252 
3253     return start;
3254   }
3255 
3256 
3257   // Arguments:
3258   //
3259   // Inputs:
3260   //   c_rarg0   - source byte array address
3261   //   c_rarg1   - destination byte array address
3262   //   c_rarg2   - K (key) in little endian int array
3263   //
3264   address generate_aescrypt_decryptBlock() {
3265     assert(UseAES, "need AES instructions and misaligned SSE support");
3266     __ align(CodeEntryAlignment);
3267     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3268     Label L_doLast;
3269     address start = __ pc();
3270 
3271     const Register from        = c_rarg0;  // source array address
3272     const Register to          = c_rarg1;  // destination array address
3273     const Register key         = c_rarg2;  // key array address
3274     const Register keylen      = rax;
3275 
3276     const XMMRegister xmm_result = xmm0;
3277     const XMMRegister xmm_key_shuf_mask = xmm1;
3278     // On win64 xmm6-xmm15 must be preserved so don't use them.
3279     const XMMRegister xmm_temp1  = xmm2;
3280     const XMMRegister xmm_temp2  = xmm3;
3281     const XMMRegister xmm_temp3  = xmm4;
3282     const XMMRegister xmm_temp4  = xmm5;
3283 
3284     __ enter(); // required for proper stackwalking of RuntimeStub frame
3285 
3286     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3287     // context for the registers used, where all instructions below are using 128-bit mode
3288     // On EVEX without VL and BW, these instructions will all be AVX.
3289     if (VM_Version::supports_avx512vlbw()) {
3290       __ movl(rax, 0xffff);
3291       __ kmovql(k1, rax);
3292     }
3293 
3294     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3295     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3296 
3297     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3298     __ movdqu(xmm_result, Address(from, 0));
3299 
3300     // For decryption, the Java-expanded key ordering is rotated one position from what we want,
3301     // so we start at 0x10 here and hit 0x00 last.
3302     // We don't know whether the key is aligned, hence we avoid the load-execute form.
3303     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3304     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3305     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3306     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3307 
3308     __ pxor  (xmm_result, xmm_temp1);
3309     __ aesdec(xmm_result, xmm_temp2);
3310     __ aesdec(xmm_result, xmm_temp3);
3311     __ aesdec(xmm_result, xmm_temp4);
3312 
3313     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3314     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3315     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3316     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3317 
3318     __ aesdec(xmm_result, xmm_temp1);
3319     __ aesdec(xmm_result, xmm_temp2);
3320     __ aesdec(xmm_result, xmm_temp3);
3321     __ aesdec(xmm_result, xmm_temp4);
3322 
3323     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3324     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3325     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3326 
3327     __ cmpl(keylen, 44);
3328     __ jccb(Assembler::equal, L_doLast);
3329 
3330     __ aesdec(xmm_result, xmm_temp1);
3331     __ aesdec(xmm_result, xmm_temp2);
3332 
3333     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3334     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3335 
3336     __ cmpl(keylen, 52);
3337     __ jccb(Assembler::equal, L_doLast);
3338 
3339     __ aesdec(xmm_result, xmm_temp1);
3340     __ aesdec(xmm_result, xmm_temp2);
3341 
3342     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3343     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3344 
3345     __ BIND(L_doLast);
3346     __ aesdec(xmm_result, xmm_temp1);
3347     __ aesdec(xmm_result, xmm_temp2);
3348 
3349     // for decryption the aesdeclast operation is always on key+0x00
3350     __ aesdeclast(xmm_result, xmm_temp3);
3351     __ movdqu(Address(to, 0), xmm_result);  // store the result
3352     __ xorptr(rax, rax); // return 0
3353     __ leave(); // required for proper stackwalking of RuntimeStub frame
3354     __ ret(0);
3355 
3356     return start;
3357   }
3358 
3359 
3360   // Arguments:
3361   //
3362   // Inputs:
3363   //   c_rarg0   - source byte array address
3364   //   c_rarg1   - destination byte array address
3365   //   c_rarg2   - K (key) in little endian int array
3366   //   c_rarg3   - r vector byte array address
3367   //   c_rarg4   - input length
3368   //
3369   // Output:
3370   //   rax       - input length
3371   //
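       // CBC encryption is inherently serial: each ciphertext block feeds the next,
       // i.e. (illustrative)  C[0] = E_K(P[0] ^ IV),  C[i] = E_K(P[i] ^ C[i-1]),
       // which is why this stub handles one block per loop iteration (contrast with
       // the 4-way parallel decrypt stub further below).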
3372   address generate_cipherBlockChaining_encryptAESCrypt() {
3373     assert(UseAES, "need AES instructions and misaligned SSE support");
3374     __ align(CodeEntryAlignment);
3375     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3376     address start = __ pc();
3377 
3378     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3379     const Register from        = c_rarg0;  // source array address
3380     const Register to          = c_rarg1;  // destination array address
3381     const Register key         = c_rarg2;  // key array address
3382     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
3383                                            // and left holding the last ciphertext block on exit
3384 #ifndef _WIN64
3385     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3386 #else
3387     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
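         // (On Win64 the fifth argument is passed on the stack: after enter(),
         //  rbp+0 holds the saved rbp, rbp+8 the return address, rbp+16..+40 the
         //  four register-argument home slots, so the fifth argument sits at
         //  rbp + 6 * wordSize.)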
3388     const Register len_reg     = r11;      // pick the volatile windows register
3389 #endif
3390     const Register pos         = rax;
3391 
3392     // xmm register assignments for the loops below
3393     const XMMRegister xmm_result = xmm0;
3394     const XMMRegister xmm_temp   = xmm1;
3395     // keys 0-10 preloaded into xmm2-xmm12
3396     const int XMM_REG_NUM_KEY_FIRST = 2;
3397     const int XMM_REG_NUM_KEY_LAST  = 15;
3398     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3399     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3400     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3401     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3402     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3403 
3404     __ enter(); // required for proper stackwalking of RuntimeStub frame
3405 
3406     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3407     // context for the registers used, where all instructions below are using 128-bit mode
3408     // On EVEX without VL and BW, these instructions will all be AVX.
3409     if (VM_Version::supports_avx512vlbw()) {
3410       __ movl(rax, 0xffff);
3411       __ kmovql(k1, rax);
3412     }
3413 
3414 #ifdef _WIN64
3415     // on win64, fill len_reg from stack position
3416     __ movl(len_reg, len_mem);
3417 #else
3418     __ push(len_reg); // Save
3419 #endif
3420 
3421     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3422     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3423     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3424     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3425       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3426       offset += 0x10;
3427     }
3428     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3429 
3430     // Now branch on the key length (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
3431     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3432     __ cmpl(rax, 44);
3433     __ jcc(Assembler::notEqual, L_key_192_256);
3434 
3435     // 128 bit code follows here
3436     __ movptr(pos, 0);
3437     __ align(OptoLoopAlignment);
3438 
3439     __ BIND(L_loopTop_128);
3440     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3441     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3442     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3443     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3444       __ aesenc(xmm_result, as_XMMRegister(rnum));
3445     }
3446     __ aesenclast(xmm_result, xmm_key10);
3447     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3448     // no need to store r to memory until we exit
3449     __ addptr(pos, AESBlockSize);
3450     __ subptr(len_reg, AESBlockSize);
3451     __ jcc(Assembler::notEqual, L_loopTop_128);
3452 
3453     __ BIND(L_exit);
3454     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3455 
3456 #ifdef _WIN64
3457     __ movl(rax, len_mem);
3458 #else
3459     __ pop(rax); // return length
3460 #endif
3461     __ leave(); // required for proper stackwalking of RuntimeStub frame
3462     __ ret(0);
3463 
3464     __ BIND(L_key_192_256);
3465     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3466     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3467     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3468     __ cmpl(rax, 52);
3469     __ jcc(Assembler::notEqual, L_key_256);
3470 
3471     // 192-bit code follows here (could be changed to use more xmm registers)
3472     __ movptr(pos, 0);
3473     __ align(OptoLoopAlignment);
3474 
3475     __ BIND(L_loopTop_192);
3476     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3477     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3478     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3479     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3480       __ aesenc(xmm_result, as_XMMRegister(rnum));
3481     }
3482     __ aesenclast(xmm_result, xmm_key12);
3483     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3484     // no need to store r to memory until we exit
3485     __ addptr(pos, AESBlockSize);
3486     __ subptr(len_reg, AESBlockSize);
3487     __ jcc(Assembler::notEqual, L_loopTop_192);
3488     __ jmp(L_exit);
3489 
3490     __ BIND(L_key_256);
3491     // 256-bit code follows here (could be changed to use more xmm registers)
3492     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3493     __ movptr(pos, 0);
3494     __ align(OptoLoopAlignment);
3495 
3496     __ BIND(L_loopTop_256);
3497     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3498     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3499     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3500     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3501       __ aesenc(xmm_result, as_XMMRegister(rnum));
3502     }
3503     load_key(xmm_temp, key, 0xe0);
3504     __ aesenclast(xmm_result, xmm_temp);
3505     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3506     // no need to store r to memory until we exit
3507     __ addptr(pos, AESBlockSize);
3508     __ subptr(len_reg, AESBlockSize);
3509     __ jcc(Assembler::notEqual, L_loopTop_256);
3510     __ jmp(L_exit);
3511 
3512     return start;
3513   }
3514 
3515   // Safefetch stubs.
3516   void generate_safefetch(const char* name, int size, address* entry,
3517                           address* fault_pc, address* continuation_pc) {
3518     // safefetch signatures:
3519     //   int      SafeFetch32(int*      adr, int      errValue);
3520     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3521     //
3522     // arguments:
3523     //   c_rarg0 = adr
3524     //   c_rarg1 = errValue
3525     //
3526     // result:
3527     //   rax = *adr or errValue
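         //
         // If the load at fault_pc faults, the VM's signal handler resumes execution
         // at continuation_pc with c_rarg1 still holding errValue, so the stub
         // returns errValue instead of crashing.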
3528 
3529     StubCodeMark mark(this, "StubRoutines", name);
3530 
3531     // Entry point, pc or function descriptor.
3532     *entry = __ pc();
3533 
3534     // Load *adr into c_rarg1, may fault.
3535     *fault_pc = __ pc();
3536     switch (size) {
3537       case 4:
3538         // int32_t
3539         __ movl(c_rarg1, Address(c_rarg0, 0));
3540         break;
3541       case 8:
3542         // int64_t
3543         __ movq(c_rarg1, Address(c_rarg0, 0));
3544         break;
3545       default:
3546         ShouldNotReachHere();
3547     }
3548 
3549     // return errValue or *adr
3550     *continuation_pc = __ pc();
3551     __ movq(rax, c_rarg1);
3552     __ ret(0);
3553   }
3554 
3555   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3556   // to hide instruction latency
3557   //
3558   // Arguments:
3559   //
3560   // Inputs:
3561   //   c_rarg0   - source byte array address
3562   //   c_rarg1   - destination byte array address
3563   //   c_rarg2   - K (key) in little endian int array
3564   //   c_rarg3   - r vector byte array address
3565   //   c_rarg4   - input length
3566   //
3567   // Output:
3568   //   rax       - input length
3569   //
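       // Unlike encryption, CBC decryption has no serial dependence on its output:
       // (illustrative)  P[i] = D_K(C[i]) ^ C[i-1]   (with C[-1] = IV),
       // so four ciphertext blocks can be decrypted in flight before the xor step.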
3570   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3571     assert(UseAES, "need AES instructions and misaligned SSE support");
3572     __ align(CodeEntryAlignment);
3573     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3574     address start = __ pc();
3575 
3576     const Register from        = c_rarg0;  // source array address
3577     const Register to          = c_rarg1;  // destination array address
3578     const Register key         = c_rarg2;  // key array address
3579     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
3580                                            // and left holding the last cipher block on exit
3581 #ifndef _WIN64
3582     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3583 #else
3584     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3585     const Register len_reg     = r11;      // pick the volatile windows register
3586 #endif
3587     const Register pos         = rax;
3588 
3589     const int PARALLEL_FACTOR = 4;
3590     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3591 
3592     Label L_exit;
3593     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3594     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3595     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3596     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3597     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3598 
3599     // keys 0-10 preloaded into xmm5-xmm15
3600     const int XMM_REG_NUM_KEY_FIRST = 5;
3601     const int XMM_REG_NUM_KEY_LAST  = 15;
3602     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3603     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3604 
3605     __ enter(); // required for proper stackwalking of RuntimeStub frame
3606 
3607     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3608     // context for the registers used, where all instructions below are using 128-bit mode
3609     // On EVEX without VL and BW, these instructions will all be AVX.
3610     if (VM_Version::supports_avx512vlbw()) {
3611       __ movl(rax, 0xffff);
3612       __ kmovql(k1, rax);
3613     }
3614 
3615 #ifdef _WIN64
3616     // on win64, fill len_reg from stack position
3617     __ movl(len_reg, len_mem);
3618 #else
3619     __ push(len_reg); // Save
3620 #endif
3621     __ push(rbx);
3622     // The Java-expanded key ordering is rotated one position from what we want,
3623     // so we start at 0x10 here and hit 0x00 last.
3624     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3625     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3626     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3627     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3628       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3629       offset += 0x10;
3630     }
3631     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3632 
3633     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3634 
3635     // registers holding the four results in the parallelized loop
3636     const XMMRegister xmm_result0 = xmm0;
3637     const XMMRegister xmm_result1 = xmm2;
3638     const XMMRegister xmm_result2 = xmm3;
3639     const XMMRegister xmm_result3 = xmm4;
3640 
3641     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3642 
3643     __ xorptr(pos, pos);
3644 
3645     // Now branch on the key length (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
3646     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3647     __ cmpl(rbx, 52);
3648     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3649     __ cmpl(rbx, 60);
3650     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3651 
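         // DoFour issues one round instruction for each of the four in-flight blocks
         // so that their latencies overlap.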
3652 #define DoFour(opc, src_reg)           \
3653   __ opc(xmm_result0, src_reg);         \
3654   __ opc(xmm_result1, src_reg);         \
3655   __ opc(xmm_result2, src_reg);         \
3656   __ opc(xmm_result3, src_reg);         \
3657 
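         // Only 11 round keys fit in xmm5-xmm15; for 192- and 256-bit keys the extra
         // round keys are staged in stack slots (set up in the k == 1 / k == 2 blocks
         // below) and reloaded inside the multi-block loop.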
3658     for (int k = 0; k < 3; ++k) {
3659       __ BIND(L_multiBlock_loopTopHead[k]);
3660       if (k != 0) {
3661         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3662         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3663       }
3664       if (k == 1) {
3665         __ subptr(rsp, 6 * wordSize);
3666         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3667         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3668         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3669         load_key(xmm1, key, 0xc0);  // 0xc0;
3670         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3671       } else if (k == 2) {
3672         __ subptr(rsp, 10 * wordSize);
3673         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3674         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3675         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3676         load_key(xmm1, key, 0xe0);  // 0xe0;
3677         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3678         load_key(xmm15, key, 0xb0); // 0xb0;
3679         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3680         load_key(xmm1, key, 0xc0);  // 0xc0;
3681         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3682       }
3683       __ align(OptoLoopAlignment);
3684       __ BIND(L_multiBlock_loopTop[k]);
3685       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3686       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3687 
3688       if  (k != 0) {
3689         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3690         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3691       }
3692 
3693       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3694       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3695       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3696       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3697 
3698       DoFour(pxor, xmm_key_first);
3699       if (k == 0) {
3700         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3701           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3702         }
3703         DoFour(aesdeclast, xmm_key_last);
3704       } else if (k == 1) {
3705         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3706           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3707         }
3708         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3709         DoFour(aesdec, xmm1);  // key : 0xc0
3710         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3711         DoFour(aesdeclast, xmm_key_last);
3712       } else if (k == 2) {
3713         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3714           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3715         }
3716         DoFour(aesdec, xmm1);  // key : 0xc0
3717         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3718         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3719         DoFour(aesdec, xmm15);  // key : 0xd0
3720         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3721         DoFour(aesdec, xmm1);  // key : 0xe0
3722         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3723         DoFour(aesdeclast, xmm_key_last);
3724       }
3725 
3726       // for each result, xor with the r vector of previous cipher block
3727       __ pxor(xmm_result0, xmm_prev_block_cipher);
3728       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3729       __ pxor(xmm_result1, xmm_prev_block_cipher);
3730       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3731       __ pxor(xmm_result2, xmm_prev_block_cipher);
3732       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3733       __ pxor(xmm_result3, xmm_prev_block_cipher);
3734       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3735       if (k != 0) {
3736         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3737       }
3738 
3739       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3740       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3741       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3742       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3743 
3744       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3745       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3746       __ jmp(L_multiBlock_loopTop[k]);
3747 
3748       // registers used in the non-parallelized loops
3749       // xmm register assignments for the loops below
3750       const XMMRegister xmm_result = xmm0;
3751       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3752       const XMMRegister xmm_key11 = xmm3;
3753       const XMMRegister xmm_key12 = xmm4;
3754       const XMMRegister key_tmp = xmm4;
3755 
3756       __ BIND(L_singleBlock_loopTopHead[k]);
3757       if (k == 1) {
3758         __ addptr(rsp, 6 * wordSize);
3759       } else if (k == 2) {
3760         __ addptr(rsp, 10 * wordSize);
3761       }
3762       __ cmpptr(len_reg, 0); // any blocks left??
3763       __ jcc(Assembler::equal, L_exit);
3764       __ BIND(L_singleBlock_loopTopHead2[k]);
3765       if (k == 1) {
3766         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3767         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3768       }
3769       if (k == 2) {
3770         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3771       }
3772       __ align(OptoLoopAlignment);
3773       __ BIND(L_singleBlock_loopTop[k]);
3774       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3775       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3776       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3777       for (int rnum = 1; rnum <= 9 ; rnum++) {
3778           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3779       }
3780       if (k == 1) {
3781         __ aesdec(xmm_result, xmm_key11);
3782         __ aesdec(xmm_result, xmm_key12);
3783       }
3784       if (k == 2) {
3785         __ aesdec(xmm_result, xmm_key11);
3786         load_key(key_tmp, key, 0xc0);
3787         __ aesdec(xmm_result, key_tmp);
3788         load_key(key_tmp, key, 0xd0);
3789         __ aesdec(xmm_result, key_tmp);
3790         load_key(key_tmp, key, 0xe0);
3791         __ aesdec(xmm_result, key_tmp);
3792       }
3793 
3794       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3795       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3796       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3797       // no need to store r to memory until we exit
3798       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3799       __ addptr(pos, AESBlockSize);
3800       __ subptr(len_reg, AESBlockSize);
3801       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3802       if (k != 2) {
3803         __ jmp(L_exit);
3804       }
3805     } //for 128/192/256
3806 
3807     __ BIND(L_exit);
3808     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3809     __ pop(rbx);
3810 #ifdef _WIN64
3811     __ movl(rax, len_mem);
3812 #else
3813     __ pop(rax); // return length
3814 #endif
3815     __ leave(); // required for proper stackwalking of RuntimeStub frame
3816     __ ret(0);
3817     return start;
3818   }
3819 
3820   address generate_upper_word_mask() {
3821     __ align(64);
3822     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3823     address start = __ pc();
3824     __ emit_data64(0x0000000000000000, relocInfo::none);
3825     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3826     return start;
3827   }
3828 
3829   address generate_shuffle_byte_flip_mask() {
3830     __ align(64);
3831     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3832     address start = __ pc();
3833     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3834     __ emit_data64(0x0001020304050607, relocInfo::none);
3835     return start;
3836   }
3837 
3838   // ofs and limit are used for the multi-block byte array case.
3839   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3840   address generate_sha1_implCompress(bool multi_block, const char *name) {
3841     __ align(CodeEntryAlignment);
3842     StubCodeMark mark(this, "StubRoutines", name);
3843     address start = __ pc();
3844 
3845     Register buf = c_rarg0;
3846     Register state = c_rarg1;
3847     Register ofs = c_rarg2;
3848     Register limit = c_rarg3;
3849 
3850     const XMMRegister abcd = xmm0;
3851     const XMMRegister e0 = xmm1;
3852     const XMMRegister e1 = xmm2;
3853     const XMMRegister msg0 = xmm3;
3854 
3855     const XMMRegister msg1 = xmm4;
3856     const XMMRegister msg2 = xmm5;
3857     const XMMRegister msg3 = xmm6;
3858     const XMMRegister shuf_mask = xmm7;
3859 
3860     __ enter();
3861 
3862     __ subptr(rsp, 4 * wordSize);
3863 
3864     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3865       buf, state, ofs, limit, rsp, multi_block);
3866 
3867     __ addptr(rsp, 4 * wordSize);
3868 
3869     __ leave();
3870     __ ret(0);
3871     return start;
3872   }
3873 
3874   address generate_pshuffle_byte_flip_mask() {
3875     __ align(64);
3876     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3877     address start = __ pc();
3878     __ emit_data64(0x0405060700010203, relocInfo::none);
3879     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3880 
3881     if (VM_Version::supports_avx2()) {
3882       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3883       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3884       // _SHUF_00BA
3885       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3886       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3887       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3888       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3889       // _SHUF_DC00
3890       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3891       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3892       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3893       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3894     }
3895 
3896     return start;
3897   }
3898 
3899   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3900   address generate_pshuffle_byte_flip_mask_sha512() {
3901     __ align(32);
3902     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3903     address start = __ pc();
3904     if (VM_Version::supports_avx2()) {
3905       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3906       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3907       __ emit_data64(0x1011121314151617, relocInfo::none);
3908       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3909       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3910       __ emit_data64(0x0000000000000000, relocInfo::none);
3911       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3912       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3913     }
3914 
3915     return start;
3916   }
3917 
3918   // ofs and limit are used for the multi-block byte array case.
3919   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3920   address generate_sha256_implCompress(bool multi_block, const char *name) {
3921     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3922     __ align(CodeEntryAlignment);
3923     StubCodeMark mark(this, "StubRoutines", name);
3924     address start = __ pc();
3925 
3926     Register buf = c_rarg0;
3927     Register state = c_rarg1;
3928     Register ofs = c_rarg2;
3929     Register limit = c_rarg3;
3930 
3931     const XMMRegister msg = xmm0;
3932     const XMMRegister state0 = xmm1;
3933     const XMMRegister state1 = xmm2;
3934     const XMMRegister msgtmp0 = xmm3;
3935 
3936     const XMMRegister msgtmp1 = xmm4;
3937     const XMMRegister msgtmp2 = xmm5;
3938     const XMMRegister msgtmp3 = xmm6;
3939     const XMMRegister msgtmp4 = xmm7;
3940 
3941     const XMMRegister shuf_mask = xmm8;
3942 
3943     __ enter();
3944 
3945     __ subptr(rsp, 4 * wordSize);
3946 
3947     if (VM_Version::supports_sha()) {
3948       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3949         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3950     } else if (VM_Version::supports_avx2()) {
3951       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3952         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3953     }
3954     __ addptr(rsp, 4 * wordSize);
3955     __ vzeroupper();
3956     __ leave();
3957     __ ret(0);
3958     return start;
3959   }
3960 
3961   address generate_sha512_implCompress(bool multi_block, const char *name) {
3962     assert(VM_Version::supports_avx2(), "");
3963     assert(VM_Version::supports_bmi2(), "");
3964     __ align(CodeEntryAlignment);
3965     StubCodeMark mark(this, "StubRoutines", name);
3966     address start = __ pc();
3967 
3968     Register buf = c_rarg0;
3969     Register state = c_rarg1;
3970     Register ofs = c_rarg2;
3971     Register limit = c_rarg3;
3972 
3973     const XMMRegister msg = xmm0;
3974     const XMMRegister state0 = xmm1;
3975     const XMMRegister state1 = xmm2;
3976     const XMMRegister msgtmp0 = xmm3;
3977     const XMMRegister msgtmp1 = xmm4;
3978     const XMMRegister msgtmp2 = xmm5;
3979     const XMMRegister msgtmp3 = xmm6;
3980     const XMMRegister msgtmp4 = xmm7;
3981 
3982     const XMMRegister shuf_mask = xmm8;
3983 
3984     __ enter();
3985 
3986     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3987     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3988 
3989     __ vzeroupper();
3990     __ leave();
3991     __ ret(0);
3992     return start;
3993   }
3994 
3995   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3996   // to hide instruction latency
3997   //
3998   // Arguments:
3999   //
4000   // Inputs:
4001   //   c_rarg0   - source byte array address
4002   //   c_rarg1   - destination byte array address
4003   //   c_rarg2   - K (key) in little endian int array
4004   //   c_rarg3   - counter vector byte array address
4005   //   Linux
4006   //     c_rarg4   -          input length
4007   //     c_rarg5   -          saved encryptedCounter start
4008   //     rbp + 6 * wordSize - saved used length
4009   //   Windows
4010   //     rbp + 6 * wordSize - input length
4011   //     rbp + 7 * wordSize - saved encryptedCounter start
4012   //     rbp + 8 * wordSize - saved used length
4013   //
4014   // Output:
4015   //   rax       - input length
4016   //
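       // CTR mode turns AES into a stream cipher:
       // (illustrative)  keystream[i] = E_K(counter + i),  out[i] = in[i] ^ keystream[i].
       // Counter blocks are independent, which is what permits the 6-way parallel loop
       // below; a pre-loop first consumes any bytes left over from a counter block that
       // was only partially used by the previous invocation.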
4017   address generate_counterMode_AESCrypt_Parallel() {
4018     assert(UseAES, "need AES instructions and misaligned SSE support");
4019     __ align(CodeEntryAlignment);
4020     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4021     address start = __ pc();
4022     const Register from = c_rarg0; // source array address
4023     const Register to = c_rarg1; // destination array address
4024     const Register key = c_rarg2; // key array address
4025     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4026                                       // and updated with the incremented counter in the end
4027 #ifndef _WIN64
4028     const Register len_reg = c_rarg4;
4029     const Register saved_encCounter_start = c_rarg5;
4030     const Register used_addr = r10;
4031     const Address  used_mem(rbp, 2 * wordSize);
4032     const Register used = r11;
4033 #else
4034     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4035     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
4036     const Address  used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
4037     const Register len_reg = r10; // pick the first volatile windows register
4038     const Register saved_encCounter_start = r11;
4039     const Register used_addr = r13;
4040     const Register used = r14;
4041 #endif
4042     const Register pos = rax;
4043 
4044     const int PARALLEL_FACTOR = 6;
4045     const XMMRegister xmm_counter_shuf_mask = xmm0;
4046     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4047     const XMMRegister xmm_curr_counter = xmm2;
4048 
4049     const XMMRegister xmm_key_tmp0 = xmm3;
4050     const XMMRegister xmm_key_tmp1 = xmm4;
4051 
4052     // registers holding the four results in the parallelized loop
4053     const XMMRegister xmm_result0 = xmm5;
4054     const XMMRegister xmm_result1 = xmm6;
4055     const XMMRegister xmm_result2 = xmm7;
4056     const XMMRegister xmm_result3 = xmm8;
4057     const XMMRegister xmm_result4 = xmm9;
4058     const XMMRegister xmm_result5 = xmm10;
4059 
4060     const XMMRegister xmm_from0 = xmm11;
4061     const XMMRegister xmm_from1 = xmm12;
4062     const XMMRegister xmm_from2 = xmm13;
4063     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; it must be preserved on Win64
4064     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4: xmm_key_tmp0/1 are not needed while the input text is loaded
4065     const XMMRegister xmm_from5 = xmm4;
4066 
4067     //for key_128, key_192, key_256
4068     const int rounds[3] = {10, 12, 14};
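         // An expanded key schedule holds (rounds + 1) * 4 ints, i.e. 44/52/60 ints for
         // AES-128/192/256; the key-length check below uses this to select the code path.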
4069     Label L_exit_preLoop, L_preLoop_start;
4070     Label L_multiBlock_loopTop[3];
4071     Label L_singleBlockLoopTop[3];
4072     Label L__incCounter[3][6]; //for 6 blocks
4073     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4074     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4075     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4076 
4077     Label L_exit;
4078 
4079     __ enter(); // required for proper stackwalking of RuntimeStub frame
4080 
4081     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4082     // context for the registers used, where all instructions below are using 128-bit mode
4083     // On EVEX without VL and BW, these instructions will all be AVX.
4084     if (VM_Version::supports_avx512vlbw()) {
4085         __ movl(rax, 0xffff);
4086         __ kmovql(k1, rax);
4087     }
4088 
4089 #ifdef _WIN64
4090     // allocate spill slots for r13, r14
4091     enum {
4092         saved_r13_offset,
4093         saved_r14_offset
4094     };
4095     __ subptr(rsp, 2 * wordSize);
4096     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4097     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4098 
4099     // on win64, fill len_reg from stack position
4100     __ movl(len_reg, len_mem);
4101     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4102     __ movptr(used_addr, used_mem);
4103     __ movl(used, Address(used_addr, 0));
4104 #else
4105     __ push(len_reg); // Save
4106     __ movptr(used_addr, used_mem);
4107     __ movl(used, Address(used_addr, 0));
4108 #endif
4109 
4110     __ push(rbx); // Save RBX
4111     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4112     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4113     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4114     __ movptr(pos, 0);
4115 
4116     // Use the partially used encrypted counter from last invocation
4117     __ BIND(L_preLoop_start);
4118     __ cmpptr(used, 16);
4119     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4120       __ cmpptr(len_reg, 0);
4121       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4122       __ movb(rbx, Address(saved_encCounter_start, used));
4123       __ xorb(rbx, Address(from, pos));
4124       __ movb(Address(to, pos), rbx);
4125       __ addptr(pos, 1);
4126       __ addptr(used, 1);
4127       __ subptr(len_reg, 1);
4128 
4129     __ jmp(L_preLoop_start);
4130 
4131     __ BIND(L_exit_preLoop);
4132     __ movl(Address(used_addr, 0), used);
4133 
4134     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4135     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4136     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4137     __ cmpl(rbx, 52);
4138     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4139     __ cmpl(rbx, 60);
4140     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4141 
4142 #define CTR_DoSix(opc, src_reg)                \
4143     __ opc(xmm_result0, src_reg);              \
4144     __ opc(xmm_result1, src_reg);              \
4145     __ opc(xmm_result2, src_reg);              \
4146     __ opc(xmm_result3, src_reg);              \
4147     __ opc(xmm_result4, src_reg);              \
4148     __ opc(xmm_result5, src_reg);
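         // CTR_DoSix applies one instruction to all six block registers, keeping six
         // independent AES streams in flight so that aesenc latency is hidden across blocks.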
4149 
4150     // k == 0 :  generate code for key_128
4151     // k == 1 :  generate code for key_192
4152     // k == 2 :  generate code for key_256
4153     for (int k = 0; k < 3; ++k) {
4154       // multi-block loop starts here
4155       __ align(OptoLoopAlignment);
4156       __ BIND(L_multiBlock_loopTop[k]);
4157       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4158       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4159       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4160 
4161       //load, then increase counters
4162       CTR_DoSix(movdqa, xmm_curr_counter);
4163       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4164       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4165       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4166       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4167       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4168       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
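           // xmm_result0 keeps the current counter value; results 1..5 are advanced by
           // 1..5 blocks and xmm_curr_counter by 6, ready for the next loop iteration.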
4169       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
4170       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4171 
4172       //load two ROUND_KEYs at a time
4173       for (int i = 1; i < rounds[k]; ) {
4174         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4175         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4176         CTR_DoSix(aesenc, xmm_key_tmp1);
4177         i++;
4178         if (i != rounds[k]) {
4179           CTR_DoSix(aesenc, xmm_key_tmp0);
4180         } else {
4181           CTR_DoSix(aesenclast, xmm_key_tmp0);
4182         }
4183         i++;
4184       }
4185 
4186       // get next PARALLEL_FACTOR blocks into xmm_from registers
4187       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4188       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4189       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4190       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4191       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4192       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4193 
4194       __ pxor(xmm_result0, xmm_from0);
4195       __ pxor(xmm_result1, xmm_from1);
4196       __ pxor(xmm_result2, xmm_from2);
4197       __ pxor(xmm_result3, xmm_from3);
4198       __ pxor(xmm_result4, xmm_from4);
4199       __ pxor(xmm_result5, xmm_from5);
4200 
4201       // store 6 results into the next 96 bytes of output
4202       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4203       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4204       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4205       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4206       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4207       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4208 
4209       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the processed blocks
4210       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4211       __ jmp(L_multiBlock_loopTop[k]);
4212 
4213       // singleBlock starts here
4214       __ align(OptoLoopAlignment);
4215       __ BIND(L_singleBlockLoopTop[k]);
4216       __ cmpptr(len_reg, 0);
4217       __ jcc(Assembler::lessEqual, L_exit);
4218       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4219       __ movdqa(xmm_result0, xmm_curr_counter);
4220       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4221       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4222       __ pxor(xmm_result0, xmm_key_tmp0);
4223       for (int i = 1; i < rounds[k]; i++) {
4224         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4225         __ aesenc(xmm_result0, xmm_key_tmp0);
4226       }
4227       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4228       __ aesenclast(xmm_result0, xmm_key_tmp0);
4229       __ cmpptr(len_reg, AESBlockSize);
4230       __ jcc(Assembler::less, L_processTail_insr[k]);
4231         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4232         __ pxor(xmm_result0, xmm_from0);
4233         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4234         __ addptr(pos, AESBlockSize);
4235         __ subptr(len_reg, AESBlockSize);
4236         __ jmp(L_singleBlockLoopTop[k]);
4237       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4238         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4239         __ testptr(len_reg, 8);
4240         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4241           __ subptr(pos,8);
4242           __ pinsrq(xmm_from0, Address(from, pos), 0);
4243         __ BIND(L_processTail_4_insr[k]);
4244         __ testptr(len_reg, 4);
4245         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4246           __ subptr(pos,4);
4247           __ pslldq(xmm_from0, 4);
4248           __ pinsrd(xmm_from0, Address(from, pos), 0);
4249         __ BIND(L_processTail_2_insr[k]);
4250         __ testptr(len_reg, 2);
4251         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4252           __ subptr(pos, 2);
4253           __ pslldq(xmm_from0, 2);
4254           __ pinsrw(xmm_from0, Address(from, pos), 0);
4255         __ BIND(L_processTail_1_insr[k]);
4256         __ testptr(len_reg, 1);
4257         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4258           __ subptr(pos, 1);
4259           __ pslldq(xmm_from0, 1);
4260           __ pinsrb(xmm_from0, Address(from, pos), 0);
4261         __ BIND(L_processTail_exit_insr[k]);
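             // xmm_from0 now holds the remaining (< 16) input bytes in its low lanes,
             // assembled back-to-front by the pinsr*/pslldq sequence above.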
4262 
4263         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4264         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4265 
4266         __ testptr(len_reg, 8);
4267         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4268           __ pextrq(Address(to, pos), xmm_result0, 0);
4269           __ psrldq(xmm_result0, 8);
4270           __ addptr(pos, 8);
4271         __ BIND(L_processTail_4_extr[k]);
4272         __ testptr(len_reg, 4);
4273         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4274           __ pextrd(Address(to, pos), xmm_result0, 0);
4275           __ psrldq(xmm_result0, 4);
4276           __ addptr(pos, 4);
4277         __ BIND(L_processTail_2_extr[k]);
4278         __ testptr(len_reg, 2);
4279         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4280           __ pextrw(Address(to, pos), xmm_result0, 0);
4281           __ psrldq(xmm_result0, 2);
4282           __ addptr(pos, 2);
4283         __ BIND(L_processTail_1_extr[k]);
4284         __ testptr(len_reg, 1);
4285         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4286           __ pextrb(Address(to, pos), xmm_result0, 0);
4287 
4288         __ BIND(L_processTail_exit_extr[k]);
4289         __ movl(Address(used_addr, 0), len_reg);
4290         __ jmp(L_exit);
4291 
4292     }
4293 
4294     __ BIND(L_exit);
4295     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4296     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4297     __ pop(rbx); // pop the saved RBX.
4298 #ifdef _WIN64
4299     __ movl(rax, len_mem);
4300     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4301     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4302     __ addptr(rsp, 2 * wordSize);
4303 #else
4304     __ pop(rax); // return 'len'
4305 #endif
4306     __ leave(); // required for proper stackwalking of RuntimeStub frame
4307     __ ret(0);
4308     return start;
4309   }
4310 
4311   // byte swap x86 long
4312   address generate_ghash_long_swap_mask() {
4313     __ align(CodeEntryAlignment);
4314     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4315     address start = __ pc();
4316     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4317     __ emit_data64(0x0706050403020100, relocInfo::none );
4318     return start;
4319   }
4320 
4321   // byte swap x86 byte array
4322   address generate_ghash_byte_swap_mask() {
4323     __ align(CodeEntryAlignment);
4324     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4325     address start = __ pc();
4326     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4327     __ emit_data64(0x0001020304050607, relocInfo::none );
4328     return start;
4329   }
4330 
4331   /* Single and multi-block ghash operations */
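       //
       // GHASH multiplies the running state by the hash subkey H in GF(2^128), defined by
       // the polynomial x^128 + x^7 + x^2 + x + 1, after XORing in each 16-byte data block:
       // state = (state ^ data) * H. GCM uses a bit-reflected representation, which is why
       // the product is shifted left by one bit before the reduction below.
       //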
4332   address generate_ghash_processBlocks() {
4333     __ align(CodeEntryAlignment);
4334     Label L_ghash_loop, L_exit;
4335     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4336     address start = __ pc();
4337 
4338     const Register state        = c_rarg0;
4339     const Register subkeyH      = c_rarg1;
4340     const Register data         = c_rarg2;
4341     const Register blocks       = c_rarg3;
4342 
4343     const XMMRegister xmm_temp0 = xmm0;
4344     const XMMRegister xmm_temp1 = xmm1;
4345     const XMMRegister xmm_temp2 = xmm2;
4346     const XMMRegister xmm_temp3 = xmm3;
4347     const XMMRegister xmm_temp4 = xmm4;
4348     const XMMRegister xmm_temp5 = xmm5;
4349     const XMMRegister xmm_temp6 = xmm6;
4350     const XMMRegister xmm_temp7 = xmm7;
4351     const XMMRegister xmm_temp8 = xmm8;
4352     const XMMRegister xmm_temp9 = xmm9;
4353     const XMMRegister xmm_temp10 = xmm10;
4354 
4355     __ enter();
4356 
4357     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4358     // context for the registers used, where all instructions below are using 128-bit mode
4359     // On EVEX without VL and BW, these instructions will all be AVX.
4360     if (VM_Version::supports_avx512vlbw()) {
4361       __ movl(rax, 0xffff);
4362       __ kmovql(k1, rax);
4363     }
4364 
4365     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4366 
4367     __ movdqu(xmm_temp0, Address(state, 0));
4368     __ pshufb(xmm_temp0, xmm_temp10);
4369 
4370 
4371     __ BIND(L_ghash_loop);
4372     __ movdqu(xmm_temp2, Address(data, 0));
4373     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4374 
4375     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4376     __ pshufb(xmm_temp1, xmm_temp10);
4377 
4378     __ pxor(xmm_temp0, xmm_temp2);
4379 
4380     //
4381     // Multiply with the hash key
4382     //
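         // The 128x128-bit carry-less product is built schoolbook-style from four 64x64
         // pclmulqdq products (a0*b0, a0*b1, a1*b0, a1*b1, where a0/a1 and b0/b1 are the
         // low/high 64-bit halves of the state and the hash key) and recombined below.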
4383     __ movdqu(xmm_temp3, xmm_temp0);
4384     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4385     __ movdqu(xmm_temp4, xmm_temp0);
4386     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4387 
4388     __ movdqu(xmm_temp5, xmm_temp0);
4389     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4390     __ movdqu(xmm_temp6, xmm_temp0);
4391     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4392 
4393     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4394 
4395     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4396     __ psrldq(xmm_temp4, 8);    // shift xmm4 64 bits to the right
4397     __ pslldq(xmm_temp5, 8);    // shift xmm5 64 bits to the left
4398     __ pxor(xmm_temp3, xmm_temp5);
4399     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4400                                         // of the carry-less multiplication of
4401                                         // xmm0 by xmm1.
4402 
4403     // We shift the result of the multiplication by one bit position
4404     // to the left to cope with the fact that the bits are reversed.
4405     __ movdqu(xmm_temp7, xmm_temp3);
4406     __ movdqu(xmm_temp8, xmm_temp6);
4407     __ pslld(xmm_temp3, 1);
4408     __ pslld(xmm_temp6, 1);
4409     __ psrld(xmm_temp7, 31);
4410     __ psrld(xmm_temp8, 31);
4411     __ movdqu(xmm_temp9, xmm_temp7);
4412     __ pslldq(xmm_temp8, 4);
4413     __ pslldq(xmm_temp7, 4);
4414     __ psrldq(xmm_temp9, 12);
4415     __ por(xmm_temp3, xmm_temp7);
4416     __ por(xmm_temp6, xmm_temp8);
4417     __ por(xmm_temp6, xmm_temp9);
4418 
4419     //
4420     // First phase of the reduction
4421     //
4422     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4423     // independently.
4424     __ movdqu(xmm_temp7, xmm_temp3);
4425     __ movdqu(xmm_temp8, xmm_temp3);
4426     __ movdqu(xmm_temp9, xmm_temp3);
4427     __ pslld(xmm_temp7, 31);    // packed left shift, << 31
4428     __ pslld(xmm_temp8, 30);    // packed left shift, << 30
4429     __ pslld(xmm_temp9, 25);    // packed left shift, << 25
4430     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4431     __ pxor(xmm_temp7, xmm_temp9);
4432     __ movdqu(xmm_temp8, xmm_temp7);
4433     __ pslldq(xmm_temp7, 12);
4434     __ psrldq(xmm_temp8, 4);
4435     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4436 
4437     //
4438     // Second phase of the reduction
4439     //
4440     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4441     // shift operations.
4442     __ movdqu(xmm_temp2, xmm_temp3);
4443     __ movdqu(xmm_temp4, xmm_temp3);
4444     __ movdqu(xmm_temp5, xmm_temp3);
4445     __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
4446     __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
4447     __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
4448     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4449     __ pxor(xmm_temp2, xmm_temp5);
4450     __ pxor(xmm_temp2, xmm_temp8);
4451     __ pxor(xmm_temp3, xmm_temp2);
4452     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4453 
4454     __ decrement(blocks);
4455     __ jcc(Assembler::zero, L_exit);
4456     __ movdqu(xmm_temp0, xmm_temp6);
4457     __ addptr(data, 16);
4458     __ jmp(L_ghash_loop);
4459 
4460     __ BIND(L_exit);
4461     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4462     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4463     __ leave();
4464     __ ret(0);
4465     return start;
4466   }
4467 
4468   /**
4469    *  Arguments:
4470    *
4471    * Inputs:
4472    *   c_rarg0   - int crc
4473    *   c_rarg1   - byte* buf
4474    *   c_rarg2   - int length
4475    *
4476    * Output:
4477    *       rax   - int crc result
4478    */
4479   address generate_updateBytesCRC32() {
4480     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4481 
4482     __ align(CodeEntryAlignment);
4483     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4484 
4485     address start = __ pc();
4486     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4487     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4488     // rscratch1: r10
4489     const Register crc   = c_rarg0;  // crc
4490     const Register buf   = c_rarg1;  // source java byte array address
4491     const Register len   = c_rarg2;  // length
4492     const Register table = c_rarg3;  // crc_table address (reuse register)
4493     const Register tmp   = r11;
4494     assert_different_registers(crc, buf, len, table, tmp, rax);
4495 
4496     BLOCK_COMMENT("Entry:");
4497     __ enter(); // required for proper stackwalking of RuntimeStub frame
4498 
4499     __ kernel_crc32(crc, buf, len, table, tmp);
4500 
4501     __ movl(rax, crc);
4502     __ vzeroupper();
4503     __ leave(); // required for proper stackwalking of RuntimeStub frame
4504     __ ret(0);
4505 
4506     return start;
4507   }
4508 
4509   /**
4510   *  Arguments:
4511   *
4512   * Inputs:
4513   *   c_rarg0   - int crc
4514   *   c_rarg1   - byte* buf
4515   *   c_rarg2   - long length
4516   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4517   *              not used by x86 algorithm)
4518   *
4519   * Output:
4520   *       rax   - int crc result
4521   */
4522   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4523       assert(UseCRC32CIntrinsics, "need SSE4_2");
4524       __ align(CodeEntryAlignment);
4525       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4526       address start = __ pc();
4527       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
4528       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
4529       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
4530       const Register crc = c_rarg0;  // crc
4531       const Register buf = c_rarg1;  // source java byte array address
4532       const Register len = c_rarg2;  // length
4533       const Register a = rax;
4534       const Register j = r9;
4535       const Register k = r10;
4536       const Register l = r11;
4537 #ifdef _WIN64
4538       const Register y = rdi;
4539       const Register z = rsi;
4540 #else
4541       const Register y = rcx;
4542       const Register z = r8;
4543 #endif
4544       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4545 
4546       BLOCK_COMMENT("Entry:");
4547       __ enter(); // required for proper stackwalking of RuntimeStub frame
4548 #ifdef _WIN64
4549       __ push(y);
4550       __ push(z);
4551 #endif
4552       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4553                               a, j, k,
4554                               l, y, z,
4555                               c_farg0, c_farg1, c_farg2,
4556                               is_pclmulqdq_supported);
4557       __ movl(rax, crc);
4558 #ifdef _WIN64
4559       __ pop(z);
4560       __ pop(y);
4561 #endif
4562       __ vzeroupper();
4563       __ leave(); // required for proper stackwalking of RuntimeStub frame
4564       __ ret(0);
4565 
4566       return start;
4567   }
4568 
4569   /**
4570    *  Arguments:
4571    *
4572    *  Input:
4573    *    c_rarg0   - x address
4574    *    c_rarg1   - x length
4575    *    c_rarg2   - y address
4576    *    c_rarg3   - y length
4577    * not Win64
4578    *    c_rarg4   - z address
4579    *    c_rarg5   - z length
4580    * Win64
4581    *    rsp+40    - z address
4582    *    rsp+48    - z length
4583    */
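       // Multiplies the int[] magnitudes x (xlen ints) and y (ylen ints) into z (zlen ints),
       // backing the BigInteger.multiplyToLen intrinsic.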
4584   address generate_multiplyToLen() {
4585     __ align(CodeEntryAlignment);
4586     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4587 
4588     address start = __ pc();
4589     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4590     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4591     const Register x     = rdi;
4592     const Register xlen  = rax;
4593     const Register y     = rsi;
4594     const Register ylen  = rcx;
4595     const Register z     = r8;
4596     const Register zlen  = r11;
4597 
4598     // Next registers will be saved on stack in multiply_to_len().
4599     const Register tmp1  = r12;
4600     const Register tmp2  = r13;
4601     const Register tmp3  = r14;
4602     const Register tmp4  = r15;
4603     const Register tmp5  = rbx;
4604 
4605     BLOCK_COMMENT("Entry:");
4606     __ enter(); // required for proper stackwalking of RuntimeStub frame
4607 
4608 #ifndef _WIN64
4609     __ movptr(zlen, r9); // Save r9 in r11 - zlen
4610 #endif
4611     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
4612                        // ylen => rcx, z => r8, zlen => r11
4613                        // r9 and r10 may be used to save non-volatile registers
4614 #ifdef _WIN64
4615     // last 2 arguments (#4, #5) are on stack on Win64
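         // (they sit above the return address, saved rbp and the 32-byte register
         //  parameter area, hence rsp + 6 * wordSize and rsp + 7 * wordSize after enter())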
4616     __ movptr(z, Address(rsp, 6 * wordSize));
4617     __ movptr(zlen, Address(rsp, 7 * wordSize));
4618 #endif
4619 
4620     __ movptr(xlen, rsi);
4621     __ movptr(y,    rdx);
4622     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
4623 
4624     restore_arg_regs();
4625 
4626     __ leave(); // required for proper stackwalking of RuntimeStub frame
4627     __ ret(0);
4628 
4629     return start;
4630   }
4631 
4632   /**
4633   *  Arguments:
4634   *
4635   *  Input:
4636   *    c_rarg0   - obja     address
4637   *    c_rarg1   - objb     address
4638   *    c_rarg2   - length   length
4639   *    c_rarg3   - scale    log2 of the array index scale
4640   *
4641   *  Output:
4642   *        rax   - int; >= 0: mismatched index, < 0: bitwise complement of tail
4643   */
4644   address generate_vectorizedMismatch() {
4645     __ align(CodeEntryAlignment);
4646     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
4647     address start = __ pc();
4648 
4649     BLOCK_COMMENT("Entry:");
4650     __ enter();
4651 
4652 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4653     const Register scale = c_rarg0;  //rcx, will exchange with r9
4654     const Register objb = c_rarg1;   //rdx
4655     const Register length = c_rarg2; //r8
4656     const Register obja = c_rarg3;   //r9
4657     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4658 
4659     const Register tmp1 = r10;
4660     const Register tmp2 = r11;
4661 #endif
4662 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4663     const Register obja = c_rarg0;   //U:rdi
4664     const Register objb = c_rarg1;   //U:rsi
4665     const Register length = c_rarg2; //U:rdx
4666     const Register scale = c_rarg3;  //U:rcx
4667     const Register tmp1 = r8;
4668     const Register tmp2 = r9;
4669 #endif
4670     const Register result = rax; //return value
4671     const XMMRegister vec0 = xmm0;
4672     const XMMRegister vec1 = xmm1;
4673     const XMMRegister vec2 = xmm2;
4674 
4675     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4676 
4677     __ vzeroupper();
4678     __ leave();
4679     __ ret(0);
4680 
4681     return start;
4682   }
4683 
4684 /**
4685    *  Arguments:
4686    *
4687    *  Input:
4688    *    c_rarg0   - x address
4689    *    c_rarg1   - x length
4690    *    c_rarg2   - z address
4691    *    c_rarg3   - z length
4692    *
4693    */
4694   address generate_squareToLen() {
4695 
4696     __ align(CodeEntryAlignment);
4697     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4698 
4699     address start = __ pc();
4700     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4701     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
4702     const Register x      = rdi;
4703     const Register len    = rsi;
4704     const Register z      = r8;
4705     const Register zlen   = rcx;
4706 
4707     const Register tmp1      = r12;
4708     const Register tmp2      = r13;
4709     const Register tmp3      = r14;
4710     const Register tmp4      = r15;
4711     const Register tmp5      = rbx;
4712 
4713     BLOCK_COMMENT("Entry:");
4714     __ enter(); // required for proper stackwalking of RuntimeStub frame
4715 
4716     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
4717                        // zlen => rcx
4718                        // r9 and r10 may be used to save non-volatile registers
4719     __ movptr(r8, rdx);
4720     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4721 
4722     restore_arg_regs();
4723 
4724     __ leave(); // required for proper stackwalking of RuntimeStub frame
4725     __ ret(0);
4726 
4727     return start;
4728   }
4729 
4730    /**
4731    *  Arguments:
4732    *
4733    *  Input:
4734    *    c_rarg0   - out address
4735    *    c_rarg1   - in address
4736    *    c_rarg2   - offset
4737    *    c_rarg3   - len
4738    * not Win64
4739    *    c_rarg4   - k
4740    * Win64
4741    *    rsp+40    - k
4742    */
4743   address generate_mulAdd() {
4744     __ align(CodeEntryAlignment);
4745     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4746 
4747     address start = __ pc();
4748     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4749     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4750     const Register out     = rdi;
4751     const Register in      = rsi;
4752     const Register offset  = r11;
4753     const Register len     = rcx;
4754     const Register k       = r8;
4755 
4756     // Next registers will be saved on stack in mul_add().
4757     const Register tmp1  = r12;
4758     const Register tmp2  = r13;
4759     const Register tmp3  = r14;
4760     const Register tmp4  = r15;
4761     const Register tmp5  = rbx;
4762 
4763     BLOCK_COMMENT("Entry:");
4764     __ enter(); // required for proper stackwalking of RuntimeStub frame
4765 
4766     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
4767                        // len => rcx, k => r8
4768                        // r9 and r10 may be used to save non-volatile registers
4769 #ifdef _WIN64
4770     // last argument is on stack on Win64
4771     __ movl(k, Address(rsp, 6 * wordSize));
4772 #endif
4773     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
4774     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4775 
4776     restore_arg_regs();
4777 
4778     __ leave(); // required for proper stackwalking of RuntimeStub frame
4779     __ ret(0);
4780 
4781     return start;
4782   }
4783 
4784   address generate_libmExp() {
4785     StubCodeMark mark(this, "StubRoutines", "libmExp");
4786 
4787     address start = __ pc();
4788 
4789     const XMMRegister x0  = xmm0;
4790     const XMMRegister x1  = xmm1;
4791     const XMMRegister x2  = xmm2;
4792     const XMMRegister x3  = xmm3;
4793 
4794     const XMMRegister x4  = xmm4;
4795     const XMMRegister x5  = xmm5;
4796     const XMMRegister x6  = xmm6;
4797     const XMMRegister x7  = xmm7;
4798 
4799     const Register tmp   = r11;
4800 
4801     BLOCK_COMMENT("Entry:");
4802     __ enter(); // required for proper stackwalking of RuntimeStub frame
4803 
4804     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4805 
4806     __ leave(); // required for proper stackwalking of RuntimeStub frame
4807     __ ret(0);
4808 
4809     return start;
4810 
4811   }
4812 
4813   address generate_libmLog() {
4814     StubCodeMark mark(this, "StubRoutines", "libmLog");
4815 
4816     address start = __ pc();
4817 
4818     const XMMRegister x0 = xmm0;
4819     const XMMRegister x1 = xmm1;
4820     const XMMRegister x2 = xmm2;
4821     const XMMRegister x3 = xmm3;
4822 
4823     const XMMRegister x4 = xmm4;
4824     const XMMRegister x5 = xmm5;
4825     const XMMRegister x6 = xmm6;
4826     const XMMRegister x7 = xmm7;
4827 
4828     const Register tmp1 = r11;
4829     const Register tmp2 = r8;
4830 
4831     BLOCK_COMMENT("Entry:");
4832     __ enter(); // required for proper stackwalking of RuntimeStub frame
4833 
4834     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4835 
4836     __ leave(); // required for proper stackwalking of RuntimeStub frame
4837     __ ret(0);
4838 
4839     return start;
4840 
4841   }
4842 
4843   address generate_libmLog10() {
4844     StubCodeMark mark(this, "StubRoutines", "libmLog10");
4845 
4846     address start = __ pc();
4847 
4848     const XMMRegister x0 = xmm0;
4849     const XMMRegister x1 = xmm1;
4850     const XMMRegister x2 = xmm2;
4851     const XMMRegister x3 = xmm3;
4852 
4853     const XMMRegister x4 = xmm4;
4854     const XMMRegister x5 = xmm5;
4855     const XMMRegister x6 = xmm6;
4856     const XMMRegister x7 = xmm7;
4857 
4858     const Register tmp = r11;
4859 
4860     BLOCK_COMMENT("Entry:");
4861     __ enter(); // required for proper stackwalking of RuntimeStub frame
4862 
4863     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4864 
4865     __ leave(); // required for proper stackwalking of RuntimeStub frame
4866     __ ret(0);
4867 
4868     return start;
4869 
4870   }
4871 
4872   address generate_libmPow() {
4873     StubCodeMark mark(this, "StubRoutines", "libmPow");
4874 
4875     address start = __ pc();
4876 
4877     const XMMRegister x0 = xmm0;
4878     const XMMRegister x1 = xmm1;
4879     const XMMRegister x2 = xmm2;
4880     const XMMRegister x3 = xmm3;
4881 
4882     const XMMRegister x4 = xmm4;
4883     const XMMRegister x5 = xmm5;
4884     const XMMRegister x6 = xmm6;
4885     const XMMRegister x7 = xmm7;
4886 
4887     const Register tmp1 = r8;
4888     const Register tmp2 = r9;
4889     const Register tmp3 = r10;
4890     const Register tmp4 = r11;
4891 
4892     BLOCK_COMMENT("Entry:");
4893     __ enter(); // required for proper stackwalking of RuntimeStub frame
4894 
4895     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4896 
4897     __ leave(); // required for proper stackwalking of RuntimeStub frame
4898     __ ret(0);
4899 
4900     return start;
4901 
4902   }
4903 
4904   address generate_libmSin() {
4905     StubCodeMark mark(this, "StubRoutines", "libmSin");
4906 
4907     address start = __ pc();
4908 
4909     const XMMRegister x0 = xmm0;
4910     const XMMRegister x1 = xmm1;
4911     const XMMRegister x2 = xmm2;
4912     const XMMRegister x3 = xmm3;
4913 
4914     const XMMRegister x4 = xmm4;
4915     const XMMRegister x5 = xmm5;
4916     const XMMRegister x6 = xmm6;
4917     const XMMRegister x7 = xmm7;
4918 
4919     const Register tmp1 = r8;
4920     const Register tmp2 = r9;
4921     const Register tmp3 = r10;
4922     const Register tmp4 = r11;
4923 
4924     BLOCK_COMMENT("Entry:");
4925     __ enter(); // required for proper stackwalking of RuntimeStub frame
4926 
4927 #ifdef _WIN64
4928     __ push(rsi);
4929     __ push(rdi);
4930 #endif
4931     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4932 
4933 #ifdef _WIN64
4934     __ pop(rdi);
4935     __ pop(rsi);
4936 #endif
4937 
4938     __ leave(); // required for proper stackwalking of RuntimeStub frame
4939     __ ret(0);
4940 
4941     return start;
4942 
4943   }
4944 
4945   address generate_libmCos() {
4946     StubCodeMark mark(this, "StubRoutines", "libmCos");
4947 
4948     address start = __ pc();
4949 
4950     const XMMRegister x0 = xmm0;
4951     const XMMRegister x1 = xmm1;
4952     const XMMRegister x2 = xmm2;
4953     const XMMRegister x3 = xmm3;
4954 
4955     const XMMRegister x4 = xmm4;
4956     const XMMRegister x5 = xmm5;
4957     const XMMRegister x6 = xmm6;
4958     const XMMRegister x7 = xmm7;
4959 
4960     const Register tmp1 = r8;
4961     const Register tmp2 = r9;
4962     const Register tmp3 = r10;
4963     const Register tmp4 = r11;
4964 
4965     BLOCK_COMMENT("Entry:");
4966     __ enter(); // required for proper stackwalking of RuntimeStub frame
4967 
4968 #ifdef _WIN64
4969     __ push(rsi);
4970     __ push(rdi);
4971 #endif
4972     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4973 
4974 #ifdef _WIN64
4975     __ pop(rdi);
4976     __ pop(rsi);
4977 #endif
4978 
4979     __ leave(); // required for proper stackwalking of RuntimeStub frame
4980     __ ret(0);
4981 
4982     return start;
4983 
4984   }
4985 
4986   address generate_libmTan() {
4987     StubCodeMark mark(this, "StubRoutines", "libmTan");
4988 
4989     address start = __ pc();
4990 
4991     const XMMRegister x0 = xmm0;
4992     const XMMRegister x1 = xmm1;
4993     const XMMRegister x2 = xmm2;
4994     const XMMRegister x3 = xmm3;
4995 
4996     const XMMRegister x4 = xmm4;
4997     const XMMRegister x5 = xmm5;
4998     const XMMRegister x6 = xmm6;
4999     const XMMRegister x7 = xmm7;
5000 
5001     const Register tmp1 = r8;
5002     const Register tmp2 = r9;
5003     const Register tmp3 = r10;
5004     const Register tmp4 = r11;
5005 
5006     BLOCK_COMMENT("Entry:");
5007     __ enter(); // required for proper stackwalking of RuntimeStub frame
5008 
5009 #ifdef _WIN64
5010     __ push(rsi);
5011     __ push(rdi);
5012 #endif
5013     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5014 
5015 #ifdef _WIN64
5016     __ pop(rdi);
5017     __ pop(rsi);
5018 #endif
5019 
5020     __ leave(); // required for proper stackwalking of RuntimeStub frame
5021     __ ret(0);
5022 
5023     return start;
5024 
5025   }
5026 
5027 #undef __
5028 #define __ masm->
5029 
5030   // Continuation point for throwing of implicit exceptions that are
5031   // not handled in the current activation. Fabricates an exception
5032   // oop and initiates normal exception dispatching in this
5033   // frame. Since we need to preserve callee-saved values (currently
5034   // only for C2, but done for C1 as well) we need a callee-saved oop
5035   // map and therefore have to make these stubs into RuntimeStubs
5036   // rather than BufferBlobs.  If the compiler needs all registers to
5037   // be preserved between the fault point and the exception handler
5038   // then it must assume responsibility for that in
5039   // AbstractCompiler::continuation_for_implicit_null_exception or
5040   // continuation_for_implicit_division_by_zero_exception. All other
5041   // implicit exceptions (e.g., NullPointerException or
5042   // AbstractMethodError on entry) are either at call sites or
5043   // otherwise assume that stack unwinding will be initiated, so
5044   // caller saved registers were assumed volatile in the compiler.
5045   address generate_throw_exception(const char* name,
5046                                    address runtime_entry,
5047                                    Register arg1 = noreg,
5048                                    Register arg2 = noreg) {
5049     // Information about frame layout at time of blocking runtime call.
5050     // Note that we only have to preserve callee-saved registers since
5051     // the compilers are responsible for supplying a continuation point
5052     // if they expect all registers to be preserved.
5053     enum layout {
5054       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5055       rbp_off2,
5056       return_off,
5057       return_off2,
5058       framesize // inclusive of return address
5059     };
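         // Offsets are in 32-bit slots (arg_reg_save_area_bytes / BytesPerInt), so each
         // 64-bit value (saved rbp, return address) occupies two consecutive slots.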
5060 
5061     int insts_size = 512;
5062     int locs_size  = 64;
5063 
5064     CodeBuffer code(name, insts_size, locs_size);
5065     OopMapSet* oop_maps  = new OopMapSet();
5066     MacroAssembler* masm = new MacroAssembler(&code);
5067 
5068     address start = __ pc();
5069 
5070     // This is an inlined and slightly modified version of call_VM
5071     // which has the ability to fetch the return PC out of
5072     // thread-local storage and also sets up last_Java_sp slightly
5073     // differently than the real call_VM
5074 
5075     __ enter(); // required for proper stackwalking of RuntimeStub frame
5076 
5077     assert(is_even(framesize/2), "sp not 16-byte aligned");
5078 
5079     // return address and rbp are already in place
5080     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5081 
5082     int frame_complete = __ pc() - start;
5083 
5084     // Set up last_Java_sp and last_Java_fp
5085     address the_pc = __ pc();
5086     __ set_last_Java_frame(rsp, rbp, the_pc);
5087     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5088 
5089     // Call runtime
5090     if (arg1 != noreg) {
5091       assert(arg2 != c_rarg1, "clobbered");
5092       __ movptr(c_rarg1, arg1);
5093     }
5094     if (arg2 != noreg) {
5095       __ movptr(c_rarg2, arg2);
5096     }
5097     __ movptr(c_rarg0, r15_thread);
5098     BLOCK_COMMENT("call runtime_entry");
5099     __ call(RuntimeAddress(runtime_entry));
5100 
5101     // Generate oop map
5102     OopMap* map = new OopMap(framesize, 0);
5103 
5104     oop_maps->add_gc_map(the_pc - start, map);
5105 
5106     __ reset_last_Java_frame(true);
5107 
5108     __ leave(); // required for proper stackwalking of RuntimeStub frame
5109 
5110     // check for pending exceptions
5111 #ifdef ASSERT
5112     Label L;
5113     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5114             (int32_t) NULL_WORD);
5115     __ jcc(Assembler::notEqual, L);
5116     __ should_not_reach_here();
5117     __ bind(L);
5118 #endif // ASSERT
5119     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5120 
5121 
5122     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5123     RuntimeStub* stub =
5124       RuntimeStub::new_runtime_stub(name,
5125                                     &code,
5126                                     frame_complete,
5127                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5128                                     oop_maps, false);
5129     return stub->entry_point();
5130   }
5131 
5132   void create_control_words() {
5133     // Round to nearest, 53-bit mode, exceptions masked
5134     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
5135     // Round to zero, 53-bit mode, exceptions masked
5136     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5137     // Round to nearest, 24-bit mode, exceptions masked
5138     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
5139     // Round to nearest, 64-bit mode, exceptions masked
5140     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
5141     // Round to nearest, exceptions masked
5142     StubRoutines::_mxcsr_std           = 0x1F80;
5143     // Note: the following two constants are 80-bit values
5144     //       layout is critical for correct loading by FPU.
5145     // Bias for strict fp multiply/divide
5146     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5147     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5148     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5149     // Un-Bias for strict fp multiply/divide
5150     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5151     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5152     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5153   }
5154 
5155   // Initialization
5156   void generate_initial() {
5157     // Generates all stubs and initializes the entry points
5158 
5159     // These platform-specific settings are needed by generate_call_stub()
5160     create_control_words();
5161 
5162     // entry points that exist in all platforms. Note: This is code
5163     // that could be shared among different platforms - however the
5164     // benefit seems to be smaller than the disadvantage of having a
5165     // much more complicated generator structure. See also comment in
5166     // stubRoutines.hpp.
5167 
5168     StubRoutines::_forward_exception_entry = generate_forward_exception();
5169 
5170     StubRoutines::_call_stub_entry =
5171       generate_call_stub(StubRoutines::_call_stub_return_address);
5172 
5173     // is referenced by megamorphic call
5174     StubRoutines::_catch_exception_entry = generate_catch_exception();
5175 
5176     // atomic calls
5177     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5178     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5179     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5180     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5181     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5182     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5183     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5184     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5185 
5186     // platform dependent
5187     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5188     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5189 
5190     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5191 
5192     // Build this early so it's available for the interpreter.
5193     StubRoutines::_throw_StackOverflowError_entry =
5194       generate_throw_exception("StackOverflowError throw_exception",
5195                                CAST_FROM_FN_PTR(address,
5196                                                 SharedRuntime::
5197                                                 throw_StackOverflowError));
5198     StubRoutines::_throw_delayed_StackOverflowError_entry =
5199       generate_throw_exception("delayed StackOverflowError throw_exception",
5200                                CAST_FROM_FN_PTR(address,
5201                                                 SharedRuntime::
5202                                                 throw_delayed_StackOverflowError));
5203     if (UseCRC32Intrinsics) {
5204       // set table address before stub generation which use it
5205       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5206       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5207     }
5208 
5209     if (UseCRC32CIntrinsics) {
5210       bool supports_clmul = VM_Version::supports_clmul();
5211       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5212       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5213       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5214     }
5215     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5216       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5217           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5218           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5219         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5220         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5221         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5222         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5223         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5224         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5225         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5226         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5227         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5228         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5229         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5230         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5231         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5232         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5233       }
5234       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5235         StubRoutines::_dexp = generate_libmExp();
5236       }
5237       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5238         StubRoutines::_dlog = generate_libmLog();
5239       }
5240       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5241         StubRoutines::_dlog10 = generate_libmLog10();
5242       }
5243       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5244         StubRoutines::_dpow = generate_libmPow();
5245       }
5246       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5247         StubRoutines::_dsin = generate_libmSin();
5248       }
5249       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5250         StubRoutines::_dcos = generate_libmCos();
5251       }
5252       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5253         StubRoutines::_dtan = generate_libmTan();
5254       }
5255     }
5256   }
5257 
5258   void generate_all() {
5259     // Generates all stubs and initializes the entry points
5260 
5261     // These entry points require SharedInfo::stack0 to be set up in
5262     // non-core builds and need to be relocatable, so they each
5263     // fabricate a RuntimeStub internally.
5264     StubRoutines::_throw_AbstractMethodError_entry =
5265       generate_throw_exception("AbstractMethodError throw_exception",
5266                                CAST_FROM_FN_PTR(address,
5267                                                 SharedRuntime::
5268                                                 throw_AbstractMethodError));
5269 
5270     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5271       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5272                                CAST_FROM_FN_PTR(address,
5273                                                 SharedRuntime::
5274                                                 throw_IncompatibleClassChangeError));
5275 
5276     StubRoutines::_throw_NullPointerException_at_call_entry =
5277       generate_throw_exception("NullPointerException at call throw_exception",
5278                                CAST_FROM_FN_PTR(address,
5279                                                 SharedRuntime::
5280                                                 throw_NullPointerException_at_call));
5281 
5282     // entry points that are platform specific
5283     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5284     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5285     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5286     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5287 
5288     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5289     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5290     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5291     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5292 
5293     // support for verify_oop (must happen after universe_init)
5294     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5295 
5296     // arraycopy stubs used by compilers
5297     generate_arraycopy_stubs();
5298 
5299     // Load barrier stubs
5300     if (UseLoadBarrier) {
5301       address loadbarrier_address = CAST_FROM_FN_PTR(address, SharedRuntime::z_load_barrier_on_oop_field_preloaded);
5302       address loadbarrier_weak_address = CAST_FROM_FN_PTR(address, SharedRuntime::z_load_barrier_on_weak_oop_field_preloaded);
5303 
5304       Register rr = as_Register(0);
5305       for (int i = 0; i < RegisterImpl::number_of_registers; i++) {
5306         if (rr != rsp) {
5307           StubRoutines::x86::_load_barrier_slow_stub[i] = generate_load_barrier_stub(rr, loadbarrier_address, false);
5308           StubRoutines::x86::_load_barrier_weak_slow_stub[i] = generate_load_barrier_stub(rr, loadbarrier_weak_address, true);
5309 
5310         } else {
5311           StubRoutines::x86::_load_barrier_slow_stub[i] = (address)NULL;
5312           StubRoutines::x86::_load_barrier_weak_slow_stub[i] = (address)NULL;
5313         }
5314         rr = rr->successor();
5315       }
5316     } else {
5317       for (int i = 0; i < RegisterImpl::number_of_registers; i++) {
5318         StubRoutines::x86::_load_barrier_slow_stub[i] = (address)NULL;
5319         StubRoutines::x86::_load_barrier_weak_slow_stub[i] = (address)NULL;
5320       }
5321     }
5322 
5323     // don't bother generating these AES intrinsic stubs unless global flag is set
5324     if (UseAESIntrinsics) {
5325       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5326       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5327       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5328       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5329       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5330     }
5331     if (UseAESCTRIntrinsics){
5332       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5333       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5334     }
5335 
5336     if (UseSHA1Intrinsics) {
5337       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5338       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5339       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5340       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5341     }
5342     if (UseSHA256Intrinsics) {
5343       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
5344       char* dst = (char*)StubRoutines::x86::_k256_W;
5345       char* src = (char*)StubRoutines::x86::_k256;
5346       for (int ii = 0; ii < 16; ++ii) {
5347         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
5348         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5349       }
5350       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5351       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5352       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5353       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5354     }
5355     if (UseSHA512Intrinsics) {
5356       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5357       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5358       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5359       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5360     }
5361 
5362     // Generate GHASH intrinsics code
5363     if (UseGHASHIntrinsics) {
5364       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
5365       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5366       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5367     }
5368 
5369     // Safefetch stubs.
5370     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5371                                                        &StubRoutines::_safefetch32_fault_pc,
5372                                                        &StubRoutines::_safefetch32_continuation_pc);
5373     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5374                                                        &StubRoutines::_safefetchN_fault_pc,
5375                                                        &StubRoutines::_safefetchN_continuation_pc);
5376 #ifdef COMPILER2
5377     if (UseMultiplyToLenIntrinsic) {
5378       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5379     }
5380     if (UseSquareToLenIntrinsic) {
5381       StubRoutines::_squareToLen = generate_squareToLen();
5382     }
5383     if (UseMulAddIntrinsic) {
5384       StubRoutines::_mulAdd = generate_mulAdd();
5385     }
5386 #ifndef _WINDOWS
5387     if (UseMontgomeryMultiplyIntrinsic) {
5388       StubRoutines::_montgomeryMultiply
5389         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5390     }
5391     if (UseMontgomerySquareIntrinsic) {
5392       StubRoutines::_montgomerySquare
5393         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5394     }
5395 #endif // WINDOWS
5396 #endif // COMPILER2
5397 
5398     if (UseVectorizedMismatchIntrinsic) {
5399       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
5400     }
5401   }
5402 
5403  public:
5404   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5405     if (all) {
5406       generate_all();
5407     } else {
5408       generate_initial();
5409     }
5410   }
5411 }; // end class declaration
5412 
5413 void StubGenerator_generate(CodeBuffer* code, bool all) {
5414   StubGenerator g(code, all);
5415 }